当前位置: 首页 > 工具软件 > UTF-8 CPP > 使用案例 >

c++ 处理utf-8字符串

韦思淼
2023-12-01

C++ 的 std::string 中每一个元素都是一个字节(char)。所以装入 UTF-8 字符串时,一个字符可能占用多个字节,具体占用几个字节由下面的编码规则决定。

一个字节的8位中,如果最高位是0,则这个字节自己就是一个完整的字符(ASCII);如果以11开头,则它是多字节字符的首字节,开头连续的1的个数就是该字符占用的字节数;以10开头的字节是多字节字符的后续字节。

1字节0xxxxxxx 
2字节110xxxxx 10xxxxxx 
3字节1110xxxx 10xxxxxx 10xxxxxx
4字节11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字节111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx(旧版定义,RFC 3629 起已不再合法)
6字节1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx(旧版定义,RFC 3629 起已不再合法)

 

 

 

  

  

 所以知道这个就好办了。这里有一个类,用来专门处理utf-8的字符串,实现了字符串截取,索引,长度等功能~

#ifndef __IUTF8_STRING__
#define __IUTF8_STRING__

/**
 * A std::string wrapper that addresses its contents by UTF-8 character
 * instead of by byte.
 *
 * The object keeps the raw bytes in `data` and a heap-allocated table
 * (`offerset`) holding the byte offset at which each character starts.
 * The table is built by refresh() and released by the destructor, so the
 * class owns a raw resource and therefore defines its own copy operations
 * (Rule of Three); the compiler-generated ones would copy the pointer and
 * free the same table twice.
 */
class iutf8string
{
    public:
        // Builds the character table for a copy of the given bytes.
        iutf8string(const std::string& );

        // Same as above, from a NUL-terminated C string.
        iutf8string(const char* );

        // Deep copy: duplicates the bytes and rebuilds a private offset table,
        // so the two objects never share (and never double-free) one table.
        iutf8string(const iutf8string& other)
            : data(other.data)
        {
            refresh();
        }

        // Copy-and-swap assignment: all allocation happens while building
        // `copy`; if that throws, *this is left untouched.
        iutf8string& operator = (const iutf8string& other)
        {
            if (this != &other)
            {
                iutf8string copy(other);

                data.swap(copy.data);

                int* previous_table = offerset;
                offerset = copy.offerset;
                copy.offerset = previous_table;   // released by copy's destructor

                int previous_length = _length;
                _length = copy._length;
                copy._length = previous_length;
            }
            return *this;
        }

        ~iutf8string();

    public:
    
        // Number of UTF-8 characters (not bytes).
        int length();

        // NOTE(review): no definition of substring() is visible in this file;
        // calling it would presumably fail at link time -- confirm or remove.
        std::string substring(int start_index, int length);

        // The character at `index` as a (possibly multi-byte) string.
        std::string get(int index);


        // Concatenation; returns a new object built from both byte sequences.
        iutf8string operator + (iutf8string& );
        
        // Same result as get(index).
        std::string operator [](int index);

        // A copy of the underlying bytes.
        std::string stlstring();

        // Pointer to the underlying bytes; owned by this object.
        const char* c_str();

        // `u8_length` characters starting at character `u8start_index`,
        // returned as a new iutf8string.
        iutf8string utf8substr(int u8start_index, int u8_length);
        
        // `u8_length` characters starting at character `u8start_index`,
        // returned as plain bytes.
        std::string substr(int u8start_index, int u8_length);

    private:

        std::string data;   // raw UTF-8 bytes
        int* offerset;      // byte offset of each character start; owned, allocated with new[]
        int _length;        // number of characters found by refresh()

        // Scans `data` and (re)builds `offerset` and `_length`.
        void refresh();
};

#endif
#include <iostream>
#include <string>
#include "iutf8string.h"

using namespace std;

// Stores a copy of `str` and builds the per-character offset table for it.
iutf8string::iutf8string(const string& str)
    : data(str)
{
    refresh();
}

// Stores a copy of the NUL-terminated string `str` and builds the
// per-character offset table for it.
// A null `str` is treated as the empty string: constructing a std::string
// from a null pointer is undefined behaviour.
iutf8string::iutf8string(const char* str)
{
    if (str)
    {
        data = str;
    }
    refresh();
}

// Releases the offset table built by refresh().
iutf8string::~iutf8string()
{
    // The table is allocated with new[], so it must be freed with delete[].
    delete[] offerset;
}

// Returns a copy of the underlying UTF-8 bytes.
string iutf8string::stlstring()
{
    string bytes(data);
    return bytes;
}

const char* iutf8string::c_str()
{
    return data.c_str();
}

// Concatenates the bytes of this string and `ustr` and returns a new
// iutf8string built from the result; neither operand is modified.
iutf8string iutf8string::operator +(iutf8string& ustr)
{
    string joined(data);
    joined += ustr.stlstring();

    return iutf8string(joined);
}

// Returns the number of UTF-8 characters counted by refresh()
// (not the number of bytes).
int iutf8string::length()
{
    const int character_count = _length;
    return character_count;
}

// Returns the UTF-8 character at position `index` as a byte string.
// Returns "" when `index` is negative or not less than length().
string iutf8string::get(int index)
{
    // A negative index would read in front of the offset table.
    if(index < 0 || index >= _length) return "";

    const string::size_type first_byte = static_cast<string::size_type>(offerset[index]);

    // The last character ends where the byte string ends; only earlier
    // characters have a following table entry to read.
    const string::size_type past_last_byte = (index + 1 < _length)
        ? static_cast<string::size_type>(offerset[index + 1])
        : data.length();

    return data.substr(first_byte, past_last_byte - first_byte);
}

// Returns the UTF-8 character at position `index` as a byte string.
// Returns "" when `index` is negative or not less than length().
string iutf8string::operator [](int index)
{
    // A negative index would read in front of the offset table.
    if(index < 0 || index >= _length) return "";

    const string::size_type first_byte = static_cast<string::size_type>(offerset[index]);

    // The last character ends where the byte string ends; only earlier
    // characters have a following table entry to read.
    const string::size_type past_last_byte = (index + 1 < _length)
        ? static_cast<string::size_type>(offerset[index + 1])
        : data.length();

    return data.substr(first_byte, past_last_byte - first_byte);
}

// Returns `u8_length` characters starting at character `u8_start_index`,
// as a byte string.
// Returns "" when either argument is negative or when the requested range
// does not fit inside the string. A range that ends exactly at the last
// character is valid.
string iutf8string::substr(int u8_start_index, int u8_length)
{
    if(u8_start_index < 0 || u8_length < 0 || u8_start_index > _length) return "";

    // Written as a subtraction so that start + length cannot overflow int.
    if(u8_length > _length - u8_start_index) return "";

    const int u8_end_index = u8_start_index + u8_length;

    // Index _length means "one past the last character", i.e. the end of the
    // byte string; it is resolved here without reading the offset table.
    const string::size_type first_byte = (u8_start_index < _length)
        ? static_cast<string::size_type>(offerset[u8_start_index])
        : data.length();
    const string::size_type past_last_byte = (u8_end_index < _length)
        ? static_cast<string::size_type>(offerset[u8_end_index])
        : data.length();

    return data.substr(first_byte, past_last_byte - first_byte);
}

// Returns `u8_length` characters starting at character `u8_start_index`,
// as a new iutf8string.
// Returns an empty iutf8string when either argument is negative or when the
// requested range does not fit inside the string. A range that ends exactly
// at the last character is valid.
iutf8string iutf8string::utf8substr(int u8_start_index, int u8_length)
{
    if(u8_start_index < 0 || u8_length < 0 || u8_start_index > _length) return iutf8string("");

    // Written as a subtraction so that start + length cannot overflow int.
    if(u8_length > _length - u8_start_index) return iutf8string("");

    const int u8_end_index = u8_start_index + u8_length;

    // Index _length means "one past the last character", i.e. the end of the
    // byte string; it is resolved here without reading the offset table.
    const string::size_type first_byte = (u8_start_index < _length)
        ? static_cast<string::size_type>(offerset[u8_start_index])
        : data.length();
    const string::size_type past_last_byte = (u8_end_index < _length)
        ? static_cast<string::size_type>(offerset[u8_end_index])
        : data.length();

    string ret = data.substr(first_byte, past_last_byte - first_byte);

    return iutf8string(ret);
}

void iutf8string::refresh()
{
    int *tmp = new int[data.length()];
    int i, tmpidx = 0;
    for(i = 0; i < data.length(); i++)
    {
        if(((int)data[i] > 0)||(!(((int)data[i] & 0x00000040) == 0)))
        {
            tmp[tmpidx] = i;
            tmpidx++;
        }
    }

    tmp[tmpidx] = data.length();

    int *tmp2 = new int[tmpidx];
    for(i = 0; i < tmpidx; i++)
    {
        tmp2[i] = tmp[i];
    }


    delete[] tmp;
    offerset = tmp2;
    _length = tmpidx;
}


//----------------test code ----------------------------
int main()
{
    iutf8string str1("_我Love你!中国  ,!");
    cout << "字符串长度:" <<str1.length() <<endl;
    int i; cout << "[" ;
    for(i = 0; i < str1.length(); i++)
    {
        cout << str1[i] << " ";
    }
    cout << "]" << endl;
    string one = str1.substr(2,11);
    cout << one << endl;

    string s1("我们都是好孩子!");
    iutf8string str2(s1);
    cout << "[" ;
    for(i = 0; i < str2.length(); i++)
    {
        cout << str2[i] << " ";
    }
    cout << "]" << endl;
}

 

最后祝您,提乾涉经。告辞。

 

转载于:https://www.cnblogs.com/AkazaAkari/p/8058887.html

 类似资料: