centos下ICU4C字符集检测和转换，C++版本

佴阳辉

2023-12-01

1.ICU4C简介

ICU4C是ICU在C/C++平台下的版本。ICU（International Components for Unicode）是基于“IBM公共许可证”、与开源组织合作研究的、用于支持软件国际化的开源项目。ICU4C提供了C/C++平台强大的国际化开发能力，软件开发者几乎可以使用ICU4C解决任何国际化的问题：根据各地的风俗和语言习惯，实现对数字、货币、时间、日期和消息的格式化与解析，对字符串进行大小写转换、整理、搜索和排序等。值得一提的是，ICU4C还提供了强大的BIDI算法，对阿拉伯语等BIDI语言提供了完善的支持。

2.安装

2.1 在http://www.icu-project.org/download/4.2.html下载ICU4C库，我下载的是icu4c-49_1_2-src.tgz。
2.2 执行如下命令,安装成功：

tar -zxvf icu4c-49_1_2-src.tgz 
cd icu/source
./configure
make
make install

3.代码

3.1 myicu.h

#ifndef _MYICU_H_
#define _MYICU_H_
#include "unicode/utypes.h"
#include "unicode/ucsdet.h"
#include "unicode/ucnv.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <cstdio>
using namespace std;

#define BUF_MAX     4096 
/**
 * Detects the character encoding of a file with ICU's charset detector and
 * prints the file's contents converted to UTF-8.
 *
 * The constructor stores the `filename` pointer as-is (the string is not
 * copied), so the caller must keep that string alive while the object is used.
 */
class MyIcu{
public:
    /** Remembers the path of the file to work on. */
    MyIcu(const char* filename);

    /**
     * Samples the beginning of the file and stores the name of the best
     * matching charset in `detected`.
     * @return false when the file cannot be opened or detection fails.
     */
    bool detectTextEncoding();

    /**
     * Reads the file in BUF_MAX-byte chunks, converts each chunk to UTF-8
     * and prints it to stdout.
     * @return false when the file cannot be opened.
     */
    bool convertoUtf8();

    /**
     * Wrapper around ICU's ucnv_convert().
     * @return the resulting UErrorCode as an int (U_ZERO_ERROR on success).
     */
    int convert(const char *toConverterName, const char *fromConverterName,  
            char *target, int32_t targetCapacity, const char *source, int32_t sourceLength);

    /** Closes the file handle and releases the detected charset name. */
    ~MyIcu();
private:
    // The class owns a FILE* and a heap string; a member-wise copy would make
    // two objects close/free the same resources. Copying is therefore
    // disabled (declared private and intentionally left undefined).
    MyIcu(const MyIcu&);
    MyIcu& operator=(const MyIcu&);

    const char* m_filename;  // path passed to the constructor (not owned)
    FILE* file;              // file handle closed by the destructor
    char* detected;          // name of the detected charset (owned)
};
#endif //_MYICU_H_

3.2 myicu.cpp

#include "myicu.h"
// Number of bytes sampled from the start of the file for charset detection.
const int BUFFSIZE=8192;

/**
 * Remembers the path of the file to work on.
 *
 * Only the pointer is stored, not a copy of the string. Both owned members
 * start out as NULL so that the destructor and the `detected == NULL` check
 * in convertoUtf8() never read an indeterminate pointer.
 */
MyIcu::MyIcu(const char* filename)
    : m_filename(filename), file(NULL), detected(NULL) {
}


/**
 * Releases whatever the object still owns.
 */
MyIcu::~MyIcu(){
    if (file != NULL) {
        // Calling fclose() on a null pointer is undefined behaviour.
        fclose(file);
    }
    // `detected` comes from strdup(), i.e. malloc(), so it must be released
    // with free() rather than delete[]. free(NULL) is a harmless no-op.
    free(detected);
}

/**
 * Detects the charset of the file named by m_filename.
 *
 * Reads up to BUFFSIZE bytes from the start of the file, runs ICU's charset
 * detector on them and stores a heap copy of the best match's name in
 * `detected` (replacing any previous result). The name is also printed to
 * stdout as "charset = <name>".
 *
 * @return true when a charset name was stored; false when the file cannot be
 *         opened, ICU reports a failure, or the detector produced no match.
 */
bool MyIcu::detectTextEncoding(){
    // A local handle is enough here: the sample is read once and the file is
    // closed again immediately, so nothing is left open behind the caller.
    FILE* in = fopen(m_filename, "rb");
    if (in == NULL) {
        cout<<"open file error"<<endl;
        return false;
    }
    char buffer[BUFFSIZE];
    const int32_t inputLength = static_cast<int32_t>(fread(buffer, 1, BUFFSIZE, in));
    fclose(in);

    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* csd = ucsdet_open(&status);
    // ICU failures are the codes greater than U_ZERO_ERROR (this is what the
    // U_FAILURE macro tests); negative codes are only warnings.
    if (status > U_ZERO_ERROR || csd == NULL) {
        return false;
    }

    ucsdet_setText(csd, buffer, inputLength, &status);
    int32_t matchCount = 0;
    const UCharsetMatch **csm = ucsdet_detectAll(csd, &matchCount, &status);

#if 0
    // Debug aid: list every candidate with its language and confidence.
    for (int32_t match = 0; csm != NULL && match < matchCount; ++match) {
        const char *name = ucsdet_getName(csm[match], &status);
        const char *lang = ucsdet_getLanguage(csm[match], &status);
        int32_t confidence = ucsdet_getConfidence(csm[match], &status);

        if (lang == NULL || strlen(lang) == 0) {
            lang = "**";
        }
        cout<<name <<"("<<lang<<")"<<confidence<<endl;
    }
#endif

    // The first entry is used as the best candidate. Its name is copied
    // before the detector is closed.
    char* bestName = NULL;
    if (status <= U_ZERO_ERROR && csm != NULL && matchCount > 0) {
        const char* name = ucsdet_getName(csm[0], &status);
        if (status <= U_ZERO_ERROR && name != NULL) {
            bestName = strdup(name);
        }
    }
    ucsdet_close(csd);  // closed on every path, success or failure

    if (bestName == NULL) {
        return false;
    }

    free(detected);     // drop the result of an earlier call, if any
    detected = bestName;
    cout<<"charset = "<<detected<<endl;
    return true;
}


bool MyIcu::convertoUtf8(){
     file = fopen(m_filename, "rb");  
    if(file == NULL)   
    {  
        cout<<"open file error"<<endl;
        return 0;  
    }     

    int len = 0;  
    //char *detected;  

    char *buffer = new char[BUF_MAX];  
    char *target = new char[BUF_MAX * 2];  

    while(true)  
    {  
        memset(buffer, 0, BUF_MAX);  
        memset(target, 0, BUF_MAX * 2);  

        len = (int32_t)fread(buffer, sizeof(char), BUF_MAX, file);  

        if(detected == NULL)  
        {  
            if(!detectTextEncoding()) //编码探测  
                break;  
        }  

        //转换为utf8字符编码  
        if(convert("UTF-8", detected, target, BUF_MAX * 2, (const char*)buffer, len) != U_ZERO_ERROR)  
        {  
            cout<<"ucnv_convert error"<<endl;
            break;  
        }  

        cout<<target<<endl;//打印出转换的文件的字符串  

        if(len < BUF_MAX)  
            break;  
    }  

    delete [] buffer;  
    delete [] target;  

    return 1;
}



int MyIcu::convert(const char *toConverterName, const char *fromConverterName,  
            char *target, int32_t targetCapacity, const char *source, int32_t sourceLength){

                 UErrorCode error = U_ZERO_ERROR;  
                ucnv_convert(toConverterName, fromConverterName, target, targetCapacity,
                    source, sourceLength, &error);  
                  return error;
}

3.3 main.cpp

#include "myicu.h"
#include <string>

#include <cstdio>
#define BUF_MAX     4096


/**
 * Detects the charset of a text file and prints the file as UTF-8.
 *
 * The file to process is taken from the first command-line argument; when no
 * argument is given, "123.txt" in the current directory is used.
 *
 * @return 0 on success, 1 when detection or conversion failed.
 */
int main(int argc, char* argv[]){
    const char* filename = (argc > 1) ? argv[1] : "123.txt";
    MyIcu myicu(filename);

    // Conversion needs the detected charset, so stop at the first failure
    // and report it through the exit status.
    if(!myicu.detectTextEncoding()){
        std::cout<<"解析错误!"<<std::endl;
        return 1;
    }
    if(!myicu.convertoUtf8()){
        std::cout<<"转换错误!"<<std::endl;
        return 1;
    }
    return 0;
}

4.编译

g++ -o target main.cpp myicu.cpp -licuuc -licui18n

如果找不到icuuc和icui18n动态库的话，执行如下命令：

vim /etc/ld.so.conf

将ICU库所在的/usr/local/lib目录加进去,然后再

ldconfig

就行了。

你们可以试下自己准备的文件。

参考文档：
http://icu-project.org/apiref/icu4c/index.html

centos下ICU4C字符集检测和转换，C++版本

1.ICU4C简介

2.安装

3.代码

4.编译

相关阅读

相关文章

相关问答

相关文档