ICU4C是ICU在C/C++平台下的版本。ICU(International Components for Unicode)是基于“IBM公共许可证”、与开源组织合作研发的、用于支持软件国际化的开源项目。ICU4C为C/C++平台提供了强大的国际化开发能力,软件开发者几乎可以使用ICU4C解决任何国际化的问题:根据各地的风俗和语言习惯,实现对数字、货币、时间、日期和消息的格式化与解析,以及对字符串进行大小写转换、整理、搜索和排序等功能。值得一提的是,ICU4C提供了强大的BIDI(双向文本)算法,对阿拉伯语等BIDI语言提供了完善的支持。
2.1 在http://www.icu-project.org/download/4.2.html下载ICU4C库,我下载的是icu4c-49_1_2-src.tgz(即ICU 49.1.2版本;上面的链接是4.2版本的下载页,请在下载站点中选择与该源码包对应的版本)。
2.2 执行如下命令进行编译和安装:
tar -zxvf icu4c-49_1_2-src.tgz
cd icu/source
./configure
make
make install
3.1 myicu.h
#ifndef _MYICU_H_
#define _MYICU_H_
#include "unicode/utypes.h"
#include "unicode/ucsdet.h"
#include "unicode/ucnv.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>
using namespace std;
// Size in bytes of each fread() chunk used by MyIcu::convertoUtf8
// (main.cpp repeats the same definition).
#define BUF_MAX 4096
/**
 * Detects the character encoding of a file with ICU and prints the file's
 * contents converted to UTF-8.
 *
 * The object owns a C stdio handle and a heap-allocated charset name, both
 * released by the destructor, so copying is disabled (a copy would release
 * the same resources twice).
 */
class MyIcu{
public:
    /**
     * @param filename Path of the file to inspect. Only the pointer is
     *                 stored, so the string must outlive this object.
     */
    MyIcu(const char* filename);

    /**
     * Samples the beginning of the file and asks the ICU charset detector
     * for the most likely encoding.
     * @return true when a charset name was detected and stored.
     */
    bool detectTextEncoding();

    /**
     * Converts the file from the detected encoding to UTF-8 and writes the
     * result to standard output.
     * @return true on success, false otherwise.
     */
    bool convertoUtf8();

    /**
     * Thin wrapper around ICU's ucnv_convert().
     * @param toConverterName   Name of the target encoding.
     * @param fromConverterName Name of the source encoding.
     * @param target            Output buffer.
     * @param targetCapacity    Size of the output buffer in bytes.
     * @param source            Input bytes.
     * @param sourceLength      Number of input bytes.
     * @return The resulting ICU UErrorCode as an int (U_ZERO_ERROR on success).
     */
    int convert(const char *toConverterName, const char *fromConverterName,
                char *target, int32_t targetCapacity, const char *source, int32_t sourceLength);

    ~MyIcu();

private:
    // Declared but intentionally not defined: the class owns raw resources,
    // so copy construction and copy assignment must not compile.
    MyIcu(const MyIcu&);
    MyIcu& operator=(const MyIcu&);

    const char* m_filename;  // path of the input file (not owned)
    FILE* file;              // stdio handle released by the destructor
    char* detected;          // name of the detected charset (owned)
};
#endif //_MYICU_H_
3.2 myicu.cpp
#include "myicu.h"
// Number of bytes read from the start of the file as the sample handed to
// the ICU charset detector in MyIcu::detectTextEncoding.
const int BUFFSIZE=8192;
/**
 * Remembers the path of the file to process.
 *
 * The file handle and the detected-charset pointer start out as NULL so that
 * the destructor and the `detected == NULL` check in convertoUtf8() never
 * read an indeterminate pointer.
 *
 * @param filename Path of the input file; only the pointer is stored.
 */
MyIcu::MyIcu(const char* filename)
    : m_filename(filename), file(NULL), detected(NULL) {
}
/**
 * Releases the file handle and the charset-name buffer.
 */
MyIcu::~MyIcu(){
    // `file` is NULL whenever the most recent fopen() failed, and passing
    // NULL to fclose() is undefined behaviour, so close only a real handle.
    if (file != NULL) {
        fclose(file);
    }
    // delete[] on a NULL pointer is a no-op.
    // NOTE(review): delete[] is only valid for memory obtained with new[];
    // confirm the allocation site of `detected` matches.
    delete [] detected;
}
bool MyIcu::detectTextEncoding(){
UCharsetDetector* csd;
const UCharsetMatch **csm;
UErrorCode status = U_ZERO_ERROR;
char buffer[BUFFSIZE];
int inputLength,match, matchCount = 0;
file = fopen(m_filename, "rb");
if (file == NULL) {
cout<<"open file error"<<endl;
return 0;
}
inputLength = (int32_t) fread(buffer, 1, BUFFSIZE, file);
csd = ucsdet_open(&status);
ucsdet_setText(csd, buffer,inputLength, &status);
csm = ucsdet_detectAll(csd, &matchCount, &status);
if(csm == NULL){
ucsdet_close(csd);
return 0;
}
detected = new char[128];
#if 0
for(match = 0; match < matchCount; match += 1) {
const char *name = ucsdet_getName(csm[match], &status);
const char *lang = ucsdet_getLanguage(csm[match], &status);
int32_t confidence = ucsdet_getConfidence(csm[match], &status);
if (lang == NULL || strlen(lang) == 0) {
lang = "**";
}
cout<<name <<"("<<lang<<")"<<confidence<<endl;
}
#endif
if(matchCount > 0)
{
detected = strdup(ucsdet_getName(csm[0], &status)); //分配了内存, 需要释放
if(status != U_ZERO_ERROR)
return false;
}
cout<<"charset = "<<detected<<endl;
ucsdet_close(csd);
return 1;
}
bool MyIcu::convertoUtf8(){
file = fopen(m_filename, "rb");
if(file == NULL)
{
cout<<"open file error"<<endl;
return 0;
}
int len = 0;
//char *detected;
char *buffer = new char[BUF_MAX];
char *target = new char[BUF_MAX * 2];
while(true)
{
memset(buffer, 0, BUF_MAX);
memset(target, 0, BUF_MAX * 2);
len = (int32_t)fread(buffer, sizeof(char), BUF_MAX, file);
if(detected == NULL)
{
if(!detectTextEncoding()) //编码探测
break;
}
//转换为utf8字符编码
if(convert("UTF-8", detected, target, BUF_MAX * 2, (const char*)buffer, len) != U_ZERO_ERROR)
{
cout<<"ucnv_convert error"<<endl;
break;
}
cout<<target<<endl;//打印出转换的文件的字符串
if(len < BUF_MAX)
break;
}
delete [] buffer;
delete [] target;
return 1;
}
int MyIcu::convert(const char *toConverterName, const char *fromConverterName,
char *target, int32_t targetCapacity, const char *source, int32_t sourceLength){
UErrorCode error = U_ZERO_ERROR;
ucnv_convert(toConverterName, fromConverterName, target, targetCapacity,
source, sourceLength, &error);
return error;
}
3.3 main.cpp
#include "myicu.h"
#include <string>
#include <cstdio>
#define BUF_MAX 4096
/**
 * Detects the encoding of "123.txt" and prints its contents as UTF-8.
 *
 * @return 0 on success, 1 when detection or conversion fails.
 */
int main(){
    const char* filename = "123.txt";
    MyIcu myicu(filename);

    // Without a detected charset there is nothing meaningful to convert, so
    // stop here instead of running the conversion anyway.
    if(!myicu.detectTextEncoding()){
        std::cout<<"解析错误!"<<std::endl;
        return 1;
    }

    if(!myicu.convertoUtf8()){
        std::cout<<"转换错误!"<<std::endl;
        return 1;
    }
    return 0;
}
g++ -o target main.cpp myicu.cpp -licuuc -licui18n
如果找不到icuuc和icui18n动态库的话,执行如下命令:
vim /etc/ld.so.conf
将ICU库所在的/usr/local/lib目录加进去,然后再
ldconfig
就行了。
你们可以试下自己准备的文件。