UTF-8 CPP是一个简单、小巧、轻量级、跨平台的UTF-8编码字符串库。
下面对其使用方法进行简单的介绍:
1. 从 http://sourceforge.net/projects/utfcpp/ 下载最新的源码包 utf8_v2_3_4.zip,并将其解压缩;
2. 新建一个 VS2013 控制台工程 TestUTF-8CPP,将 utf-8cpp 中的 src 目录(即 utf8.h 所在的目录)加入到工程的包含目录中;
3. 参考 http://utfcpp.sourceforge.net/ 编写测试代码,内容如下:
#include "stdafx.h"
#include <assert.h>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
#include "utf8.h"
// Reports whether the file named by `file_name` contains valid UTF-8 text.
// The bytes are fed to utf8::is_valid directly from the stream buffer, so the
// file content is never read into memory as a whole.
// Returns false both for invalid content and when the file cannot be opened.
bool valid_utf8_file(const char* file_name)
{
    std::ifstream input(file_name);
    if (input.fail()) {
        return false; // throwing here would be an even better way to report it
    }

    // A default-constructed istreambuf_iterator marks the end of the stream.
    return utf8::is_valid(std::istreambuf_iterator<char>(input.rdbuf()),
                          std::istreambuf_iterator<char>());
}
// Repairs `str` in place: every invalid UTF-8 sequence is replaced with a
// Unicode replacement character. The repaired text is built in a scratch
// string by utf8::replace_invalid and then becomes the content of `str`.
void fix_utf8_string(std::string& str)
{
    std::string repaired;
    utf8::replace_invalid(str.begin(), str.end(), std::back_inserter(repaired));
    str.swap(repaired);
}
// Demo driver for the UTF8-CPP library.
//
// 1. Opens the test file, checks that it is valid UTF-8 as a whole, then for
//    every line reports its length in code points and round-trips it through
//    UTF-16.
// 2. Runs a few fixed conversions (ASCII, CJK text, a surrogate pair) and
//    checks the results.
//
// Returns 0 when everything succeeded and -1 when the test file cannot be
// opened, is not valid UTF-8, or the final UTF-8 -> UTF-16 check fails.
int main(int argc, char* argv[])
{
    const char* test_file_path = "../../../demo/test.txt";

    // Open the test file (contains UTF-8 encoded text)
    std::ifstream fs8(test_file_path);
    if (!fs8.is_open()) {
        std::cout << "Could not open " << test_file_path << std::endl;
        return -1;
    }

    // Validate the whole file first; say why we stop instead of exiting silently.
    if (!valid_utf8_file(test_file_path)) {
        std::cout << "Invalid UTF-8 content in " << test_file_path << std::endl;
        return -1;
    }

    unsigned line_count = 1;
    std::string line;
    // Play with all the lines in the file
    while (std::getline(fs8, line)) {
        // Check for invalid UTF-8 (for a simple yes/no check there is also utf8::is_valid)
        std::string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
        // Everything before end_it is the valid prefix of the line.
        const std::string valid_part(line.begin(), end_it);
        if (end_it != line.end()) {
            std::cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
            std::cout << "This part is fine: " << valid_part << "\n";
        }

        // Line length in code points (at least for the valid part). Kept in the
        // iterator's difference type so it is not narrowed to int.
        const std::string::difference_type length = utf8::distance(line.begin(), end_it);
        std::cout << "Length of line " << line_count << " is " << length << "\n";

        // Convert it to UTF-16
        std::vector<unsigned short> utf16line;
        utf8::utf8to16(line.begin(), end_it, std::back_inserter(utf16line));

        // And back to UTF-8
        std::string utf8line;
        utf8::utf16to8(utf16line.begin(), utf16line.end(), std::back_inserter(utf8line));

        // Confirm that the conversion went OK:
        if (utf8line != valid_part)
            std::cout << "Error in UTF-16 conversion at line: " << line_count << "\n";

        ++line_count;
    }

    // Pure ASCII: every character becomes exactly one UTF-16 code unit.
    std::string str = "ABCD";
    std::vector<unsigned short> utf16result;
    utf8::utf8to16(str.begin(), str.end(), std::back_inserter(utf16result));
    assert(utf16result.size() == str.size());

    // utf16to8 consumes UTF-16 code units, so the input must be a wide string;
    // feeding it the raw bytes of a narrow string would treat each byte as a
    // separate code unit. NOTE(review): this relies on wchar_t holding UTF-16
    // code units (as on Windows) and on the compiler decoding this source file
    // correctly; the same assumption is made by the L"..." comparison below.
    std::wstring str2 = L"濦粿夿旴";
    std::string utf8str;
    utf8::utf16to8(str2.begin(), str2.end(), std::back_inserter(utf8str));
    // Converting back must reproduce the original text.
    std::wstring str2_round_trip;
    utf8::utf8to16(utf8str.begin(), utf8str.end(), std::back_inserter(str2_round_trip));
    assert(str2_round_trip == str2);

    // Three characters; the last one (4-byte sequence) needs a surrogate pair in UTF-16.
    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
    std::vector<unsigned short> utf16result1;
    utf8::utf8to16(utf8_with_surrogates,
                   utf8_with_surrogates + sizeof(utf8_with_surrogates) - 1, // skip the '\0'
                   std::back_inserter(utf16result1));
    assert(utf16result1.size() == 4);
    assert(utf16result1[2] == 0xd834);
    assert(utf16result1[3] == 0xdd1e);

    // The reverse direction: 1 + 2 + 3 + 4 bytes of UTF-8 for the four characters.
    unsigned short utf16string[] = { 0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e };
    std::vector<unsigned char> utf8result;
    utf8::utf16to8(utf16string,
                   utf16string + sizeof(utf16string) / sizeof(utf16string[0]),
                   std::back_inserter(utf8result));
    assert(utf8result.size() == 10);

    // UTF-8 bytes of U+7537. Stored as a const array: a string literal must not
    // be bound to a non-const char*, and sizeof gives the length without strlen.
    const char szSex[] = "\xe7\x94\xb7";
    std::basic_string<wchar_t> sex;
    utf8::utf8to16(szSex, szSex + sizeof(szSex) - 1, std::back_inserter(sex));
    if (sex != L"男") {
        std::cout << "unicode char utf16 error" << std::endl;
        return -1;
    }

    std::cout << "ok!" << std::endl;
    return 0;
}
GitHub:https://github.com/fengbingchun/UTF-8CPP_Test