编码转换(Transform)
- UTF-8 是 Go 语言的原生编码方式:UTF-8 => X 称为编码,X => UTF-8 称为解码。
- 字节是数据的最小单元,数据的转换和传输都以字节切片([]byte)的形式进行。
$ go get golang.org/x/text
func GbkToUtf8(s []byte) ([]byte, error) {
//第二个参数为“transform.Transformer”接口,simplifiedchinese.GBK.NewDecoder()包含了该接口
reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewDecoder())
d, e := ioutil.ReadAll(reader)
if e != nil {
return nil, e
}
return d, nil
}
func Utf8ToGbk(s []byte) ([]byte, error) {
reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewEncoder())
d, e := ioutil.ReadAll(reader)
if e != nil {
return nil, e
}
return d, nil
}
编码转换(Charset)
package charset
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/transform"
)
// Charset is the name of a character encoding, as accepted by Convert,
// ToUTF8 and UTF8To.
//
// In each const group below only the first constant is declared with the
// explicit type Charset; the remaining ones are untyped string constants,
// which are still assignable to Charset parameters.
type Charset string
// Chinese encodings.
const (
GBK Charset = "GBK"
GB18030 = "GB18030"
GB2312 = "GB2312"
Big5 = "Big5"
)
// Japanese encodings.
const (
EUCJP Charset = "EUCJP"
ISO2022JP = "ISO2022JP"
ShiftJIS = "ShiftJIS"
)
// Korean encodings.
const (
EUCKR Charset = "EUCKR"
)
// Unicode encodings.
const (
UTF_8 Charset = "UTF-8"
UTF_16 = "UTF-16"
UTF_16BE = "UTF-16BE"
UTF_16LE = "UTF-16LE"
)
// Other encodings.
// NOTE(review): the values ending in "*" look like placeholders for whole
// families of names (e.g. "IBM850", "Windows1252") rather than names that can
// be looked up as-is — confirm before passing them to Convert.
const (
Macintosh Charset = "macintosh"
IBM = "IBM*"
Windows = "Windows*"
ISO = "ISO-*"
)
// charsetAlias maps alternative charset spellings to the name that
// getEncoding passes to the IANA index lookup. Keys are matched exactly
// (case-sensitively), which is why both upper- and lower-case spellings are
// listed.
// NOTE(review): "GB2312"/"gb2312" are redirected to "HZ-GB-2312", which is a
// different encoding from plain GB2312 — confirm this is intended.
var charsetAlias = map[string]string{
"HZGB2312": "HZ-GB-2312",
"hzgb2312": "HZ-GB-2312",
"GB2312": "HZ-GB-2312",
"gb2312": "HZ-GB-2312",
}
// Convert transcodes src from srcCharset to dstCharset, using UTF-8 as the
// intermediate representation.
//
// If both charsets are equal, src is returned unchanged. If either charset
// cannot be resolved by getEncoding, the original src is returned together
// with an "unsupport ..." error. If the transformation itself fails, an empty
// string is returned and the underlying error is wrapped, so callers can
// inspect it with errors.Is / errors.As.
func Convert(dstCharset Charset, srcCharset Charset, src string) (dst string, err error) {
	if dstCharset == srcCharset {
		return src, nil
	}

	// text holds the UTF-8 form of the input. src itself is never modified so
	// that it can be handed back unchanged on "unsupported charset" errors.
	text := src
	if srcCharset != "UTF-8" {
		enc := getEncoding(srcCharset)
		if enc == nil {
			return src, errors.New("unsupport srcCharset: " + string(srcCharset))
		}
		decoded, readErr := ioutil.ReadAll(
			transform.NewReader(bytes.NewReader([]byte(src)), enc.NewDecoder()),
		)
		if readErr != nil {
			return "", fmt.Errorf("%s to utf8 failed. %w", srcCharset, readErr)
		}
		text = string(decoded)
	}

	// The UTF-8 form is already the requested output.
	if dstCharset == "UTF-8" {
		return text, nil
	}

	enc := getEncoding(dstCharset)
	if enc == nil {
		return src, errors.New("unsupport dstCharset: " + string(dstCharset))
	}
	encoded, readErr := ioutil.ReadAll(
		transform.NewReader(bytes.NewReader([]byte(text)), enc.NewEncoder()),
	)
	if readErr != nil {
		return "", fmt.Errorf("utf to %s failed. %w", dstCharset, readErr)
	}
	return string(encoded), nil
}
// ToUTF8 converts src from srcCharset to UTF-8 by delegating to Convert with
// "UTF-8" as the destination charset.
func ToUTF8(srcCharset Charset, src string) (dst string, err error) {
	dst, err = Convert("UTF-8", srcCharset, src)
	return dst, err
}
// UTF8To converts the UTF-8 string src to dstCharset by delegating to Convert
// with "UTF-8" as the source charset.
func UTF8To(dstCharset Charset, src string) (dst string, err error) {
	dst, err = Convert(dstCharset, "UTF-8", src)
	return dst, err
}
// getEncoding resolves charset to an encoding.Encoding through the IANA MIB
// index, after first replacing the name with its charsetAlias entry when one
// exists. It returns nil when the lookup reports an error or yields no
// encoding.
func getEncoding(charset Charset) encoding.Encoding {
	name := string(charset)
	if alias, found := charsetAlias[name]; found {
		name = alias
	}

	enc, err := ianaindex.MIB.Encoding(name)
	if err != nil || enc == nil {
		return nil
	}
	return enc
}
// main demonstrates the charset package: it encodes a UTF-8 sample as GBK and
// decodes a GBK sample to UTF-8, printing each result followed by its error.
func main() {
	// "我是UTF8" encoded as UTF-8.
	utf8Sample := []byte{230, 136, 145, 230, 152, 175, 85, 84, 70, 56}
	// "我是GBK" encoded as GBK.
	gbkSample := []byte{206, 210, 202, 199, 71, 66, 75}

	asGBK, err := charset.UTF8To(charset.GBK, string(utf8Sample))
	fmt.Println(asGBK, err)

	asUTF8, err := charset.ToUTF8(charset.GBK, string(gbkSample))
	fmt.Println(asUTF8, err)
}
编码猜测
golang.org/x/net/html/charset 提供了多个 Reader 和 DetermineEncoding 方法。
DetermineEncoding 会检查内容最多前 1024 个字节(并结合声明的 Content-Type)来推断编码格式。
import (
"fmt"
"golang.org/x/net/html/charset"
)
// main feeds a short GBK-encoded sample to charset.DetermineEncoding and
// prints the detected encoding, its name, and whether the result is certain.
func main() {
	// UTF-8 sample kept for experimentation:
	// utf8 := []byte{230, 136, 145, 230, 152, 175, 85, 84, 70, 56}
	sample := []byte{206, 210, 202, 199, 71, 66, 75}
	const contentType = "text/html"

	enc, label, sure := charset.DetermineEncoding(sample, contentType)
	fmt.Printf("编码:%v\n名称:%s\n确定:%t\n", enc, label, sure)
}
参考:
https://www.kancloud.cn/liupengjie/go/999876