Utf8=>X 为编码,X=>Utf8 为解码。数据转换和传输都以字节切片的形式进行。
首先安装依赖包:
$ go get golang.org/x/text
上面这步有可能在实践中出现些问题:
package golang.org/x/text/encoding/ianaindex/...: unrecognized import path "golang.org/x/text/encoding/ianaindex" (https fetch: Get https://golang.org/x/text/encoding/ianaindex?go-get=1: dial tcp 216.239.37.1:443: connectex: A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond.)
### 解决办法:在 $GOPATH/src/golang.org/x/ 文件夹下执行 git clone https://github.com/golang/text.git ;或者下载 https://github.com/golang/text/archive/master.zip ,下载好后解压到该文件夹,并将解压得到的目录(text-master)重命名为 text。
E:\project\src\golang.org\x 的目录
2020/10/16 23:24 <DIR> .
2020/10/16 23:24 <DIR> ..
2019/08/19 20:07 <DIR> lint
2020/10/16 23:17 8,520,510 master.zip
2019/10/02 22:25 <DIR> net
2020/08/26 07:20 <DIR> text
2020/10/13 11:20 <DIR> tools
2019/09/07 18:52 <DIR> tools1
func GbkToUtf8(s []byte) ([]byte, error) {
//第二个参数为“transform.Transformer”接口,simplifiedchinese.GBK.NewDecoder()包含了该接口
reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewDecoder())
d, e := ioutil.ReadAll(reader)
if e != nil {
return nil, e
}
return d, nil
}
func Utf8ToGbk(s []byte) ([]byte, error) {
reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewEncoder())
d, e := ioutil.ReadAll(reader)
if e != nil {
return nil, e
}
return d, nil
}
编码转换(Charset)
package charset
import (
"bytes"
"errors"
"fmt"
"io/ioutil"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/transform"
)
// Charset is the name of a character encoding, as accepted by Convert,
// ToUTF8 and UTF8To.
//
// NOTE(review): in each const group below only the first constant is declared
// with the Charset type; the following ones have an explicit value but no
// type, so they are untyped string constants. They are still accepted where a
// Charset is expected. Confirm whether typing all of them was intended.
type Charset string
// Chinese encodings.
// NOTE(review): charsetAlias rewrites the name "GB2312" to "HZ-GB-2312"
// before lookup — confirm that is the intended encoding for GB2312.
const (
GBK Charset = "GBK"
GB18030 = "GB18030"
GB2312 = "GB2312"
Big5 = "Big5"
)
// Japanese encodings.
const (
EUCJP Charset = "EUCJP"
ISO2022JP = "ISO2022JP"
ShiftJIS = "ShiftJIS"
)
// Korean encodings.
const (
EUCKR Charset = "EUCKR"
)
// Unicode encodings.
const (
UTF_8 Charset = "UTF-8"
UTF_16 = "UTF-16"
UTF_16BE = "UTF-16BE"
UTF_16LE = "UTF-16LE"
)
// Other encodings.
// NOTE(review): the IBM, Windows and ISO values end in "*" and look like
// name-family placeholders rather than concrete charset names; getEncoding
// passes names to the index lookup verbatim, so they presumably do not
// resolve as written — verify before using them as arguments.
const (
Macintosh Charset = "macintosh"
IBM = "IBM*"
Windows = "Windows*"
ISO = "ISO-*"
)
// charsetAlias maps alternative charset spellings to the name that
// getEncoding looks up. Keys are matched exactly (case-sensitively), which is
// why both upper- and lower-case spellings are listed.
// NOTE(review): "GB2312"/"gb2312" are mapped to "HZ-GB-2312" — confirm this is
// the encoding callers expect when they ask for GB2312.
var charsetAlias = map[string]string{
"HZGB2312": "HZ-GB-2312",
"hzgb2312": "HZ-GB-2312",
"GB2312": "HZ-GB-2312",
"gb2312": "HZ-GB-2312",
}
// Convert re-encodes src from srcCharset to dstCharset, using UTF-8 as the
// pivot: src is first decoded to UTF-8 (skipped when srcCharset is "UTF-8"),
// then the UTF-8 text is encoded to dstCharset (skipped when dstCharset is
// "UTF-8").
//
// Results:
//   - If dstCharset equals srcCharset, src is returned unchanged.
//   - If either charset cannot be resolved by getEncoding, the original src is
//     returned together with an "unsupport ..." error.
//   - If a transform fails, dst is "" and the returned error wraps the
//     underlying error, so callers can inspect it with errors.Is / errors.As.
func Convert(dstCharset Charset, srcCharset Charset, src string) (dst string, err error) {
	if dstCharset == srcCharset {
		return src, nil
	}

	// Step 1: bring <src> into UTF-8.
	text := src
	if srcCharset != "UTF-8" {
		enc := getEncoding(srcCharset)
		if enc == nil {
			// The untouched input is handed back alongside the error.
			return src, errors.New("unsupport srcCharset: " + string(srcCharset))
		}
		decoded, readErr := ioutil.ReadAll(
			transform.NewReader(bytes.NewReader([]byte(src)), enc.NewDecoder()),
		)
		if readErr != nil {
			return "", fmt.Errorf("%s to utf8 failed. %w", srcCharset, readErr)
		}
		text = string(decoded)
	}

	// Step 2: encode the UTF-8 text to <dstCharset>.
	if dstCharset == "UTF-8" {
		return text, nil
	}
	enc := getEncoding(dstCharset)
	if enc == nil {
		// As above, the original (not the intermediate UTF-8) input is returned.
		return src, errors.New("unsupport dstCharset: " + string(dstCharset))
	}
	encoded, readErr := ioutil.ReadAll(
		transform.NewReader(bytes.NewReader([]byte(text)), enc.NewEncoder()),
	)
	if readErr != nil {
		return "", fmt.Errorf("utf to %s failed. %w", dstCharset, readErr)
	}
	return string(encoded), nil
}
// ToUTF8 converts src from srcCharset to UTF-8. It is a thin wrapper that
// calls Convert with "UTF-8" as the destination charset.
func ToUTF8(srcCharset Charset, src string) (dst string, err error) {
	dst, err = Convert("UTF-8", srcCharset, src)
	return dst, err
}
// UTF8To converts the UTF-8 string src to dstCharset. It is a thin wrapper
// that calls Convert with "UTF-8" as the source charset.
func UTF8To(dstCharset Charset, src string) (dst string, err error) {
	dst, err = Convert(dstCharset, "UTF-8", src)
	return dst, err
}
// getEncoding resolves charset to an encoding.Encoding. The name is first
// replaced by its charsetAlias entry, if one exists, and then looked up in
// ianaindex.MIB. It returns nil when the lookup reports an error or yields a
// nil encoding.
func getEncoding(charset Charset) encoding.Encoding {
	name := string(charset)
	if alias, found := charsetAlias[name]; found {
		name = alias
	}
	enc, err := ianaindex.MIB.Encoding(name)
	if err != nil || enc == nil {
		return nil
	}
	return enc
}
// main demonstrates both conversion directions: it encodes a UTF-8 sample
// ("我是UTF8") to GBK and decodes a GBK sample ("我是GBK") to UTF-8, printing
// each result followed by its error value.
func main() {
	utf8Text := string([]byte{230, 136, 145, 230, 152, 175, 85, 84, 70, 56})
	gbkText := string([]byte{206, 210, 202, 199, 71, 66, 75})

	encoded, encErr := charset.UTF8To(charset.GBK, utf8Text)
	fmt.Println(encoded, encErr)

	decoded, decErr := charset.ToUTF8(charset.GBK, gbkText)
	fmt.Println(decoded, decErr)
}
编码猜测
golang.org/x/net/html/charset 提供了多个 Reader 和 DetermineEncoding 方法。DetermineEncoding 会检查内容的前 1024 个字节,并结合声明的 Content-Type,来推断编码格式。
import (
"fmt"
"golang.org/x/net/html/charset"
)
// main asks charset.DetermineEncoding to guess the encoding of a short GBK
// sample declared as "text/html" and prints the detected encoding, its name,
// and whether the detection is certain.
func main() {
	//utf8 := []byte{230, 136, 145, 230, 152, 175, 85, 84, 70, 56}
	sample := []byte{206, 210, 202, 199, 71, 66, 75}
	const contentType = "text/html"

	enc, label, sure := charset.DetermineEncoding(sample, contentType)
	fmt.Printf("编码:%v\n名称:%s\n确定:%t\n", enc, label, sure)
}