Go-文本编码

居焱
2023-12-01

编码转换(Transform)

  • UTF-8是Go的原生编码方式:UTF-8=>X为编码(Encode),X=>UTF-8为解码(Decode)。
  • 字节是数据的最小单元,数据转换和传输都以字节切片的形式进行。
$ go get golang.org/x/text
func GbkToUtf8(s []byte) ([]byte, error) {
	//第二个参数为“transform.Transformer”接口,simplifiedchinese.GBK.NewDecoder()包含了该接口
	reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewDecoder())
	d, e := ioutil.ReadAll(reader)
	if e != nil {
		return nil, e
	}
	return d, nil
}

func Utf8ToGbk(s []byte) ([]byte, error) {
	reader := transform.NewReader(bytes.NewReader(s), simplifiedchinese.GBK.NewEncoder())
	d, e := ioutil.ReadAll(reader)
	if e != nil {
		return nil, e
	}
	return d, nil
}

编码转换(Charset)

package charset

import (
	"bytes"
	"errors"
	"fmt"
	"io/ioutil"

	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/ianaindex"
	"golang.org/x/text/transform"
)

// Charset is the name of a text encoding as accepted by Convert, ToUTF8 and
// UTF8To. getEncoding resolves the name through charsetAlias and then
// ianaindex.MIB.Encoding.
//
// NOTE(review): in every constant group below only the first entry is declared
// with the Charset type; the following entries have their own initializer and
// no type, so they are untyped string constants. They are still assignable to
// Charset parameters — confirm whether they were meant to be typed as well.
type Charset string

// Chinese encodings.
const (
	GBK     Charset = "GBK"
	GB18030         = "GB18030"
	GB2312          = "GB2312"
	Big5            = "Big5"
)

// Japanese encodings.
const (
	EUCJP     Charset = "EUCJP"
	ISO2022JP         = "ISO2022JP"
	ShiftJIS          = "ShiftJIS"
)

// Korean encodings.
const (
	EUCKR Charset = "EUCKR"
)

// Unicode encodings.
const (
	UTF_8    Charset = "UTF-8"
	UTF_16           = "UTF-16"
	UTF_16BE         = "UTF-16BE"
	UTF_16LE         = "UTF-16LE"
)

// Other encodings.
// NOTE(review): the values ending in "*" look like placeholders for whole
// families of names rather than names that can be looked up directly — verify
// before passing them to Convert.
const (
	Macintosh Charset = "macintosh"
	IBM               = "IBM*"
	Windows           = "Windows*"
	ISO               = "ISO-*"
)

// charsetAlias maps alternative charset spellings to the name that getEncoding
// passes on to ianaindex.MIB.Encoding. Keys are matched exactly, so upper- and
// lower-case spellings are listed separately.
//
// NOTE(review): "GB2312"/"gb2312" are redirected to "HZ-GB-2312". HZ-GB-2312
// and plain GB2312 are, as far as I know, different byte encodings — confirm
// that this redirection is intentional.
var charsetAlias = map[string]string{
	"HZGB2312": "HZ-GB-2312",
	"hzgb2312": "HZ-GB-2312",
	"GB2312":   "HZ-GB-2312",
	"gb2312":   "HZ-GB-2312",
}

// Convert transcodes src from srcCharset to dstCharset, using UTF-8 as the
// intermediate representation.
//
// When both charsets are equal, src is returned unchanged. Otherwise src is
// decoded to UTF-8 (skipped when srcCharset is "UTF-8") and the result is then
// encoded to dstCharset (skipped when dstCharset is "UTF-8").
//
// Errors:
//   - If getEncoding returns nil for a charset, an "unsupport ..." error is
//     returned together with the original, unconverted src.
//   - If a transformation fails, dst is "" and the returned error wraps the
//     underlying cause, so callers can inspect it with errors.Is / errors.As.
func Convert(dstCharset Charset, srcCharset Charset, src string) (dst string, err error) {
	if dstCharset == srcCharset {
		return src, nil
	}

	// text holds the UTF-8 form between the two stages.
	text := src

	// Stage 1: <srcCharset> -> UTF-8.
	if srcCharset != "UTF-8" {
		enc := getEncoding(srcCharset)
		if enc == nil {
			return src, errors.New("unsupport srcCharset: " + string(srcCharset))
		}
		if text, err = transformString(text, enc.NewDecoder()); err != nil {
			return "", fmt.Errorf("%s to utf8 failed. %w", srcCharset, err)
		}
	}

	// Stage 2: UTF-8 -> <dstCharset>.
	if dstCharset != "UTF-8" {
		enc := getEncoding(dstCharset)
		if enc == nil {
			// The original input (not the UTF-8 intermediate) is handed back,
			// matching what callers received before this refactoring.
			return src, errors.New("unsupport dstCharset: " + string(dstCharset))
		}
		if text, err = transformString(text, enc.NewEncoder()); err != nil {
			return "", fmt.Errorf("utf to %s failed. %w", dstCharset, err)
		}
	}
	return text, nil
}

// transformString pipes s through t and returns everything t produced.
func transformString(s string, t transform.Transformer) (string, error) {
	out, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader([]byte(s)), t))
	if err != nil {
		return "", err
	}
	return string(out), nil
}

// ToUTF8 converts src from srcCharset to UTF-8.
//
// It delegates to Convert with "UTF-8" as the destination charset and returns
// Convert's results unchanged.
func ToUTF8(srcCharset Charset, src string) (dst string, err error) {
	const target Charset = "UTF-8"
	dst, err = Convert(target, srcCharset, src)
	return dst, err
}

// UTF8To converts the UTF-8 string src to dstCharset.
//
// It delegates to Convert with "UTF-8" as the source charset and returns
// Convert's results unchanged.
func UTF8To(dstCharset Charset, src string) (dst string, err error) {
	const origin Charset = "UTF-8"
	dst, err = Convert(dstCharset, origin, src)
	return dst, err
}

// getEncoding resolves charset to an encoding.Encoding.
//
// The name is first replaced by its charsetAlias entry, if one exists, and is
// then looked up with ianaindex.MIB.Encoding. nil is returned when the lookup
// reports an error or yields a nil encoding.
func getEncoding(charset Charset) encoding.Encoding {
	name := string(charset)
	if canonical, aliased := charsetAlias[name]; aliased {
		name = canonical
	}
	enc, err := ianaindex.MIB.Encoding(name)
	if err != nil || enc == nil {
		return nil
	}
	return enc
}
// main demonstrates the charset helpers: it converts a UTF-8 sample to GBK and
// a GBK sample to UTF-8, printing each result followed by its error value.
func main() {
	// "我是UTF8" encoded as UTF-8.
	utf8Sample := []byte{230, 136, 145, 230, 152, 175, 85, 84, 70, 56}
	// "我是GBK" encoded as GBK.
	gbkSample := []byte{206, 210, 202, 199, 71, 66, 75}

	asGBK, encErr := charset.UTF8To(charset.GBK, string(utf8Sample))
	fmt.Println(asGBK, encErr)

	asUTF8, decErr := charset.ToUTF8(charset.GBK, string(gbkSample))
	fmt.Println(asUTF8, decErr)
}

编码猜测

golang.org/x/net/html/charset提供了NewReader、DetermineEncoding等多个方法。
DetermineEncoding最多检查内容的前1024个字节,并结合声明的Content-Type来推断编码格式。

import (
	"fmt"
	"golang.org/x/net/html/charset"
)

// main passes a short GBK-encoded sample to charset.DetermineEncoding and
// prints the three values it returns.
func main() {
	// UTF-8 sample, kept for experimentation:
	// utf8 := []byte{230, 136, 145, 230, 152, 175, 85, 84, 70, 56}
	sample := []byte{206, 210, 202, 199, 71, 66, 75}
	const contentType = "text/html"

	enc, label, isCertain := charset.DetermineEncoding(sample, contentType)
	fmt.Printf("编码:%v\n名称:%s\n确定:%t\n", enc, label, isCertain)
}


参考:
https://www.kancloud.cn/liupengjie/go/999876

 类似资料: