codecs — 字符串编码与解码
优质
小牛编辑
124浏览
2023-12-01
初识Unicode
编码
# codecs_to_hex.py
import binascii
def to_hex(t, nbytes):
"""Format text t as a sequence of nbyte long values
separated by spaces.
"""
chars_per_item = nbytes * 2
hex_version = binascii.hexlify(t)
return b' '.join(
hex_version[start:start + chars_per_item]
for start in range(0, len(hex_version), chars_per_item)
)
if __name__ == '__main__':
print(to_hex(b'abcdef', 1))
print(to_hex(b'abcdef', 2))
# codecs_encodings.py
import unicodedata
from codecs_to_hex import to_hex
text = 'français'
print('Raw : {!r}'.format(text))
for c in text:
print(' {!r}: {}'.format(c, unicodedata.name(c, c)))
print('UTF-8 : {!r}'.format(to_hex(text.encode('utf-8'), 1)))
print('UTF-16: {!r}'.format(to_hex(text.encode('utf-16'), 2)))
# codecs_decode.py
from codecs_to_hex import to_hex
text = 'français'
encoded = text.encode('utf-8')
decoded = encoded.decode('utf-8')
print('Original :', repr(text))
print('Encoded :', to_hex(encoded, 1), type(encoded))
print('Decoded :', repr(decoded), type(decoded))
处理文件
# codecs_open_write.py
from codecs_to_hex import to_hex
import codecs
import sys
encoding = sys.argv[1]
filename = encoding + '.txt'
print('Writing to', filename)
with codecs.open(filename, mode='w', encoding=encoding) as f:
f.write('français')
# Determine the byte grouping to use for to_hex()
nbytes = {
'utf-8': 1,
'utf-16': 2,
'utf-32': 4,
}.get(encoding, 1)
# Show the raw bytes in the file
print('File contents:')
with open(filename, mode='rb') as f:
print(to_hex(f.read(), nbytes))
# codecs_open_read.py
import codecs
import sys
encoding = sys.argv[1]
filename = encoding + '.txt'
print('Reading from', filename)
with codecs.open(filename, mode='r', encoding=encoding) as f:
print(repr(f.read()))
字节序
# codecs_bom.py
import codecs
from codecs_to_hex import to_hex
BOM_TYPES = [
'BOM', 'BOM_BE', 'BOM_LE',
'BOM_UTF8',
'BOM_UTF16', 'BOM_UTF16_BE', 'BOM_UTF16_LE',
'BOM_UTF32', 'BOM_UTF32_BE', 'BOM_UTF32_LE',
]
for name in BOM_TYPES:
print('{:12} : {}'.format(
name, to_hex(getattr(codecs, name), 2)))
# codecs_bom_create_file.py
import codecs
from codecs_to_hex import to_hex
# Pick the nonnative version of UTF-16 encoding
if codecs.BOM_UTF16 == codecs.BOM_UTF16_BE:
bom = codecs.BOM_UTF16_LE
encoding = 'utf_16_le'
else:
bom = codecs.BOM_UTF16_BE
encoding = 'utf_16_be'
print('Native order :', to_hex(codecs.BOM_UTF16, 2))
print('Selected order:', to_hex(bom, 2))
# Encode the text.
encoded_text = 'français'.encode(encoding)
print('{:14}: {}'.format(encoding, to_hex(encoded_text, 2)))
with open('nonnative-encoded.txt', mode='wb') as f:
# Write the selected byte-order marker. It is not included
# in the encoded text because the byte order was given
# explicitly when selecting the encoding.
f.write(bom)
# Write the byte string for the encoded text.
f.write(encoded_text)
# codecs_bom_detection.py
import codecs
from codecs_to_hex import to_hex
# Look at the raw data
with open('nonnative-encoded.txt', mode='rb') as f:
raw_bytes = f.read()
print('Raw :', to_hex(raw_bytes, 2))
# Re-open the file and let codecs detect the BOM
with codecs.open('nonnative-encoded.txt',
mode='r',
encoding='utf-16',
) as f:
decoded_text = f.read()
print('Decoded:', repr(decoded_text))
错误处理
编码错误
# codecs_encode_error.py
import codecs
import sys
error_handling = sys.argv[1]
text = 'français'
try:
# Save the data, encoded as ASCII, using the error
# handling mode specified on the command line.
with codecs.open('encode_error.txt', 'w',
encoding='ascii',
errors=error_handling) as f:
f.write(text)
except UnicodeEncodeError as err:
print('ERROR:', err)
else:
# If there was no error writing to the file,
# show what it contains.
with open('encode_error.txt', 'rb') as f:
print('File contents: {!r}'.format(f.read()))
解码错误
# codecs_decode_error.py
import codecs
import sys
from codecs_to_hex import to_hex
error_handling = sys.argv[1]
text = 'français'
print('Original :', repr(text))
# Save the data with one encoding
with codecs.open('decode_error.txt', 'w',
encoding='utf-16') as f:
f.write(text)
# Dump the bytes from the file
with open('decode_error.txt', 'rb') as f:
print('File contents:', to_hex(f.read(), 1))
# Try to read the data with the wrong encoding
with codecs.open('decode_error.txt', 'r',
encoding='utf-8',
errors=error_handling) as f:
try:
data = f.read()
except UnicodeDecodeError as err:
print('ERROR:', err)
else:
print('Read :', repr(data))
编码翻译
# codecs_encodedfile.py
from codecs_to_hex import to_hex
import codecs
import io
# Raw version of the original data.
data = 'français'
# Manually encode it as UTF-8.
utf8 = data.encode('utf-8')
print('Start as UTF-8 :', to_hex(utf8, 1))
# Set up an output buffer, then wrap it as an EncodedFile.
output = io.BytesIO()
encoded_file = codecs.EncodedFile(output, data_encoding='utf-8',
file_encoding='utf-16')
encoded_file.write(utf8)
# Fetch the buffer contents as a UTF-16 encoded byte string
utf16 = output.getvalue()
print('Encoded to UTF-16:', to_hex(utf16, 2))
# Set up another buffer with the UTF-16 data for reading,
# and wrap it with another EncodedFile.
buffer = io.BytesIO(utf16)
encoded_file = codecs.EncodedFile(buffer, data_encoding='utf-8',
file_encoding='utf-16')
# Read the UTF-8 encoded version of the data.
recoded = encoded_file.read()
print('Back to UTF-8 :', to_hex(recoded, 1))
非unicode编码
# codecs_rot13.py
import codecs
import io
buffer = io.StringIO()
stream = codecs.getwriter('rot_13')(buffer)
text = 'abcdefghijklmnopqrstuvwxyz'
stream.write(text)
stream.flush()
print('Original:', text)
print('ROT-13 :', buffer.getvalue())
# codecs_zlib.py
import codecs
import io
from codecs_to_hex import to_hex
buffer = io.BytesIO()
stream = codecs.getwriter('zlib')(buffer)
text = b'abcdefghijklmnopqrstuvwxyz\n' * 50
stream.write(text)
stream.flush()
print('Original length :', len(text))
compressed_data = buffer.getvalue()
print('ZIP compressed :', len(compressed_data))
buffer = io.BytesIO(compressed_data)
stream = codecs.getreader('zlib')(buffer)
first_line = stream.readline()
print('Read first line :', repr(first_line))
uncompressed_data = first_line + stream.read()
print('Uncompressed :', len(uncompressed_data))
print('Same :', text == uncompressed_data)
增量编码
# codecs_incremental_bz2.py
import codecs
import sys
from codecs_to_hex import to_hex
text = b'abcdefghijklmnopqrstuvwxyz\n'
repetitions = 50
print('Text length :', len(text))
print('Repetitions :', repetitions)
print('Expected len:', len(text) * repetitions)
# Encode the text several times to build up a
# large amount of data
encoder = codecs.getincrementalencoder('bz2')()
encoded = []
print()
print('Encoding:', end=' ')
last = repetitions - 1
for i in range(repetitions):
en_c = encoder.encode(text, final=(i == last))
if en_c:
print('\nEncoded : {} bytes'.format(len(en_c)))
encoded.append(en_c)
else:
sys.stdout.write('.')
all_encoded = b''.join(encoded)
print()
print('Total encoded length:', len(all_encoded))
print()
# Decode the byte string one byte at a time
decoder = codecs.getincrementaldecoder('bz2')()
decoded = []
print('Decoding:', end=' ')
for i, b in enumerate(all_encoded):
final = (i + 1) == len(text)
c = decoder.decode(bytes([b]), final)
if c:
print('\nDecoded : {} characters'.format(len(c)))
print('Decoding:', end=' ')
decoded.append(c)
else:
sys.stdout.write('.')
print()
restored = b''.join(decoded)
print()
print('Total uncompressed length:', len(restored))
Unicode数据网络通信
# codecs_socket_fail.py
import sys
import socketserver
class Echo(socketserver.BaseRequestHandler):
def handle(self):
# Get some bytes and echo them back to the client.
data = self.request.recv(1024)
self.request.send(data)
return
if __name__ == '__main__':
import codecs
import socket
import threading
address = ('localhost', 0) # let the kernel assign a port
server = socketserver.TCPServer(address, Echo)
ip, port = server.server_address # what port was assigned?
t = threading.Thread(target=server.serve_forever)
t.setDaemon(True) # don't hang on exit
t.start()
# Connect to the server
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((ip, port))
# Send the data
# WRONG: Not encoded first!
text = 'français'
len_sent = s.send(text)
# Receive a response
response = s.recv(len_sent)
print(repr(response))
# Clean up
s.close()
server.socket.close()
# codecs_socket.py
import sys
import socketserver
class Echo(socketserver.BaseRequestHandler):
def handle(self):
"""Get some bytes and echo them back to the client.
There is no need to decode them, since they are not used.
"""
data = self.request.recv(1024)
self.request.send(data)
class PassThrough:
def __init__(self, other):
self.other = other
def write(self, data):
print('Writing :', repr(data))
return self.other.write(data)
def read(self, size=-1):
print('Reading :', end=' ')
data = self.other.read(size)
print(repr(data))
return data
def flush(self):
return self.other.flush()
def close(self):
return self.other.close()
if __name__ == '__main__':
import codecs
import socket
import threading
address = ('localhost', 0) # let the kernel assign a port
server = socketserver.TCPServer(address, Echo)
ip, port = server.server_address # what port was assigned?
t = threading.Thread(target=server.serve_forever)
t.setDaemon(True) # don't hang on exit
t.start()
# Connect to the server
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((ip, port))
# Wrap the socket with a reader and writer.
read_file = s.makefile('rb')
incoming = codecs.getreader('utf-8')(PassThrough(read_file))
write_file = s.makefile('wb')
outgoing = codecs.getwriter('utf-8')(PassThrough(write_file))
# Send the data
text = 'français'
print('Sending :', repr(text))
outgoing.write(text)
outgoing.flush()
# Receive a response
response = incoming.read()
print('Received:', repr(response))
# Clean up
s.close()
server.socket.close()
定义一个自定义编码
# codecs_invertcaps.py
import string
def invertcaps(text):
"""Return new string with the case of all letters switched.
"""
return ''.join(
c.upper() if c in string.ascii_lowercase
else c.lower() if c in string.ascii_uppercase
else c
for c in text
)
if __name__ == '__main__':
print(invertcaps('ABCdef'))
print(invertcaps('abcDEF'))
# codecs_invertcaps_charmap.py
import codecs
import string
# Map every character to itself
decoding_map = codecs.make_identity_dict(range(256))
# Make a list of pairs of ordinal values for the lower
# and uppercase letters
pairs = list(zip(
[ord(c) for c in string.ascii_lowercase],
[ord(c) for c in string.ascii_uppercase],
))
# Modify the mapping to convert upper to lower and
# lower to upper.
decoding_map.update({
upper: lower
for (lower, upper)
in pairs
})
decoding_map.update({
lower: upper
for (lower, upper)
in pairs
})
# Create a separate encoding map.
encoding_map = codecs.make_encoding_map(decoding_map)
if __name__ == '__main__':
print(codecs.charmap_encode('abcDEF', 'strict',
encoding_map))
print(codecs.charmap_decode(b'abcDEF', 'strict',
decoding_map))
print(encoding_map == decoding_map)
# codecs_invertcaps_error.py
import codecs
from codecs_invertcaps_charmap import encoding_map
text = 'pi: \u03c0'
for error in ['ignore', 'replace', 'strict']:
try:
encoded = codecs.charmap_encode(
text, error, encoding_map)
except UnicodeEncodeError as err:
encoded = str(err)
print('{:7}: {}'.format(error, encoded))
# codecs_register.py
import codecs
import encodings
def search1(encoding):
print('search1: Searching for:', encoding)
return None
def search2(encoding):
print('search2: Searching for:', encoding)
return None
codecs.register(search1)
codecs.register(search2)
utf8 = codecs.lookup('utf-8')
print('UTF-8:', utf8)
try:
unknown = codecs.lookup('no-such-encoding')
except LookupError as err:
print('ERROR:', err)
# codecs_invertcaps_register.py
import codecs
from codecs_invertcaps_charmap import encoding_map, decoding_map
class InvertCapsCodec(codecs.Codec):
"Stateless encoder/decoder"
def encode(self, input, errors='strict'):
return codecs.charmap_encode(input, errors, encoding_map)
def decode(self, input, errors='strict'):
return codecs.charmap_decode(input, errors, decoding_map)
class InvertCapsIncrementalEncoder(codecs.IncrementalEncoder):
def encode(self, input, final=False):
data, nbytes = codecs.charmap_encode(input,
self.errors,
encoding_map)
return data
class InvertCapsIncrementalDecoder(codecs.IncrementalDecoder):
def decode(self, input, final=False):
data, nbytes = codecs.charmap_decode(input,
self.errors,
decoding_map)
return data
class InvertCapsStreamReader(InvertCapsCodec,
codecs.StreamReader):
pass
class InvertCapsStreamWriter(InvertCapsCodec,
codecs.StreamWriter):
pass
def find_invertcaps(encoding):
"""Return the codec for 'invertcaps'.
"""
if encoding == 'invertcaps':
return codecs.CodecInfo(
name='invertcaps',
encode=InvertCapsCodec().encode,
decode=InvertCapsCodec().decode,
incrementalencoder=InvertCapsIncrementalEncoder,
incrementaldecoder=InvertCapsIncrementalDecoder,
streamreader=InvertCapsStreamReader,
streamwriter=InvertCapsStreamWriter,
)
return None
codecs.register(find_invertcaps)
if __name__ == '__main__':
# Stateless encoder/decoder
encoder = codecs.getencoder('invertcaps')
text = 'abcDEF'
encoded_text, consumed = encoder(text)
print('Encoded "{}" to "{}", consuming {} characters'.format(
text, encoded_text, consumed))
# Stream writer
import io
buffer = io.BytesIO()
writer = codecs.getwriter('invertcaps')(buffer)
print('StreamWriter for io buffer: ')
print(' writing "abcDEF"')
writer.write('abcDEF')
print(' buffer contents: ', buffer.getvalue())
# Incremental decoder
decoder_factory = codecs.getincrementaldecoder('invertcaps')
decoder = decoder_factory()
decoded_text_parts = []
for c in encoded_text:
decoded_text_parts.append(
decoder.decode(bytes([c]), final=False)
)
decoded_text_parts.append(decoder.decode(b'', final=True))
decoded_text = ''.join(decoded_text_parts)
print('IncrementalDecoder converted {!r} to {!r}'.format(
encoded_text, decoded_text))