ch36Unicode和字节字符串(python)

>>> ord('a')         # 'a' is a byte with binary value 97 in ASCII
97
>>> hex(97)
'0x61'
>>> chr(97)          # Binary value 97 stands for character 'a'
'a'



>>> 0xC4
196
>>> chr(196)
'Ä'



C:\misc> c:\python30\python

>>> B = b'spam'           # Make a bytes object (8-bit bytes)
>>> S = 'eggs'            # Make a str object (Unicode characters, 8-bit or wider)

>>> type(B), type(S)
(<class 'bytes'>, <class 'str'>)

>>> B                     # Prints as a character string, really sequence of ints
b'spam'
>>> S
'eggs'



>>> B[0], S[0]            # Indexing returns an int for bytes, str for str
(115, 'e')

>>> B[1:], S[1:]          # Slicing makes another bytes or str object
(b'pam', 'ggs')

>>> list(B), list(S)
([115, 112, 97, 109], ['e', 'g', 'g', 's'])     # bytes is really ints



>>> B[0] = 'x'                                  # Both are immutable
TypeError: 'bytes' object does not support item assignment

>>> S[0] = 'x'
TypeError: 'str' object does not support item assignment

>>> B = B"""              # bytes prefix works on single, double, triple quotes
... xxxx
... yyyy
... """
>>> B
b'\nxxxx\nyyyy\n'



>>> S = 'eggs'
>>> S.encode()                          # str to bytes: encode text into raw bytes
b'eggs'

>>> bytes(S, encoding='ascii')          # str to bytes, alternative
b'eggs'

>>> B = b'spam'
>>> B.decode()                          # bytes to str: decode raw bytes into text
'spam'

>>> str(B, encoding='ascii')            # bytes to str, alternative
'spam'



>>> import sys
>>> sys.platform                        # Underlying platform
'win32'
>>> sys.getdefaultencoding()            # Default encoding for str here
'utf-8'

>>> bytes(S)
TypeError: string argument without an encoding

>>> str(B)                              # str without encoding
"b'spam'"                               # A print string, not conversion!
>>> len(str(B))
7
>>> len(str(B, encoding='ascii'))       # Use encoding to convert to str
4



C:\misc> c:\python30\python

>>> ord('X')             # 'X' has binary value 88 in the default encoding
88
>>> chr(88)              # 88 stands for character 'X'
'X'

>>> S = 'XYZ'            # A Unicode string of ASCII text
>>> S
'XYZ'
>>> len(S)               # 3 characters long
3
>>> [ord(c) for c in S]  # 3 characters with integer ordinal values
[88, 89, 90]



>>> S.encode('ascii')    # Values 0..127 in 1 byte (7 bits) each
b'XYZ'
>>> S.encode('latin-1')  # Values 0..255 in 1 byte (8 bits) each
b'XYZ'
>>> S.encode('utf-8')    # Values 0..127 in 1 byte, 128..2047 in 2, others 3 or 4
b'XYZ'



>>> S.encode('latin-1')[0]
88
>>> list(S.encode('latin-1'))
[88, 89, 90]



>>> chr(0xc4)            # 0xC4, 0xE8: characters outside ASCII's range
'Ä'
>>> chr(0xe8)
'è'

>>> S = '\xc4\xe8'       # Single byte 8-bit hex escapes
>>> S
'Äè'

>>> S = '\u00c4\u00e8'   # 16-bit Unicode escapes
>>> S
'Äè'
>>> len(S)               # 2 characters long (not number of bytes!)
2



>>> S = '\u00c4\u00e8'
>>> S
'Äè'
>>> len(S)
2

>>> S.encode('ascii')
UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1:
ordinal not in range(128)

>>> S.encode('latin-1')              # One byte per character
b'\xc4\xe8'

>>> S.encode('utf-8')                # Two bytes per character
b'\xc3\x84\xc3\xa8'

>>> len(S.encode('latin-1'))         # 2 bytes in latin-1, 4 in utf-8
2
>>> len(S.encode('utf-8'))
4



>>> B = b'\xc4\xe8'
>>> B
b'\xc4\xe8'
>>> len(B)                           # 2 raw bytes, 2 characters
2
>>> B.decode('latin-1')              # Decode to latin-1 text
'Äè'

>>> B = b'\xc3\x84\xc3\xa8'
>>> len(B)                           # 4 raw bytes
4
>>> B.decode('utf-8')
'Äè'
>>> len(B.decode('utf-8'))           # 2 Unicode characters
2



>>> S = 'A\u00c4B\U000000e8C'
>>> S                                # A, B, C, and 2 non-ASCII characters
'AÄBèC'
>>> len(S)                           # 5 characters long
5

>>> S.encode('latin-1')
b'A\xc4B\xe8C'
>>> len(S.encode('latin-1'))         # 5 bytes in latin-1
5

>>> S.encode('utf-8')
b'A\xc3\x84B\xc3\xa8C'
>>> len(S.encode('utf-8'))           # 7 bytes in utf-8
7



>>> S
'AÄBèC'
>>> S.encode('cp500')                # Two other Western European encodings
b'\xc1c\xc2T\xc3'
>>> S.encode('cp850')                # 5 bytes each
b'A\x8eB\x8aC'

>>> S = 'spam'                       # ASCII text is the same in most
>>> S.encode('latin-1')
b'spam'
>>> S.encode('utf-8')
b'spam'
>>> S.encode('cp500')                # But not in cp500: IBM EBCDIC!
b'\xa2\x97\x81\x94'
>>> S.encode('cp850')
b'spam'



>>> S = 'A' + chr(0xC4) + 'B' + chr(0xE8) + 'C'
>>> S
'AÄBèC'



>>> S = 'A\xC4B\xE8C'                # str recognizes hex and Unicode escapes
>>> S
'AÄBèC'

>>> S = 'A\u00C4B\U000000E8C'
>>> S
'AÄBèC'

>>> B = b'A\xC4B\xE8C'               # bytes recognizes hex but not Unicode
>>> B
b'A\xc4B\xe8C'

>>> B = b'A\u00C4B\U000000E8C'       # Escape sequences taken literally!
>>> B
b'A\\u00C4B\\U000000E8C'

>>> B = b'A\xC4B\xE8C'               # Use hex escapes for bytes
>>> B                                # Prints non-ASCII as hex
b'A\xc4B\xe8C'
>>> print(B)
b'A\xc4B\xe8C'
>>> B.decode('latin-1')              # Decode as latin-1 to interpret as text
'AÄBèC'



>>> S = 'AÄBèC'                      # Chars from UTF-8 if no encoding declaration
>>> S
'AÄBèC'

>>> B = b'AÄBèC'
SyntaxError: bytes can only contain ASCII literal characters.

>>> B = b'A\xC4B\xE8C'               # Chars must be ASCII, or escapes
>>> B
b'A\xc4B\xe8C'
>>> B.decode('latin-1')
'AÄBèC'

>>> S.encode()                       # Source code encoded per UTF-8 by default
b'A\xc3\x84B\xc3\xa8C'               # Uses system default to encode, unless passed
>>> S.encode('utf-8')
b'A\xc3\x84B\xc3\xa8C'

>>> B.decode()                       # Raw bytes do not correspond to utf-8
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 1-2: ...



>>> S = 'AÄBèC'
>>> S
'AÄBèC'
>>> S.encode()                       # Default utf-8 encoding
b'A\xc3\x84B\xc3\xa8C'

>>> T = S.encode('cp500')            # Convert to EBCDIC
>>> T
b'\xc1c\xc2T\xc3'

>>> U = T.decode('cp500')            # Convert back to Unicode
>>> U
'AÄBèC'

>>> U.encode()                       # Default utf-8 encoding again
b'A\xc3\x84B\xc3\xa8C'



C:\misc> c:\python26\python
>>> import sys
>>> sys.version
'2.6 (r26:66721, Oct  2 2008, 11:35:03) [MSC v.1500 32 bit (Intel)]'

>>> S = 'A\xC4B\xE8C'                # String of 8-bit bytes
>>> print S                          # Some are non-ASCII
AÄBèC

>>> S.decode('latin-1')              # Decode byte to latin-1 Unicode
u'A\xc4B\xe8C'

>>> S.decode('utf-8')                # Not formatted as utf-8
UnicodeDecodeError: 'utf8' codec can't decode bytes in position 1-2: invalid data

>>> S.decode('ascii')                # Outside ASCII range
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc4 in position 1: ordinal
not in range(128)



>>> U = u'A\xC4B\xE8C'               # Make Unicode string, hex escapes
>>> U
u'A\xc4B\xe8C'
>>> print U
AÄBèC



>>> U.encode('latin-1')              # Encode per latin-1: 8-bit bytes
'A\xc4B\xe8C'
>>> U.encode('utf-8')                # Encode per utf-8: multibyte
'A\xc3\x84B\xc3\xa8C'



C:\misc> c:\python26\python
>>> U = u'A\xC4B\xE8C'               # Hex escapes for non-ASCII
>>> U
u'A\xc4B\xe8C'
>>> print U
AÄBèC

>>> U = u'A\u00C4B\U000000E8C'       # Unicode escapes for non-ASCII
>>> U                                # u'' = 16 bits, U'' = 32 bits
u'A\xc4B\xe8C'
>>> print U
AÄBèC

>>> S = 'A\xC4B\xE8C'                # Hex escapes work
>>> S
'A\xc4B\xe8C'
>>> print S                          # But some print oddly, unless decoded
A-BFC
>>> print S.decode('latin-1')
AÄBèC

>>> S = 'A\u00C4B\U000000E8C'        # Not Unicode escapes: taken literally!
>>> S
'A\\u00C4B\\U000000E8C'
>>> print S
A\u00C4B\U000000E8C
>>> len(S)
19



>>> u'ab' + 'cd'                     # Can mix if compatible in 2.6
u'abcd'                              # 'ab' + b'cd' not allowed in 3.0



>>> str(u'spam')                     # Unicode to normal
'spam'
>>> unicode('spam')                  # Normal to Unicode
u'spam'



>>> S = 'A\xC4B\xE8C'                # Can't mix if incompatible
>>> U = u'A\xC4B\xE8C'
>>> S + U
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc4 in position 1: ordinal
not in range(128)

>>> S.decode('latin-1') + U          # Manual conversion still required
u'A\xc4B\xe8CA\xc4B\xe8C'

>>> print S.decode('latin-1') + U
AÄBèCAÄBèC



# -*- coding: latin-1 -*-


### file: text.py

# -*- coding: latin-1 -*-

# Any of the following string literal forms work in latin-1.
# Changing the encoding above to either ascii or utf-8 fails,
# because the 0xc4 and 0xe8 in myStr1 are not valid in either.

# Form 1: the non-ASCII characters typed directly into the literal.
# (Restored from a mis-decoded rendering; the text is 5 characters long.)
myStr1 = 'aÄBèC'

# Form 2: the same two non-ASCII characters as \u and \U Unicode escapes.
myStr2 = 'A\u00c4B\U000000e8C'

# Form 3: the same text assembled from code points with chr().
myStr3 = 'A' + chr(0xC4) + 'B' + chr(0xE8) + 'C'

import sys
print('Default encoding:', sys.getdefaultencoding())

# For each form, show the character count next to the byte counts produced
# by two different encodings of the same text.
for aStr in myStr1, myStr2, myStr3:
    print('{0}, strlen={1}, '.format(aStr, len(aStr)), end='')

    bytes1 = aStr.encode()              # Per default utf-8: 2 bytes for non-ASCII
    bytes2 = aStr.encode('latin-1')     # One byte per char
   #bytes3 = aStr.encode('ascii')       # ASCII fails: outside 0..127 range

    print('byteslen1={0}, byteslen2={1}'.format(len(bytes1), len(bytes2)))



C:\misc> c:\python30\python text.py
Default encoding: utf-8
aÄBèC, strlen=5, byteslen1=7, byteslen2=5
AÄBèC, strlen=5, byteslen1=7, byteslen2=5
AÄBèC, strlen=5, byteslen1=7, byteslen2=5



C:\misc> c:\python30\python

# Attributes unique to str

>>> set(dir('abc')) - set(dir(b'abc'))
{'isprintable', 'format', '__mod__', 'encode', 'isidentifier',
'_formatter_field_name_split', 'isnumeric', '__rmod__', 'isdecimal',
'_formatter_parser', 'maketrans'}

# Attributes unique to bytes

>>> set(dir(b'abc')) - set(dir('abc'))
{'decode', 'fromhex'}



>>> B = b'spam'                    # b'...' bytes literal
>>> B.find(b'pa')
1

>>> B.replace(b'pa', b'XY')        # bytes methods expect bytes arguments
b'sXYm'

>>> B.split(b'pa')
[b's', b'm']

>>> B
b'spam'

>>> B[0] = 'x'
TypeError: 'bytes' object does not support item assignment



>>> b'%s' % 99
TypeError: unsupported operand type(s) for %: 'bytes' and 'int'

>>> '%s' % 99
'99'

>>> b'{0}'.format(99)
AttributeError: 'bytes' object has no attribute 'format'

>>> '{0}'.format(99)
'99'



>>> B = b'spam'                  # A sequence of small ints
>>> B                            # Prints as ASCII characters
b'spam'

>>> B[0]                         # Indexing yields an int
115
>>> B[-1]
109

>>> chr(B[0])                    # Show character for int
's'
>>> list(B)                      # Show all the byte's int values
[115, 112, 97, 109]

>>> B[1:], B[:-1]
(b'pam', b'spa')

>>> len(B)
4

>>> B + b'lmn'
b'spamlmn'
>>> B * 4
b'spamspamspamspam'



>>> B = b'abc'
>>> B
b'abc'

>>> B = bytes('abc', 'ascii')
>>> B
b'abc'

>>> ord('a')
97
>>> B = bytes([97, 98, 99])
>>> B
b'abc'

>>> B = 'spam'.encode()          # Or bytes()
>>> B
b'spam'
>>>
>>> S = B.decode()               # Or str()
>>> S
'spam'



# Must pass expected types to function and method calls

>>> B = b'spam'

>>> B.replace('pa', 'XY')
TypeError: expected an object with the buffer interface

>>> B.replace(b'pa', b'XY')
b'sXYm'

>>> B = B'spam'
>>> B.replace(bytes('pa'), bytes('xy'))
TypeError: string argument without an encoding

>>> B.replace(bytes('pa', 'ascii'), bytes('xy', 'utf-8'))
b'sxym'


# Must convert manually in mixed-type expressions

>>> b'ab' + 'cd'
TypeError: can't concat bytes to str

>>> b'ab'.decode() + 'cd'                   # bytes to str
'abcd'

>>> b'ab' + 'cd'.encode()                   # str to bytes
b'abcd'

>>> b'ab' + bytes('cd', 'ascii')            # str to bytes
b'abcd'



# Creation in 2.6: a mutable sequence of small (0..255) ints

>>> S = 'spam'
>>> C = bytearray(S)                      # A back-port from 3.0 in 2.6
>>> C                                     # b'..' == '..' in 2.6 (str)
bytearray(b'spam')



# Creation in 3.0: text/binary do not mix

>>> S = 'spam'
>>> C = bytearray(S)
TypeError: string argument without an encoding

>>> C = bytearray(S, 'latin1')            # A content-specific type in 3.0
>>> C
bytearray(b'spam')

>>> B = b'spam'                           # b'..' != '..' in 3.0 (bytes/str)
>>> C = bytearray(B)
>>> C
bytearray(b'spam')



# Mutable, but must assign ints, not strings

>>> C[0]
115

>>> C[0] = 'x'                              # This and the next work in 2.6
TypeError: an integer is required

>>> C[0] = b'x'
TypeError: an integer is required

>>> C[0] = ord('x')
>>> C
bytearray(b'xpam')

>>> C[1] = b'Y'[0]
>>> C
bytearray(b'xYam')



# Methods overlap with both str and bytes, but also has list's mutable methods

>>> set(dir(b'abc')) - set(dir(bytearray(b'abc')))
{'__getnewargs__'}

>>> set(dir(bytearray(b'abc'))) - set(dir(b'abc'))
{'insert', '__alloc__', 'reverse', 'extend', '__delitem__', 'pop', '__setitem__'
, '__iadd__', 'remove', 'append', '__imul__'}



# Mutable method calls

>>> C
bytearray(b'xYam')

>>> C.append(b'LMN')                        # 2.6 requires string of size 1
TypeError: an integer is required

>>> C.append(ord('L'))
>>> C
bytearray(b'xYamL')

>>> C.extend(b'MNO')
>>> C
bytearray(b'xYamLMNO')



# Sequence operations and string methods

>>> C + b'!#'
bytearray(b'xYamLMNO!#')

>>> C[0]
120

>>> C[1:]
bytearray(b'YamLMNO')

>>> len(C)
8

>>> C
bytearray(b'xYamLMNO')

>>> C.replace('xY', 'sp')                            # This works in 2.6
TypeError: Type str doesn't support the buffer API

>>> C.replace(b'xY', b'sp')
bytearray(b'spamLMNO')

>>> C
bytearray(b'xYamLMNO')

>>> C * 4
bytearray(b'xYamLMNOxYamLMNOxYamLMNOxYamLMNO')



# Binary versus text

>>> B                                # B is same as S in 2.6
b'spam'
>>> list(B)
[115, 112, 97, 109]

>>> C
bytearray(b'xYamLMNO')
>>> list(C)
[120, 89, 97, 109, 76, 77, 78, 79]

>>> S
'spam'
>>> list(S)
['s', 'p', 'a', 'm']



C:\misc> c:\python30\python

# Basic text files (and strings) work the same as in 2.X

>>> file = open('temp', 'w')
>>> size = file.write('abc\n')       # Returns number of bytes written
>>> file.close()                     # Manual close to flush output buffer

>>> file = open('temp')              # Default mode is "r" (== "rt"): text input
>>> text = file.read()
>>> text
'abc\n'
>>> print(text)
abc



C:\misc> c:\python26\python
>>> open('temp', 'w').write('abd\n')         # Write in text mode: adds \r
>>> open('temp', 'r').read()                 # Read in text mode: drops \r
'abd\n'
>>> open('temp', 'rb').read()                # Read in binary mode: verbatim
'abd\r\n'

>>> open('temp', 'wb').write('abc\n')        # Write in binary mode
>>> open('temp', 'r').read()                 # \n not expanded to \r\n
'abc\n'
>>> open('temp', 'rb').read()
'abc\n'



C:\misc> c:\python30\python

# Write and read a text file

>>> open('temp', 'w').write('abc\n')         # Text mode output, provide a str
4

>>> open('temp', 'r').read()                 # Text mode input, returns a str
'abc\n'

>>> open('temp', 'rb').read()                # Binary mode input, returns a bytes
b'abc\r\n'



# Write and read a binary file

>>> open('temp', 'wb').write(b'abc\n')       # Binary mode output, provide a bytes
4

>>> open('temp', 'r').read()                 # Text mode input, returns a str
'abc\n'

>>> open('temp', 'rb').read()                # Binary mode input, returns a bytes
b'abc\n'



# Write and read truly binary data

>>> open('temp', 'wb').write(b'a\x00c')      # Provide a bytes
3

>>> open('temp', 'r').read()                 # Receive a str
'a\x00c'

>>> open('temp', 'rb').read()                # Receive a bytes
b'a\x00c'



# bytearrays work too

>>> BA = bytearray(b'\x01\x02\x03')

>>> open('temp', 'wb').write(BA)
3

>>> open('temp', 'r').read()
'\x01\x02\x03'

>>> open('temp', 'rb').read()
b'\x01\x02\x03'



# Types are not flexible for file content

>>> open('temp', 'w').write('abc\n')         # Text mode makes and requires str
4
>>> open('temp', 'w').write(b'abc\n')
TypeError: can't write bytes to text stream

>>> open('temp', 'wb').write(b'abc\n')       # Binary mode makes and requires bytes
4
>>> open('temp', 'wb').write('abc\n')
TypeError: can't write str to binary stream



# Can't read truly binary data in text mode

>>> chr(0xFF)                                   # FF is a valid char, FE is not
''
>>> chr(0xFE)
UnicodeEncodeError: 'charmap' codec can't encode character '\xfe' in position 1...

>>> open('temp', 'w').write(b'\xFF\xFE\xFD')    # Can't use arbitrary bytes!
TypeError: can't write bytes to text stream

>>> open('temp', 'w').write('\xFF\xFE\xFD')     # Can write if embeddable in str
3
>>> open('temp', 'wb').write(b'\xFF\xFE\xFD')   # Can also write in binary mode
3

>>> open('temp', 'rb').read()                   # Can always read as binary bytes
b'\xff\xfe\xfd'

>>> open('temp', 'r').read()                    # Can't read text unless decodable!
UnicodeEncodeError: 'charmap' codec can't encode characters in position 2-3: ...



C:\misc> c:\python30\python
>>> S = 'A\xc4B\xe8C'           # 5-character string, non-ASCII
>>> S
'AÄBèC'
>>> len(S)
5



# Encode manually with methods

>>> L = S.encode('latin-1')     # 5 bytes when encoded as latin-1
>>> L
b'A\xc4B\xe8C'
>>> len(L)
5

>>> U = S.encode('utf-8')       # 7 bytes when encoded as utf-8
>>> U
b'A\xc3\x84B\xc3\xa8C'
>>> len(U)
7



# Encoding automatically when written

>>> open('latindata', 'w', encoding='latin-1').write(S)    # Write as latin-1
5
>>> open('utf8data', 'w', encoding='utf-8').write(S)       # Write as utf-8
5

>>> open('latindata', 'rb').read()                         # Read raw bytes
b'A\xc4B\xe8C'

>>> open('utf8data', 'rb').read()                          # Different in files
b'A\xc3\x84B\xc3\xa8C'



# Decoding automatically when read

>>> open('latindata', 'r', encoding='latin-1').read()      # Decoded on input
'AÄBèC'
>>> open('utf8data', 'r', encoding='utf-8').read()         # Per encoding type
'AÄBèC'

>>> X = open('latindata', 'rb').read()                     # Manual decoding:
>>> X.decode('latin-1')                                    # Not necessary
'AÄBèC'
>>> X = open('utf8data', 'rb').read()
>>> X.decode()                                             # UTF-8 is default
'AÄBèC'



>>> file = open('python.exe', 'r')
>>> text = file.read()
UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 2: ...

>>> file = open('python.exe', 'rb')
>>> data = file.read()
>>> data[:20]
b'MZ\x90\x00\x03\x00\x00\x00\x04\x00\x00\x00\xff\xff\x00\x00\xb8\x00\x00\x00'



c:\misc> C:\Python30\python               # File saved in Notepad
>>> import sys
>>> sys.getdefaultencoding()
'utf-8'
>>> open('spam.txt', 'rb').read()         # ASCII (UTF-8) text file
b'spam\r\nSPAM\r\n'
>>> open('spam.txt', 'r').read()          # Text mode translates line-end
'spam\nSPAM\n'
>>> open('spam.txt', 'r', encoding='utf-8').read()
'spam\nSPAM\n'



>>> open('spam.txt', 'rb').read()         # UTF-8 with 3-byte BOM
b'\xef\xbb\xbfspam\r\nSPAM\r\n'
>>> open('spam.txt', 'r').read()
'ï»¿spam\nSPAM\n'
>>> open('spam.txt', 'r', encoding='utf-8').read()
'\ufeffspam\nSPAM\n'
>>> open('spam.txt', 'r', encoding='utf-8-sig').read()
'spam\nSPAM\n'



>>> open('spam.txt', 'rb').read()
b'\xfe\xff\x00s\x00p\x00a\x00m\x00\r\x00\n\x00S\x00P\x00A\x00M\x00\r\x00\n'
>>> open('spam.txt', 'r').read()
UnicodeEncodeError: 'charmap' codec can't encode character '\xfe' in position 1: ...
>>> open('spam.txt', 'r', encoding='utf-16').read()
'spam\nSPAM\n'
>>> open('spam.txt', 'r', encoding='utf-16-be').read()
'\ufeffspam\nSPAM\n'



>>> open('temp.txt', 'w', encoding='utf-8').write('spam\nSPAM\n')
10
>>> open('temp.txt', 'rb').read()                         # No BOM
b'spam\r\nSPAM\r\n'

>>> open('temp.txt', 'w', encoding='utf-8-sig').write('spam\nSPAM\n')
10
>>> open('temp.txt', 'rb').read()                         # Wrote BOM
b'\xef\xbb\xbfspam\r\nSPAM\r\n'

>>> open('temp.txt', 'r').read()
'ï»¿spam\nSPAM\n'
>>> open('temp.txt', 'r', encoding='utf-8').read()        # Keeps BOM
'\ufeffspam\nSPAM\n'
>>> open('temp.txt', 'r', encoding='utf-8-sig').read()    # Skips BOM
'spam\nSPAM\n'



>>> open('temp.txt', 'w').write('spam\nSPAM\n')
10
>>> open('temp.txt', 'rb').read()                         # Data without BOM
b'spam\r\nSPAM\r\n'
>>> open('temp.txt', 'r').read()                          # Any utf-8 works
'spam\nSPAM\n'
>>> open('temp.txt', 'r', encoding='utf-8').read()
'spam\nSPAM\n'
>>> open('temp.txt', 'r', encoding='utf-8-sig').read()
'spam\nSPAM\n'



>>> sys.byteorder
'little'
>>> open('temp.txt', 'w', encoding='utf-16').write('spam\nSPAM\n')
10
>>> open('temp.txt', 'rb').read()
b'\xff\xfes\x00p\x00a\x00m\x00\r\x00\n\x00S\x00P\x00A\x00M\x00\r\x00\n\x00'
>>> open('temp.txt', 'r', encoding='utf-16').read()
'spam\nSPAM\n'

>>> open('temp.txt', 'w', encoding='utf-16-be').write('\ufeffspam\nSPAM\n')
11
>>> open('temp.txt', 'rb').read()
b'\xfe\xff\x00s\x00p\x00a\x00m\x00\r\x00\n\x00S\x00P\x00A\x00M\x00\r\x00\n'
>>> open('temp.txt', 'r', encoding='utf-16').read()
'spam\nSPAM\n'
>>> open('temp.txt', 'r', encoding='utf-16-be').read()
'\ufeffspam\nSPAM\n'



>>> open('temp.txt', 'w', encoding='utf-16-le').write('SPAM')
4
>>> open('temp.txt', 'rb').read()             # OK if BOM not present or expected
b'S\x00P\x00A\x00M\x00'
>>> open('temp.txt', 'r', encoding='utf-16-le').read()
'SPAM'
>>> open('temp.txt', 'r', encoding='utf-16').read()
UnicodeError: UTF-16 stream does not start with BOM



C:\misc> c:\python26\python
>>> S = u'A\xc4B\xe8C'
>>> print S
AÄBèC
>>> len(S)
5
>>> S.encode('latin-1')
'A\xc4B\xe8C'
>>> S.encode('utf-8')
'A\xc3\x84B\xc3\xa8C'

>>> import codecs
>>> codecs.open('latindata', 'w', encoding='latin-1').write(S)
>>> codecs.open('utfdata', 'w', encoding='utf-8').write(S)

>>> open('latindata', 'rb').read()
'A\xc4B\xe8C'
>>> open('utfdata', 'rb').read()
'A\xc3\x84B\xc3\xa8C'


# NOTE: recall from prior examples that you must use "print"
# to display the non-ascii characters in the following in non-hex
# form (repr shows hex, but str displays the characters)


>>> codecs.open('latindata', 'r', encoding='latin-1').read()
u'A\xc4B\xe8C'
>>> codecs.open('utfdata', 'r', encoding='utf-8').read()
u'A\xc4B\xe8C'



C:\misc> c:\python30\python
>>> import re
>>> S = 'Bugger all down here on earth!'               # Line of text
>>> B = b'Bugger all down here on earth!'              # Usually from a file

>>> re.match('(.*) down (.*) on (.*)', S).groups()     # Match line to pattern
('Bugger all', 'here', 'earth!')                       # Matched substrings

>>> re.match(b'(.*) down (.*) on (.*)', B).groups()    # bytes substrings
(b'Bugger all', b'here', b'earth!')



C:\misc> c:\python26\python
>>> import re
>>> S = 'Bugger all down here on earth!'               # Simple text and binary
>>> U = u'Bugger all down here on earth!'              # Unicode text

>>> re.match('(.*) down (.*) on (.*)', S).groups()
('Bugger all', 'here', 'earth!')

>>> re.match('(.*) down (.*) on (.*)', U).groups()
(u'Bugger all', u'here', u'earth!')



C:\misc> c:\python30\python
>>> import re
>>> S = 'Bugger all down here on earth!'
>>> B = b'Bugger all down here on earth!'

>>> re.match('(.*) down (.*) on (.*)', B).groups()
TypeError: can't use a string pattern on a bytes-like object

>>> re.match(b'(.*) down (.*) on (.*)', S).groups()
TypeError: can't use a bytes pattern on a string-like object

>>> re.match(b'(.*) down (.*) on (.*)', bytearray(B)).groups()
(bytearray(b'Bugger all'), bytearray(b'here'), bytearray(b'earth!'))

>>> re.match('(.*) down (.*) on (.*)', bytearray(B)).groups()
TypeError: can't use a string pattern on a bytes-like object



C:\misc> c:\python30\python
>>> from struct import pack
>>> pack('>i4sh', 7, 'spam', 8)         # bytes in 3.0 (8-bit string)
b'\x00\x00\x00\x07spam\x00\x08'

C:\misc> c:\python26\python
>>> from struct import pack
>>> pack('>i4sh', 7, 'spam', 8)         # str in 2.6 (8-bit string)
'\x00\x00\x00\x07spam\x00\x08'



C:\misc> c:\python30\python
>>> import struct
>>> B = struct.pack('>i4sh', 7, 'spam', 8)
>>> B
b'\x00\x00\x00\x07spam\x00\x08'

>>> vals = struct.unpack('>i4sh', B)
>>> vals
(7, b'spam', 8)

>>> vals = struct.unpack('>i4sh', B.decode())
TypeError: 'str' does not have the buffer interface



C:\misc> c:\python30\python

# Write values to a packed binary file

>>> F = open('data.bin', 'wb')                  # Open binary output file
>>> import struct
>>> data = struct.pack('>i4sh', 7, 'spam', 8)   # Create packed binary data
>>> data                                        # bytes in 3.0, not str
b'\x00\x00\x00\x07spam\x00\x08'
>>> F.write(data)                               # Write to the file
10
>>> F.close()

# Read values from a packed binary file

>>> F = open('data.bin', 'rb')                  # Open binary input file
>>> data = F.read()                             # Read bytes
>>> data
b'\x00\x00\x00\x07spam\x00\x08'
>>> values = struct.unpack('>i4sh', data)       # Extract packed binary data
>>> values                                      # Back to Python objects
(7, b'spam', 8)



>>> values                                      # Result of struct.unpack
(7, b'spam', 8)

# Accessing bits of parsed integers

>>> bin(values[0])                              # Can get to bits in ints
'0b111'
>>> values[0] & 0x01                            # Test first (lowest) bit in int
1
>>> values[0] | 0b1010                          # Bitwise or: turn bits on
15
>>> bin(values[0] | 0b1010)                     # 15 decimal is 1111 binary
'0b1111'
>>> bin(values[0] ^ 0b1010)                     # Bitwise xor: off if both true
'0b1101'
>>> bool(values[0] & 0b100)                     # Test if bit 3 is on
True
>>> bool(values[0] & 0b1000)                    # Test if bit 4 is set
False



# Accessing bytes of parsed strings and bits within them

>>> values[1]
b'spam'
>>> values[1][0]                          # bytes string: sequence of ints
115
>>> values[1][1:]                         # Prints as ASCII characters
b'pam'
>>> bin(values[1][0])                     # Can get to bits of bytes in strings
'0b1110011'
>>> bin(values[1][0] | 0b1100)            # Turn bits on
'0b1111111'
>>> values[1][0] | 0b1100
127



C:\misc> C:\Python30\python
>>> import pickle                          # dumps() returns pickle string

>>> pickle.dumps([1, 2, 3])                # Python 3.0 default protocol=3=binary
b'\x80\x03]q\x00(K\x01K\x02K\x03e.'

>>> pickle.dumps([1, 2, 3], protocol=0)    # ASCII protocol 0, but still bytes!
b'(lp0\nL1L\naL2L\naL3L\na.'



>>> pickle.dump([1, 2, 3], open('temp', 'w'))    # Text files fail on bytes!
TypeError: can't write bytes to text stream      # Despite protocol value

>>> pickle.dump([1, 2, 3], open('temp', 'w'), protocol=0)
TypeError: can't write bytes to text stream

>>> pickle.dump([1, 2, 3], open('temp', 'wb'))   # Always use binary in 3.0

>>> open('temp', 'r').read()
UnicodeEncodeError: 'charmap' codec can't encode character '\u20ac' in ...



>>> pickle.dump([1, 2, 3], open('temp', 'wb'))
>>> pickle.load(open('temp', 'rb'))
[1, 2, 3]
>>> open('temp', 'rb').read()
b'\x80\x03]q\x00(K\x01K\x02K\x03e.'



C:\misc> c:\python26\python
>>> import pickle
>>> pickle.dumps([1, 2, 3])                      # Python 2.6 default=0=ASCII
'(lp0\nI1\naI2\naI3\na.'

>>> pickle.dumps([1, 2, 3], protocol=1)
']q\x00(K\x01K\x02K\x03e.'

>>> pickle.dump([1, 2, 3], open('temp', 'w'))    # Text mode works in 2.6
>>> pickle.load(open('temp'))
[1, 2, 3]
>>> open('temp').read()
'(lp0\nI1\naI2\naI3\na.'



>>> import pickle
>>> pickle.dump([1, 2, 3], open('temp', 'wb'))     # Version neutral
>>> pickle.load(open('temp', 'rb'))                # And required in 3.0
[1, 2, 3]



### file: mybooks.xml

<books>
    <date>2009</date>
    <title>Learning Python</title>
    <title>Programming Python</title>
    <title>Python Pocket Reference</title>
    <publisher>O'Reilly Media</publisher>
</books>



#### File: patternparse.py

import re

# Read the whole document; the with-block closes the file as soon as the
# text has been read instead of leaving the handle open.
with open('mybooks.xml') as xmlfile:
    text = xmlfile.read()

# Non-greedy group: stop at the first closing tag, so two <title> elements
# on the same line are reported separately rather than as one long match.
found = re.findall(r'<title>(.*?)</title>', text)
for title in found:
    print(title)



### File: domparse.py

from xml.dom.minidom import parse, Node

# Build a DOM tree from the file, then print the data of every text node
# that is a child of a <title> element.
xmltree = parse('mybooks.xml')
title_elements = xmltree.getElementsByTagName('title')
for node1 in title_elements:
    text_children = [child for child in node1.childNodes
                     if child.nodeType == Node.TEXT_NODE]
    for node2 in text_children:
        print(node2.data)



### File: saxparse.py

import xml.sax.handler
class BookHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that prints character data found inside <title>.

    The handler keeps one flag, ``inTitle``, which is switched on when a
    ``title`` start tag is reported and off again at the matching end tag.
    Any character data delivered while the flag is on is printed.
    """

    def __init__(self):
        # Run the base-class initializer so the handler state it sets up
        # exists too. The explicit form is used rather than super() so the
        # class does not depend on ContentHandler being a new-style class.
        xml.sax.handler.ContentHandler.__init__(self)
        self.inTitle = False

    def startElement(self, name, attributes):
        """Note that a <title> element has been entered."""
        if name == 'title':
            self.inTitle = True

    def characters(self, data):
        """Print *data* when it is delivered inside a <title> element."""
        if self.inTitle:
            print(data)

    def endElement(self, name):
        """Note that the current <title> element has been left."""
        if name == 'title':
            self.inTitle = False

import xml.sax

# Create the handler first, attach it to a freshly made parser, then run the
# parse; the handler prints as the parser reports events to it.
handler = BookHandler()
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse('mybooks.xml')



### File: etreeparse.py

from xml.etree.ElementTree import parse

# Parse the file into an element tree and print the text of each element
# returned by findall('title').
tree = parse('mybooks.xml')
title_elements = tree.findall('title')
for E in title_elements:
    print(E.text)



C:\misc> c:\python26\python domparse.py
Learning Python
Programming Python
Python Pocket Reference

C:\misc> c:\python30\python domparse.py
Learning Python
Programming Python
Python Pocket Reference



C:\misc> c:\python30\python
>>> from xml.dom.minidom import parse, Node
>>> xmltree = parse('mybooks.xml')
>>> for node in xmltree.getElementsByTagName('title'):
...     for node2 in node.childNodes:
...         if node2.nodeType == Node.TEXT_NODE:
...             node2.data
...
'Learning Python'
'Programming Python'
'Python Pocket Reference'

C:\misc> c:\python26\python
>>> ...same code...
...
u'Learning Python'
u'Programming Python'
u'Python Pocket Reference'




评论

此博客中的热门博文

OAuth 2教程

网格策略

apt-get详细使用