1
- # The contents of this file has been derived code from the Twisted project
2
- # (http://twistedmatrix.com/). The original author is Jp Calderone.
3
-
4
- # Twisted project license follows:
5
-
6
- # Permission is hereby granted, free of charge, to any person obtaining
7
- # a copy of this software and associated documentation files (the
8
- # "Software"), to deal in the Software without restriction, including
9
- # without limitation the rights to use, copy, modify, merge, publish,
10
- # distribute, sublicense, and/or sell copies of the Software, and to
11
- # permit persons to whom the Software is furnished to do so, subject to
12
- # the following conditions:
1
+ # This file contains two main methods used to encode and decode UTF-7
2
+ # strings, as described in RFC 3501. There are some variations specific
3
+ # to IMAP4rev1, so the built-in Python UTF-7 codec can't be used instead.
13
4
#
14
- # The above copyright notice and this permission notice shall be
15
- # included in all copies or substantial portions of the Software.
16
- #
17
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
- # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
- # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
- # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
-
5
+ # The main difference is the shift character (used to switch from ASCII to
6
+ # base64 encoding context), which is & in this modified UTF-7 convention,
7
+ # since + is commonly used in mailbox names.
8
+ # Other variations and examples can be found in RFC 3501, section 5.1.3.
25
9
from __future__ import unicode_literals
26
10
11
+ import binascii
27
12
from six import binary_type , text_type , byte2int , iterbytes , unichr
28
13
29
14
30
- PRINTABLE = set (range (0x20 , 0x26 )) | set (range (0x27 , 0x7f ))
31
-
32
- # TODO: module needs refactoring (e.g. variable names suck)
33
-
34
-
35
15
def encode(s):
    """Encode a folder name using IMAP modified UTF-7 encoding.

    Printable ASCII characters (0x20-0x7e) are emitted unchanged, except
    for ``&`` which is escaped as ``&-``. Every run of other characters is
    emitted as ``&`` + modified base64 + ``-``.

    If *s* is not a text (unicode) string it is returned unchanged;
    otherwise the encoded name is returned as bytes.
    """
    if not isinstance(s, text_type):
        return s

    res = []
    b64_buffer = []

    def consume_b64_buffer(buf):
        """Flush *buf* into ``res`` as a shifted base64 section.

        The buffered characters are encoded with ``base64_utf7_encode`` and
        wrapped between the shift characters ``&`` and ``-``. The buffer is
        emptied in place. Nothing is emitted when the buffer is empty.
        """
        # Test the buffer that was actually passed in, not the closed-over
        # name, so the helper stays correct whatever buffer it is given.
        if buf:
            res.extend([b'&', base64_utf7_encode(buf), b'-'])
            del buf[:]

    for c in s:
        if 0x20 <= ord(c) <= 0x7e:
            # Printable ASCII ends any pending base64 section.
            consume_b64_buffer(b64_buffer)
            if c == '&':
                # '&' is the shift character, so a literal one is escaped.
                res.append(b'&-')
            else:
                res.append(c.encode('ascii'))
        else:
            # Buffer the character; it is encoded when the next printable
            # ASCII character or the end of the string is reached.
            b64_buffer.append(c)

    # Flush the trailing section if the string ends outside printable ASCII.
    consume_b64_buffer(b64_buffer)

    return b''.join(res)
65
54
66
55
67
56
AMPERSAND_ORD = byte2int (b'&' )
@@ -75,35 +64,43 @@ def decode(s):
75
64
unicode. If non-bytes/str input is provided, the input is returned
76
65
unchanged.
77
66
"""
78
-
79
67
if not isinstance (s , binary_type ):
80
68
return s
81
69
82
- r = []
83
- _in = bytearray ()
70
+ res = []
71
+ # Store base64 substring that will be decoded once stepping on end shift character
72
+ b64_buffer = bytearray ()
84
73
for c in iterbytes (s ):
85
- if c == AMPERSAND_ORD and not _in :
86
- _in .append (c )
87
- elif c == DASH_ORD and _in :
88
- if len (_in ) == 1 :
89
- r .append ('&' )
74
+ # Shift character without anything in buffer -> starts storing base64 substring
75
+ if c == AMPERSAND_ORD and not b64_buffer :
76
+ b64_buffer .append (c )
77
+ # End shift char. -> append the decoded buffer to the result and reset it
78
+ elif c == DASH_ORD and b64_buffer :
79
+ # Special case &-, representing "&" escaped
80
+ if len (b64_buffer ) == 1 :
81
+ res .append ('&' )
90
82
else :
91
- r .append (modified_deutf7 (_in [1 :]))
92
- _in = bytearray ()
93
- elif _in :
94
- _in .append (c )
83
+ res .append (base64_utf7_decode (b64_buffer [1 :]))
84
+ b64_buffer = bytearray ()
85
+ # Still buffering between the shift character and the shift back to ASCII
86
+ elif b64_buffer :
87
+ b64_buffer .append (c )
88
+ # No buffer initialized yet, should be an ASCII printable char
95
89
else :
96
- r .append (unichr (c ))
97
- if _in :
98
- r .append (modified_deutf7 (_in [1 :]))
99
- return '' .join (r )
90
+ res .append (unichr (c ))
91
+
92
+ # Decode the remaining buffer if any
93
+ if b64_buffer :
94
+ res .append (base64_utf7_decode (b64_buffer [1 :]))
95
+
96
+ return '' .join (res )
100
97
101
98
102
def base64_utf7_encode(buffer):
    """Return the modified base64 form of the buffered characters.

    *buffer* is an iterable of text fragments (``encode`` passes a list of
    single characters). The fragments are joined, encoded as UTF-16 big
    endian and then base64-encoded. The trailing newline and ``=`` padding
    produced by ``binascii.b2a_base64`` are stripped, and ``/`` is replaced
    by ``,``.

    The surrounding ``&`` and ``-`` shift characters are NOT added here.
    Returns bytes.
    """
    utf16 = ''.join(buffer).encode('utf-16be')
    b64 = binascii.b2a_base64(utf16).rstrip(b'\n=')
    return b64.replace(b'/', b',')
105
102
106
103
107
def base64_utf7_decode(s):
    """Decode one modified base64 section into text.

    *s* is the content of a shifted section without its leading ``&`` and
    trailing ``-`` (``decode`` passes a slice of its bytearray buffer).

    The modified alphabet's ``,`` is mapped back to ``/``. The result is
    then wrapped in the standard UTF-7 shift characters ``+`` and ``-`` so
    that Python's built-in ``utf-7`` codec can decode it.
    """
    s_utf7 = b'+' + s.replace(b',', b'/') + b'-'
    return s_utf7.decode('utf-7')
0 commit comments