Skip to content

Commit 734c941

Browse files
authored
Merge pull request mjs#300 from mlorant/fix-187-refactor-utf7
Refactor imap_utf7 module to make it easier to understand and fix mjs#187
2 parents 42e1187 + 480127f commit 734c941

File tree

3 files changed

+69
-65
lines changed

3 files changed

+69
-65
lines changed

doc/src/releases.rst

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
Added
88
-----
9-
- Connection and read/write operations timeout can now be distinct,
9+
- Connection and read/write operations timeout can now be distinct,
1010
using `imapclient.SocketTimeout` namedtuple as `timeout` parameter.
1111
- A context manager is introduced to automatically close connections to remote
1212
servers.
@@ -20,6 +20,12 @@ Changed
2020
- More precise exceptions available in `imapclient.exceptions` are raised when
2121
an error happens
2222

23+
Fixed
24+
-----
25+
- The modified UTF-7 encoding function had quirks in its original algorithm,
26+
leading to incorrect encoded output in some cases. The algorithm, described
27+
in RFC 3501, has been reimplemented to fix #187 and is better documented.
28+
2329
Other
2430
-----
2531
- Drop support of OAUTH(1)

imapclient/imap_utf7.py

Lines changed: 61 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,17 @@
1-
# The contents of this file has been derived code from the Twisted project
2-
# (http://twistedmatrix.com/). The original author is Jp Calderone.
3-
4-
# Twisted project license follows:
5-
6-
# Permission is hereby granted, free of charge, to any person obtaining
7-
# a copy of this software and associated documentation files (the
8-
# "Software"), to deal in the Software without restriction, including
9-
# without limitation the rights to use, copy, modify, merge, publish,
10-
# distribute, sublicense, and/or sell copies of the Software, and to
11-
# permit persons to whom the Software is furnished to do so, subject to
12-
# the following conditions:
1+
# This file contains two main methods used to encode and decode UTF-7
2+
# strings, as described in RFC 3501. There are some variations specific
3+
# to IMAP4rev1, so the built-in Python UTF-7 codec can't be used instead.
134
#
14-
# The above copyright notice and this permission notice shall be
15-
# included in all copies or substantial portions of the Software.
16-
#
17-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18-
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19-
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20-
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21-
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22-
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23-
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24-
5+
# The main difference is the shift character (used to switch from ASCII to
6+
# base64 encoding context), which is & in this modified UTF-7 convention,
7+
# since + is commonly used in mailbox names.
8+
# Other variations and examples can be found in the RFC 3501, section 5.1.3.
259
from __future__ import unicode_literals
2610

11+
import binascii
2712
from six import binary_type, text_type, byte2int, iterbytes, unichr
2813

2914

30-
PRINTABLE = set(range(0x20, 0x26)) | set(range(0x27, 0x7f))
31-
32-
# TODO: module needs refactoring (e.g. variable names suck)
33-
34-
3515
def encode(s):
3616
"""Encode a folder name using IMAP modified UTF-7 encoding.
3717
@@ -41,27 +21,36 @@ def encode(s):
4121
if not isinstance(s, text_type):
4222
return s
4323

44-
r = []
45-
_in = []
46-
47-
def extend_result_if_chars_buffered():
48-
if _in:
49-
r.extend([b'&', modified_utf7(''.join(_in)), b'-'])
50-
del _in[:]
24+
res = []
25+
b64_buffer = []
26+
def consume_b64_buffer(buf):
27+
"""
28+
Consume the buffer by encoding it into a modified base 64 representation
29+
and surround it with shift characters & and -
30+
"""
31+
if b64_buffer:
32+
res.extend([b'&', base64_utf7_encode(buf), b'-'])
33+
del buf[:]
5134

5235
for c in s:
53-
if ord(c) in PRINTABLE:
54-
extend_result_if_chars_buffered()
55-
r.append(c.encode('latin-1'))
56-
elif c == '&':
57-
extend_result_if_chars_buffered()
58-
r.append(b'&-')
36+
# Printable ASCII characters are emitted unchanged (except &, escaped below)
37+
if 0x20 <= ord(c) <= 0x7e:
38+
consume_b64_buffer(b64_buffer)
39+
# Special case: & is used as shift character so we need to escape it in ASCII
40+
if c == '&':
41+
res.append(b'&-')
42+
else:
43+
res.append(c.encode('ascii'))
44+
45+
# Buffer characters that must be encoded in base64; they are appended
46+
# to the result when a printable ASCII character or the end of the string is reached
5947
else:
60-
_in.append(c)
48+
b64_buffer.append(c)
6149

62-
extend_result_if_chars_buffered()
50+
# Consume the remaining buffer if the string ends with characters that require base64 encoding
51+
consume_b64_buffer(b64_buffer)
6352

64-
return b''.join(r)
53+
return b''.join(res)
6554

6655

6756
AMPERSAND_ORD = byte2int(b'&')
@@ -75,35 +64,43 @@ def decode(s):
7564
unicode. If non-bytes/str input is provided, the input is returned
7665
unchanged.
7766
"""
78-
7967
if not isinstance(s, binary_type):
8068
return s
8169

82-
r = []
83-
_in = bytearray()
70+
res = []
71+
# Holds the base64 substring, which is decoded once the end shift character (-) is reached
72+
b64_buffer = bytearray()
8473
for c in iterbytes(s):
85-
if c == AMPERSAND_ORD and not _in:
86-
_in.append(c)
87-
elif c == DASH_ORD and _in:
88-
if len(_in) == 1:
89-
r.append('&')
74+
# Shift character without anything in buffer -> starts storing base64 substring
75+
if c == AMPERSAND_ORD and not b64_buffer:
76+
b64_buffer.append(c)
77+
# End shift char. -> append the decoded buffer to the result and reset it
78+
elif c == DASH_ORD and b64_buffer:
79+
# Special case &-, representing "&" escaped
80+
if len(b64_buffer) == 1:
81+
res.append('&')
9082
else:
91-
r.append(modified_deutf7(_in[1:]))
92-
_in = bytearray()
93-
elif _in:
94-
_in.append(c)
83+
res.append(base64_utf7_decode(b64_buffer[1:]))
84+
b64_buffer = bytearray()
85+
# Still buffering between the shift character and the shift back to ASCII
86+
elif b64_buffer:
87+
b64_buffer.append(c)
88+
# No buffer initialized yet, should be an ASCII printable char
9589
else:
96-
r.append(unichr(c))
97-
if _in:
98-
r.append(modified_deutf7(_in[1:]))
99-
return ''.join(r)
90+
res.append(unichr(c))
91+
92+
# Decode the remaining buffer if any
93+
if b64_buffer:
94+
res.append(base64_utf7_decode(b64_buffer[1:]))
95+
96+
return ''.join(res)
10097

10198

102-
def modified_utf7(s):
103-
s_utf7 = s.encode('utf-7')
104-
return s_utf7[1:-1].replace(b'/', b',')
99+
def base64_utf7_encode(buffer):
100+
s = ''.join(buffer).encode('utf-16be')
101+
return binascii.b2a_base64(s).rstrip(b'\n=').replace(b'/', b',')
105102

106103

107-
def modified_deutf7(s):
104+
def base64_utf7_decode(s):
108105
s_utf7 = b'+' + s.replace(b',', b'/') + b'-'
109106
return s_utf7.decode('utf-7')

tests/test_imap_utf7.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ class IMAP4UTF7TestCase(unittest.TestCase):
2222
['~peter/mail/\u65e5\u672c\u8a9e/\u53f0\u5317',
2323
b'~peter/mail/&ZeVnLIqe-/&U,BTFw-'], # example from RFC 2060
2424
['\x00foo', b'&AAA-foo'],
25+
['foo\r\n\nbar\n', b'foo&AA0ACgAK-bar&AAo-'] # see imapclient/#187 issue
2526
]
2627

2728
def test_encode(self):

0 commit comments

Comments
 (0)