Skip to content

Commit 402d88b

Browse files
committed
handles string versus bytes correctly. probably not python2 compatible
1 parent 0aef61f commit 402d88b

2 files changed

Lines changed: 79 additions & 49 deletions

File tree

bencode/__init__.py

Lines changed: 66 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
"""bencode.py - bencode encoder + decoder."""
1414

15+
from collections import deque
16+
1517
from bencode.BTL import BTFailure
1618
from bencode.exceptions import BencodeDecodeError
1719

@@ -29,35 +31,56 @@
2931

3032
def decode_int(x, f):
3133
f += 1
32-
newf = x.index('e', f)
34+
newf = x.index(b'e', f)
3335
n = int(x[f:newf])
3436

35-
if x[f] == '-':
36-
if x[f + 1] == '0':
37+
if x[f : f+1] == b'-':
38+
if x[f+1 : f+2] == b'0':
3739
raise ValueError
38-
elif x[f] == '0' and newf != f + 1:
40+
elif x[f : f+1] == b'0' and newf != f + 1:
3941
raise ValueError
4042

4143
return n, newf + 1
4244

4345

44-
def decode_string(x, f):
45-
colon = x.index(':', f)
46+
def decode_string(x, f, try_decode_utf8=True, force_decode_utf8=False):
47+
"""
48+
decode torrent bencoded 'string' in x starting at f
49+
50+
An attempt is made to convert the string to a python string from utf-8.
51+
However, both string and non-string binary data are intermixed in the
52+
torrent bencoding standard. So we have to guess whether the byte
53+
sequence is a string or just binary data. We make this guess by trying
54+
to decode (from utf-8), and if that fails, assuming it is binary data.
55+
There are some instances where the data SHOULD be a string though.
56+
You can enforce this by setting force_decode_utf8 to True. If the
57+
decoding from utf-8 fails, a UnicodeDecodeError is raised. Similarly,
58+
if you know it should not be a string, you can skip the decoding
59+
attempt by setting try_decode_utf8=False.
60+
"""
61+
colon = x.index(b':', f)
4662
n = int(x[f:colon])
4763

48-
if x[f] == '0' and colon != f + 1:
64+
if x[f : f+1] == b'0' and colon != f + 1:
4965
raise ValueError
5066

5167
colon += 1
68+
s = x[colon:colon + n]
69+
if try_decode_utf8:
70+
try:
71+
s = s.decode('utf-8')
72+
except UnicodeDecodeError as e:
73+
if force_decode_utf8:
74+
raise
5275

53-
return x[colon:colon + n], colon + n
76+
return s, colon + n
5477

5578

5679
def decode_list(x, f):
5780
r, f = [], f + 1
5881

59-
while x[f] != 'e':
60-
v, f = decode_func[x[f]](x, f)
82+
while x[f : f+1] != b'e':
83+
v, f = decode_func[x[f : f+1]](x, f)
6184
r.append(v)
6285

6386
return r, f + 1
@@ -66,28 +89,28 @@ def decode_list(x, f):
6689
def decode_dict(x, f):
6790
r, f = {}, f + 1
6891

69-
while x[f] != 'e':
92+
while x[f : f+1] != b'e':
7093
k, f = decode_string(x, f)
71-
r[k], f = decode_func[x[f]](x, f)
94+
r[k], f = decode_func[x[f : f+1]](x, f)
7295

7396
return r, f + 1
7497

7598

7699
# noinspection PyDictCreation
77100
decode_func = {}
78-
decode_func['l'] = decode_list
79-
decode_func['d'] = decode_dict
80-
decode_func['i'] = decode_int
81-
decode_func['0'] = decode_string
82-
decode_func['1'] = decode_string
83-
decode_func['2'] = decode_string
84-
decode_func['3'] = decode_string
85-
decode_func['4'] = decode_string
86-
decode_func['5'] = decode_string
87-
decode_func['6'] = decode_string
88-
decode_func['7'] = decode_string
89-
decode_func['8'] = decode_string
90-
decode_func['9'] = decode_string
101+
decode_func[b'l'] = decode_list
102+
decode_func[b'd'] = decode_dict
103+
decode_func[b'i'] = decode_int
104+
decode_func[b'0'] = decode_string
105+
decode_func[b'1'] = decode_string
106+
decode_func[b'2'] = decode_string
107+
decode_func[b'3'] = decode_string
108+
decode_func[b'4'] = decode_string
109+
decode_func[b'5'] = decode_string
110+
decode_func[b'6'] = decode_string
111+
decode_func[b'7'] = decode_string
112+
decode_func[b'8'] = decode_string
113+
decode_func[b'9'] = decode_string
91114

92115

93116
def bdecode(value):
@@ -101,8 +124,8 @@ def bdecode(value):
101124
:rtype: object
102125
"""
103126
try:
104-
r, l = decode_func[value[0]](value, 0)
105-
except (IndexError, KeyError, ValueError):
127+
r, l = decode_func[value[0:1]](value, 0)
128+
except (IndexError, KeyError, TypeError, ValueError):
106129
raise BencodeDecodeError("not a valid bencoded string")
107130

108131
if l != len(value):
@@ -123,7 +146,7 @@ def encode_bencached(x, r):
123146

124147

125148
def encode_int(x, r):
126-
r.extend(('i', str(x), 'e'))
149+
r.extend((b'i', str(x).encode('utf-8'), b'e'))
127150

128151

129152
def encode_bool(x, r):
@@ -134,28 +157,34 @@ def encode_bool(x, r):
134157

135158

136159
def encode_string(x, r):
137-
r.extend((str(len(x)), ':', x))
160+
s = x.encode('utf-8')
161+
r.extend((str(len(s)).encode('utf-8'), b':', s))
162+
163+
164+
def encode_bytes(x, r):
165+
r.extend((str(len(x)).encode('utf-8'), b':', x))
138166

139167

140168
def encode_list(x, r):
141-
r.append('l')
169+
r.append(b'l')
142170

143171
for i in x:
144172
encode_func[type(i)](i, r)
145173

146-
r.append('e')
174+
r.append(b'e')
147175

148176

149177
def encode_dict(x, r):
150-
r.append('d')
178+
r.append(b'd')
151179
ilist = list(x.items())
152180
ilist.sort()
153181

154182
for k, v in ilist:
155-
r.extend((str(len(k)), ':', k))
183+
k = k.encode('utf-8')
184+
r.extend((str(len(k)).encode('utf-8'), b':', k))
156185
encode_func[type(v)](v, r)
157186

158-
r.append('e')
187+
r.append(b'e')
159188

160189

161190
# noinspection PyDictCreation
@@ -184,6 +213,7 @@ def encode_dict(x, r):
184213
encode_func[list] = encode_list
185214
encode_func[str] = encode_string
186215
encode_func[tuple] = encode_list
216+
encode_func[bytes] = encode_bytes
187217

188218

189219
def bencode(value):
@@ -196,9 +226,9 @@ def bencode(value):
196226
:return: Bencode formatted string
197227
:rtype: bytes
198228
"""
199-
r = []
229+
r = deque() # makes more sense for something with lots of appends
200230
encode_func[type(value)](value, r)
201-
return ''.join(r)
231+
return b''.join(r)
202232

203233

204234
# Method proxies (for compatibility with other libraries)

tests/bencode_tests.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,20 @@ class KnownValues(unittest.TestCase):
1717
"""
1818

1919
knownValues = (
20-
(0, 'i0e'),
21-
(1, 'i1e'),
22-
(10, 'i10e'),
23-
(42, 'i42e'),
24-
(-42, 'i-42e'),
25-
(True, 'i1e'),
26-
(False, 'i0e'),
27-
('spam', '4:spam'),
28-
('parrot sketch', '13:parrot sketch'),
29-
(['parrot sketch', 42], 'l13:parrot sketchi42ee'),
20+
(0, 'i0e'.encode('utf-8')),
21+
(1, 'i1e'.encode('utf-8')),
22+
(10, 'i10e'.encode('utf-8')),
23+
(42, 'i42e'.encode('utf-8')),
24+
(-42, 'i-42e'.encode('utf-8')),
25+
(True, 'i1e'.encode('utf-8')),
26+
(False, 'i0e'.encode('utf-8')),
27+
('spam', '4:spam'.encode('utf-8')),
28+
('parrot sketch', '13:parrot sketch'.encode('utf-8')),
29+
(['parrot sketch', 42], 'l13:parrot sketchi42ee'.encode('utf-8')),
3030
({
3131
'foo': 42,
3232
'bar': 'spam'
33-
}, 'd3:bar4:spam3:fooi42ee'),
33+
}, 'd3:bar4:spam3:fooi42ee'.encode('utf-8')),
3434
)
3535

3636
def testBencodeKnownValues(self):
@@ -89,11 +89,11 @@ def testSortedKeysForDicts(self):
8989
"""Ensure the keys of a dictionary are sorted before being encoded."""
9090
encoded = bencode({'zoo': 42, 'bar': 'spam'})
9191

92-
self.failUnless(encoded.index('zoo') > encoded.index('bar'))
92+
self.assertTrue(encoded.index(b'zoo') > encoded.index(b'bar'))
9393

9494
def testNestedDictionary(self):
9595
"""Test the handling of nested dicts."""
9696
self.assertEqual(
9797
bencode({'foo': 42, 'bar': {'sketch': 'parrot', 'foobar': 23}}),
98-
'd3:bard6:foobari23e6:sketch6:parrote3:fooi42ee'
98+
'd3:bard6:foobari23e6:sketch6:parrote3:fooi42ee'.encode('utf-8')
9999
)

0 commit comments

Comments
 (0)