Skip to content

Commit c3d3a37

Browse files
committed
Merge pull request #30 from hynek/unicode
Unicode all the things
2 parents 7bf9aa9 + 1788e91 commit c3d3a37

11 files changed

Lines changed: 224 additions & 167 deletions

File tree

doc2dash/parsers/pydoctor.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
from __future__ import absolute_import, division, print_function
22

3+
import codecs
34
import logging
45
import os
56

7+
import six
8+
69
from bs4 import BeautifulSoup
7-
from characteristic import attributes
10+
from characteristic import Attribute, attributes
811
from zope.interface import implementer
912

1013
from . import types
@@ -20,7 +23,7 @@
2023

2124

2225
@implementer(IParser)
23-
@attributes(["doc_path"])
26+
@attributes([Attribute("doc_path", instance_of=six.text_type)])
2427
class PyDoctorParser(object):
2528
"""
2629
Parser for pydoctor-based documentation: mainly Twisted.
@@ -40,23 +43,26 @@ def parse(self):
4043
yield `ParserEntry`s
4144
"""
4245
soup = BeautifulSoup(
43-
open(os.path.join(self.doc_path, 'nameIndex.html')),
46+
codecs.open(
47+
os.path.join(self.doc_path, 'nameIndex.html'),
48+
mode="r", encoding="utf-8",
49+
),
4450
'lxml'
4551
)
46-
for tag in soup.body.find_all('a'):
47-
path = tag.get('href')
48-
if path and not path.startswith('#'):
52+
for tag in soup.body.find_all(u'a'):
53+
path = tag.get(u'href')
54+
if path and not path.startswith(u'#'):
4955
name = tag.string
5056
yield ParserEntry(
5157
name=name,
5258
type=_guess_type(name, path),
53-
path=path
59+
path=six.text_type(path)
5460
)
5561

5662
def find_and_patch_entry(self, soup, entry):
57-
link = soup.find('a', attrs={'name': entry.anchor})
63+
link = soup.find(u'a', attrs={'name': entry.anchor})
5864
if link:
59-
tag = soup.new_tag('a')
65+
tag = soup.new_tag(u'a')
6066
tag['name'] = APPLE_REF_TEMPLATE.format(entry.type, entry.name)
6167
link.insert_before(tag)
6268
return True
@@ -68,9 +74,9 @@ def _guess_type(name, path):
6874
"""
6975
Employ voodoo magic to guess the type of *name* in *path*.
7076
"""
71-
if name.rsplit('.', 1)[-1][0].isupper() and '#' not in path:
77+
if name.rsplit(u'.', 1)[-1][0].isupper() and u'#' not in path:
7278
return types.CLASS
73-
elif name.islower() and '#' not in path:
79+
elif name.islower() and u'#' not in path:
7480
return types.PACKAGE
7581
else:
7682
return types.METHOD

doc2dash/parsers/sphinx.py

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
from __future__ import absolute_import, division, print_function
22

3+
import codecs
34
import errno
45
import logging
56
import os
67
import re
78

9+
import six
10+
811
from bs4 import BeautifulSoup
912
from characteristic import attributes
1013
from zope.interface import implementer
@@ -39,8 +42,11 @@ def parse(self):
3942
"""
4043
for idx in POSSIBLE_INDEXES:
4144
try:
42-
soup = BeautifulSoup(open(os.path.join(self.doc_path, idx)),
43-
'lxml')
45+
soup = BeautifulSoup(
46+
codecs.open(os.path.join(self.doc_path, idx),
47+
mode="r", encoding="utf-8"),
48+
'lxml'
49+
)
4450
break
4551
except IOError:
4652
pass
@@ -69,9 +75,9 @@ def _parse_soup(soup):
6975
continue
7076
type_, name = _get_type_and_name(dt.a.string)
7177
if name:
72-
href = dt.a['href']
78+
href = six.text_type(dt.a['href'])
7379
tmp_name = _url_to_name(href, type_)
74-
if not tmp_name.startswith('index-'):
80+
if not tmp_name.startswith(u'index-'):
7581
yield ParserEntry(name=tmp_name,
7682
type=type_,
7783
path=href)
@@ -83,7 +89,7 @@ def _parse_soup(soup):
8389
yield y
8490

8591

86-
RE_ANNO = re.compile(r'(.+) \(.*\)')
92+
RE_ANNO = re.compile(six.text_type(r'(.+) \(.*\)'))
8793

8894

8995
def _strip_annotation(text):
@@ -101,10 +107,10 @@ def _url_to_name(url, type_):
101107
"""
102108
Certain types have prefixes in names we have to strip before adding.
103109
"""
104-
if type_ == types.PACKAGE or type_ == types.CONSTANT and 'opcode-' in url:
105-
return url.split('#')[1][7:]
110+
if type_ == types.PACKAGE or type_ == types.CONSTANT and u'opcode-' in url:
111+
return url.split(u'#')[1][7:]
106112
else:
107-
return url.split('#')[1]
113+
return url.split(u'#')[1]
108114

109115

110116
def _process_dd(name, dd):
@@ -119,11 +125,11 @@ def _process_dd(name, dd):
119125
if type_:
120126
if type_ == _IN_MODULE:
121127
type_ = _guess_type_by_name(name)
122-
full_name = _url_to_name(dt.a['href'], type_)
123-
if not full_name.startswith('index-'):
128+
full_name = _url_to_name(six.text_type(dt.a[u'href']), type_)
129+
if not full_name.startswith(u'index-'):
124130
yield ParserEntry(name=full_name,
125131
type=type_,
126-
path=dt.a['href'])
132+
path=six.text_type(dt.a[u'href']))
127133

128134

129135
def _guess_type_by_name(name):

doc2dash/parsers/types.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
from __future__ import absolute_import, division, print_function
22

3-
ATTRIBUTE = "Attribute"
4-
CLASS = "Class"
5-
CONSTANT = "Constant"
6-
ENV = "Environment"
7-
EXCEPTION = "Exception"
8-
FUNCTION = "Function"
9-
INTERFACE = "Interface"
10-
MACRO = "Macro"
11-
METHOD = "Method"
12-
OPCODE = "Operator"
13-
OPTION = "Option"
14-
PACKAGE = "Module"
15-
TYPE = "Type"
16-
VALUE = "Value"
17-
VARIABLE = "Variable"
3+
4+
ATTRIBUTE = u"Attribute"
5+
CLASS = u"Class"
6+
CONSTANT = u"Constant"
7+
ENV = u"Environment"
8+
EXCEPTION = u"Exception"
9+
FUNCTION = u"Function"
10+
INTERFACE = u"Interface"
11+
MACRO = u"Macro"
12+
METHOD = u"Method"
13+
OPCODE = u"Operator"
14+
OPTION = u"Option"
15+
PACKAGE = u"Module"
16+
TYPE = u"Type"
17+
VALUE = u"Value"
18+
VARIABLE = u"Variable"

doc2dash/parsers/utils.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import absolute_import, division, print_function
22

3+
import codecs
34
import errno
45
import logging
56
import os
@@ -9,21 +10,21 @@
910

1011
import click
1112
import six
13+
import zope.interface
1214

1315
from bs4 import BeautifulSoup
14-
from characteristic import attributes
15-
from zope.interface import Attribute, Interface
16+
from characteristic import attributes, Attribute
1617

1718

1819
log = logging.getLogger(__name__)
1920

2021

21-
class IParser(Interface):
22+
class IParser(zope.interface.Interface):
2223
"""
2324
A doc2dash documentation parser.
2425
"""
25-
name = Attribute("Name of the parser")
26-
doc_path = Attribute(
26+
name = zope.interface.Attribute("Name of the parser")
27+
doc_path = zope.interface.Attribute(
2728
"The place to look for documentation for parsing and patching."
2829
)
2930

@@ -49,14 +50,22 @@ def find_and_patch_entry(soup, entry):
4950
"""
5051

5152

52-
@attributes(["name", "type", "anchor"])
53+
@attributes([
54+
Attribute("name", instance_of=six.text_type),
55+
Attribute("type", instance_of=six.text_type),
56+
Attribute("anchor", instance_of=six.text_type)
57+
])
5358
class TOCEntry(object):
5459
"""
5560
A symbol entry generated by the parser and to be added to the TOC.
5661
"""
5762

5863

59-
@attributes(["name", "type", "path"])
64+
@attributes([
65+
Attribute("name", instance_of=six.text_type),
66+
Attribute("type", instance_of=six.text_type),
67+
Attribute("path", instance_of=six.text_type)
68+
])
6069
class ParserEntry(object):
6170
"""
6271
A symbol as found by the parser that gets yielded for further processing.
@@ -84,7 +93,7 @@ def start(*args, **kwargs):
8493
return start
8594

8695

87-
APPLE_REF_TEMPLATE = '//apple_ref/cpp/{}/{}'
96+
APPLE_REF_TEMPLATE = u'//apple_ref/cpp/{}/{}'
8897

8998

9099
@coroutine
@@ -113,15 +122,15 @@ def patch_anchors(parser, show_progressbar):
113122
def patch_files(files):
114123
for fname, entries in files:
115124
full_path = os.path.join(parser.doc_path, fname)
116-
with open(full_path) as fp:
125+
with codecs.open(full_path, mode="r", encoding="utf-8") as fp:
117126
soup = BeautifulSoup(fp, 'lxml')
118127
for entry in entries:
119128
if not parser.find_and_patch_entry(soup, entry):
120129
log.debug("Can't find anchor {} in {}."
121130
.format(entry.anchor,
122131
click.format_filename(fname)))
123-
with open(full_path, 'w') as fp:
124-
fp.write(str(soup))
132+
with open(full_path, mode="wb") as fp:
133+
fp.write(soup.encode("utf-8"))
125134

126135
if show_progressbar is True:
127136
with click.progressbar(

docs/changelog.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,14 @@ Changelog
44
=========
55

66

7+
2.0.1 (UNRELEASED)
8+
------------------
9+
10+
- Better Unicode support.
11+
The move from ``unicode_literals`` to explicit prefixes broke some things that are fixed now.
12+
(`#29 <https://github.com/hynek/doc2dash/issues/29>`_, `#30 <https://github.com/hynek/doc2dash/issues/30>`_)
13+
14+
715
2.0.0 (2014-08-14)
816
------------------
917

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def run_tests(self):
7171
install_requires=[
7272
"Sphinx==1.2.2",
7373
"beautifulsoup4==4.3.2",
74-
"characteristic==0.1.0",
74+
"characteristic==14.1.0",
7575
"click==3.1",
7676
"colorama==0.3.1",
7777
"lxml==3.3.5",

tests/parsers/intersphinx/test_intersphinx.py

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import absolute_import, division, print_function
22

3+
import codecs
34
import os
45

56
from bs4 import BeautifulSoup
@@ -37,11 +38,11 @@ def test_inv_to_entries(self):
3738
"""
3839
result = list(
3940
_inv_to_entries({"py:method": {
40-
"some_method": (None, None, u"some_module.py", u"-"),
41+
u"some_method": (None, None, u"some_module.py", u"-"),
4142
}})
4243
)
4344
assert [ParserEntry(
44-
name='some_method', type='Method', path='some_module.py'
45+
name=u'some_method', type=u'Method', path=u'some_module.py'
4546
)] == result
4647

4748

@@ -50,37 +51,41 @@ def test_patch_method(self):
5051
"""
5152
Patching a method adds a TOC entry.
5253
"""
53-
soup = BeautifulSoup(open(os.path.join(HERE, 'function_example.html')))
54+
soup = BeautifulSoup(
55+
codecs.open(os.path.join(HERE, 'function_example.html'),
56+
mode="r", encoding="utf-8")
57+
)
5458
assert True is find_and_patch_entry(
5559
soup,
5660
TOCEntry(
57-
name='pyramid.config.Configurator.add_route',
58-
type='Method',
59-
anchor='pyramid.config.Configurator.add_route',
61+
name=u'pyramid.config.Configurator.add_route',
62+
type=u'Method',
63+
anchor=u'pyramid.config.Configurator.add_route',
6064
)
6165
)
6266
toc_link = soup(
63-
'a',
67+
u'a',
6468
attrs={
65-
'name': '//apple_ref/cpp/Method/pyramid.config.Configurator.'
66-
'add_route'
69+
u'name': u'//apple_ref/cpp/Method/pyramid.config.Configurator.'
70+
u'add_route'
6771
}
6872
)
6973
assert toc_link
7074

7175
def test_patch_modules(self):
7276
"""
73-
Patching a module adds the TOC entry into the next <h1>.
77+
Patching a module adds the TOC entry into the next <h1>. Non-ASCII
78+
works.
7479
"""
7580
soup = BeautifulSoup(
76-
"<h1>Some Module</h1>",
81+
u"<h1>Some Module</h1>",
7782
)
7883
assert True is find_and_patch_entry(
7984
soup,
8085
TOCEntry(
81-
name="some_module",
82-
type="Module",
83-
anchor="module-some_module",
86+
name=u"some_module",
87+
type=u"M\xc3\xb6dule",
88+
anchor=u"module-some_module",
8489
)
8590
)
8691
assert '<a name="//apple_ref' in str(soup)
@@ -89,12 +94,15 @@ def test_patch_fail(self):
8994
"""
9095
Return `False` if anchor can't be found
9196
"""
92-
soup = BeautifulSoup(open(os.path.join(HERE, 'function_example.html')))
97+
soup = BeautifulSoup(
98+
codecs.open(os.path.join(HERE, 'function_example.html'),
99+
mode="r", encoding="utf-8")
100+
)
93101
assert False is find_and_patch_entry(
94102
soup,
95103
TOCEntry(
96-
name="foo",
97-
type="Nothing",
98-
anchor="does-not-exist",
104+
name=u"foo",
105+
type=u"Nothing",
106+
anchor=u"does-not-exist",
99107
)
100108
)

0 commit comments

Comments
 (0)