Skip to content

Commit b9e3ff7

Browse files
committed
Batch 1 of adding the domain to parameters.
1 parent 500cf9c commit b9e3ff7

1 file changed

Lines changed: 37 additions & 25 deletions

File tree

microdata.py

Lines changed: 37 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,16 @@ def get_items(location, encoding=None):
1616
Pass in a string or file-like object and get a list of Items present in the
1717
HTML document.
1818
"""
19+
try:
20+
from urllib.request import urlopen
21+
except ImportError:
22+
from urllib import urlopen
23+
24+
print "Objects in location", location
1925
dom_builder = html5lib.treebuilders.getTreeBuilder("dom")
2026
parser = html5lib.HTMLParser(tree=dom_builder)
21-
tree = parser.parse(location, encoding=encoding)
22-
return _find_items(tree)
27+
tree = parser.parse(urlopen(location), encoding=encoding)
28+
return _find_items(tree, URI.get_domain(location))
2329

2430

2531
class Item(object):
@@ -29,15 +35,15 @@ class Item(object):
2935
or another Item.
3036
"""
3137

32-
def __init__(self, itemtype=None, itemid=None):
38+
def __init__(self, itemtype=None, itemid=None, url=""):
3339
"""Create an Item, with an optional itemtype and/or itemid.
3440
"""
3541
# itemtype can be a space delimited list
3642
if itemtype:
37-
self.itemtype = [URI(i) for i in itemtype.split(" ")]
43+
self.itemtype = [URI(i, domain=url) for i in itemtype.split(" ")]
3844

3945
if itemid:
40-
self.itemid = URI(itemid)
46+
self.itemid = URI(itemid, domain=url)
4147

4248
self.props = {}
4349

@@ -104,8 +110,13 @@ def json_dict(self):
104110

105111
class URI(object):
106112

107-
def __init__(self, string):
108-
self.string = string
113+
def __init__(self, string, domain):
114+
if string.startswith("http://") or string.startswith("https://"):
115+
self.string = string
116+
else:
117+
self.string = "http://" + domain + string
118+
119+
print "URI created with string", self.string
109120

110121
def __eq__(self, other):
111122
if isinstance(other, URI):
@@ -115,6 +126,12 @@ def __eq__(self, other):
115126
def __repr__(self):
116127
return self.string
117128

129+
@staticmethod
130+
def get_domain(url_string):
131+
if "://" in url_string:
132+
return url_string.split("/")[2]
133+
else:
134+
return url_string.split("/")[0]
118135

119136
# what follows are the guts of extracting the Items from a DOM
120137

@@ -134,23 +151,23 @@ def __repr__(self):
134151
}
135152

136153

137-
def _find_items(e):
154+
def _find_items(e, url=""):
138155
items = []
139156
unlinked = []
140157
if _is_element(e) and _is_itemscope(e):
141-
item = _make_item(e)
142-
unlinked = _extract(e, item)
158+
item = _make_item(e, url=url)
159+
unlinked = _extract(e, item, url=url)
143160
items.append(item)
144161
for unlinked_element in unlinked:
145-
items.extend(_find_items(unlinked_element))
162+
items.extend(_find_items(unlinked_element, url=url))
146163
else:
147164
for child in e.childNodes:
148-
items.extend(_find_items(child))
165+
items.extend(_find_items(child, url=url))
149166

150167
return items
151168

152169

153-
def _extract(e, item):
170+
def _extract(e, item, url=""):
154171
# looks in a DOM element for microdata to assign to an Item
155172
# _extract returns a list of elements which appeared to have microdata
156173
# but which were not directly related to the Item that was passed in
@@ -160,19 +177,19 @@ def _extract(e, item):
160177
itemprop = _attr(child, "itemprop")
161178
itemscope = _is_itemscope(child)
162179
if itemprop and itemscope:
163-
nested_item = _make_item(child)
164-
unlinked.extend(_extract(child, nested_item))
180+
nested_item = _make_item(child, url=url)
181+
unlinked.extend(_extract(child, nested_item, url=url))
165182
item.set(itemprop, nested_item)
166183
elif itemprop:
167184
value = _property_value(child)
168185
# itemprops may also be in a space delimited list
169186
for i in itemprop.split(" "):
170187
item.set(i, value)
171-
unlinked.extend(_extract(child, item))
188+
unlinked.extend(_extract(child, item, url=url))
172189
elif itemscope:
173190
unlinked.append(child)
174191
else:
175-
unlinked.extend(_extract(child, item))
192+
unlinked.extend(_extract(child, item, url=url))
176193

177194
return unlinked
178195

@@ -216,20 +233,15 @@ def _text(e):
216233
return ''.join(chunks)
217234

218235

219-
def _make_item(e):
236+
def _make_item(e, url=""):
220237
if not _is_itemscope(e):
221238
raise Exception("element is not an Item")
222239
itemtype = _attr(e, "itemtype")
223240
itemid = _attr(e, "itemid")
224-
return Item(itemtype, itemid)
241+
return Item(itemtype, itemid, url=url)
225242

226243

227244
if __name__ == "__main__":
228-
try:
229-
from urllib.request import urlopen
230-
except ImportError:
231-
from urllib import urlopen
232-
233245
if len(sys.argv) < 2:
234246
print("Usage: %s URL [...]" % sys.argv[0])
235247
sys.exit(1)
@@ -240,7 +252,7 @@ def _make_item(e):
240252
microdata = {}
241253
microdata['items'] = items = []
242254

243-
for item in get_items(urlopen(url)):
255+
for item in get_items(url):
244256
items.append(item.json_dict())
245257

246258
print(json.dumps(microdata, indent=2))

0 commit comments

Comments
 (0)