@@ -16,10 +16,16 @@ def get_items(location, encoding=None):
1616 Pass in a string or file-like object and get a list of Items present in the
1717 HTML document.
1818 """
19+ try :
20+ from urllib .request import urlopen
21+ except ImportError :
22+ from urllib import urlopen
23+
24+ print "Objects in location" , location
1925 dom_builder = html5lib .treebuilders .getTreeBuilder ("dom" )
2026 parser = html5lib .HTMLParser (tree = dom_builder )
21- tree = parser .parse (location , encoding = encoding )
22- return _find_items (tree )
27+ tree = parser .parse (urlopen ( location ) , encoding = encoding )
28+ return _find_items (tree , URI . get_domain ( location ) )
2329
2430
2531class Item (object ):
@@ -29,15 +35,15 @@ class Item(object):
2935 or another Item.
3036 """
3137
32- def __init__ (self , itemtype = None , itemid = None ):
38+ def __init__ (self , itemtype = None , itemid = None , url = "" ):
3339 """Create an Item, with an optional itemptype and/or itemid.
3440 """
3541 # itemtype can be a space delimited list
3642 if itemtype :
37- self .itemtype = [URI (i ) for i in itemtype .split (" " )]
43+ self .itemtype = [URI (i , domain = url ) for i in itemtype .split (" " )]
3844
3945 if itemid :
40- self .itemid = URI (itemid )
46+ self .itemid = URI (itemid , domain = url )
4147
4248 self .props = {}
4349
@@ -104,8 +110,13 @@ def json_dict(self):
104110
105111class URI (object ):
106112
107- def __init__ (self , string ):
108- self .string = string
113+ def __init__ (self , string , domain ):
114+ if string .startswith ("http://" ) or string .startswith ("https://" ):
115+ self .string = string
116+ else :
117+ self .string = "http://" + domain + string
118+
119+ print "URI created with string" , self .string
109120
110121 def __eq__ (self , other ):
111122 if isinstance (other , URI ):
@@ -115,6 +126,12 @@ def __eq__(self, other):
def __repr__(self):
    """Return the stored URI text itself, with no wrapper around it."""
    return self.string
117128
129+ @staticmethod
130+ def get_domain (url_string ):
131+ if "://" in url_string :
132+ return url_string .split ("/" )[2 ]
133+ else :
134+ return url_string .split ("/" )[0 ]
118135
119136# what follows are the guts of extracting the Items from a DOM
120137
@@ -134,23 +151,23 @@ def __repr__(self):
134151}
135152
136153
def _find_items(e, url=""):
    """Collect every Item found at or below the DOM node e.

    url is handed on unchanged to _make_item and _extract; get_items
    supplies the host of the fetched location for it.
    Returns a list of Item objects.
    """
    if not (_is_element(e) and _is_itemscope(e)):
        # Not an item root itself: gather whatever the children hold.
        found = []
        for node in e.childNodes:
            found.extend(_find_items(node, url=url))
        return found

    root = _make_item(e, url=url)
    # _extract fills in root and hands back the itemscope elements that
    # were not attached to it; each of those is searched in turn.
    leftovers = _extract(e, root, url=url)
    found = [root]
    for node in leftovers:
        found.extend(_find_items(node, url=url))
    return found
151168
152169
153- def _extract (e , item ):
170+ def _extract (e , item , url = "" ):
154171 # looks in a DOM element for microdata to assign to an Item
155172 # _extract returns a list of elements which appeared to have microdata
156173 # but which were not directly related to the Item that was passed in
@@ -160,19 +177,19 @@ def _extract(e, item):
160177 itemprop = _attr (child , "itemprop" )
161178 itemscope = _is_itemscope (child )
162179 if itemprop and itemscope :
163- nested_item = _make_item (child )
164- unlinked .extend (_extract (child , nested_item ))
180+ nested_item = _make_item (child , url = url )
181+ unlinked .extend (_extract (child , nested_item , url = url ))
165182 item .set (itemprop , nested_item )
166183 elif itemprop :
167184 value = _property_value (child )
168185 # itemprops may also be in a space delimited list
169186 for i in itemprop .split (" " ):
170187 item .set (i , value )
171- unlinked .extend (_extract (child , item ))
188+ unlinked .extend (_extract (child , item , url = url ))
172189 elif itemscope :
173190 unlinked .append (child )
174191 else :
175- unlinked .extend (_extract (child , item ))
192+ unlinked .extend (_extract (child , item , url = url ))
176193
177194 return unlinked
178195
@@ -216,20 +233,15 @@ def _text(e):
216233 return '' .join (chunks )
217234
218235
def _make_item(e, url=""):
    """Build an Item from the itemscope element e.

    Reads the element's itemtype and itemid attributes and passes url
    through to Item.  Raises Exception if e is not an itemscope.
    """
    if _is_itemscope(e):
        return Item(_attr(e, "itemtype"), _attr(e, "itemid"), url=url)
    raise Exception("element is not an Item")
225242
226243
227244if __name__ == "__main__" :
228- try :
229- from urllib .request import urlopen
230- except ImportError :
231- from urllib import urlopen
232-
233245 if len (sys .argv ) < 2 :
234246 print ("Usage: %s URL [...]" % sys .argv [0 ])
235247 sys .exit (1 )
@@ -240,7 +252,7 @@ def _make_item(e):
240252 microdata = {}
241253 microdata ['items' ] = items = []
242254
243- for item in get_items (urlopen ( url ) ):
255+ for item in get_items (url ):
244256 items .append (item .json_dict ())
245257
246258 print (json .dumps (microdata , indent = 2 ))
0 commit comments