Skip to content

Commit f3293db

Browse files
committed
Ignore <script> tag in content text
1 parent 502486e commit f3293db

3 files changed

Lines changed: 8 additions & 0 deletions

File tree

microdata.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,8 @@ def _text(e):
209209
chunks = []
210210
if e.nodeType == e.TEXT_NODE:
211211
chunks.append(e.data)
212+
elif e.tagName == 'script':
213+
return ''
212214
for child in e.childNodes:
213215
chunks.append(_text(child))
214216
return ''.join(chunks)

test-data/example.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
<span itemprop="streetAddress">
1616
20341 Whitworth Institute
1717
405 N. Whitworth
18+
<script>
19+
// Unrelated text
20+
</script>
1821
</span>
1922
<span itemprop="addressLocality">Seattle</span>,
2023
<span itemprop="addressRegion">WA</span>

test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ def test_parse(self):
4141
self.assertEqual(item.address.itemtype, [URI("http://schema.org/PostalAddress")])
4242
self.assertTrue(item.address.addressLocality, "Seattle")
4343

44+
# <script> tag should be ignored in the content text
45+
self.assertFalse("Unrelated text" in item.address.streetAddress)
46+
4447
# json
4548
i = json.loads(item.json())
4649
self.assertEqual(i["properties"]["name"][0], "Jane Doe")

0 commit comments

Comments
 (0)