Skip to content

Commit 6fa47ab

Browse files
committed
FantasyCoref: parentheses inside a doc ID must be removed
1 parent 57b003c commit 6fa47ab

1 file changed

Lines changed: 6 additions & 1 deletion

File tree

udapi/block/read/conll2012.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def parse_comment_line(self, line, root):
4545
match = RE_BEGIN.match(line)
4646
if match:
4747
docname = match.group(1)
48-
# LitBank uses e.g.
48+
# LitBank and FantasyCoref use e.g.
4949
# #begin document (1023_bleak_house_brat); part 0
5050
if docname.startswith('(') and docname.endswith(');'):
5151
docname = docname[1:-2]
@@ -56,6 +56,9 @@ def parse_comment_line(self, line, root):
5656
# Corref-PT-SemEval uses e.g.
5757
# #begin document D1_C30_Folha_07-08-2007_09h19.txt.xml
5858
docname = docname.replace('.txt', '').replace('.xml', '')
59+
# FantasyCoref may use parentheses within the document ID e.g.
60+
# #begin document (051_Fundevogel_(Bird-foundling)); part 000
61+
docname = docname.replace('(', '').replace(')', '')
5962

6063
root.newdoc = docname
6164
self._global_entity = 'eid-etype-head-other'
@@ -77,6 +80,8 @@ def parse_node_line(self, line, root, nodes):
7780
for (n_attribute, attribute_name) in enumerate(self.node_attributes):
7881
value = fields[n_attribute]
7982
if attribute_name == 'docname':
83+
# FantasyCoref may use parentheses within the document ID
84+
value = value.replace('(', '').replace(')', '')
8085
if value != self._docname:
8186
logging.warning(f"Document name mismatch {value} != {self._docname}")
8287

0 commit comments

Comments (0)