Skip to content

Commit 57bacc8

Browse files
committed
read.Conll2012 adjusted to read FantasyCoref
- a different number of columns
- "-" as an empty value instead of "_"
1 parent 057d483 commit 57bacc8

1 file changed

Lines changed: 7 additions & 2 deletions

File tree

udapi/block/read/conll2012.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
class Conll2012(udapi.block.read.conllu.Conllu):
1919
"""A reader of the Conll2012 files."""
2020

21-
def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwargs):
21+
def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', emptyval='_', **kwargs):
2222
"""Create the Conll2012 reader object.
2323
2424
Args:
@@ -29,10 +29,15 @@ def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwar
2929
word-order number/index (usually called ID).
3030
For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'.
3131
For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'.
32+
For FantasyCoref, use attributes='docname,_,ord,form,_,_,_,_,_,_,_,coref'.
33+
emptyval: a symbol that represents an empty value, especially in the coref column
34+
(default='_' suitable for LitBank, Corref-PT-SemEval, and Summ-it++v2)
35+
For FantasyCoref, use emptyval='-'.
3236
"""
3337
super().__init__(**kwargs)
3438
self.node_attributes = attributes.split(',')
3539
self._docname = 'd'
40+
self.emptyval = emptyval
3641

3742
def parse_comment_line(self, line, root):
3843
if line.startswith("#end document"):
@@ -83,7 +88,7 @@ def parse_node_line(self, line, root, nodes):
8388
logging.warning(f"Mismatch: expected {node.ord=}, but found {int(value) + 1} {line=}")
8489

8590
elif attribute_name == 'coref':
86-
if value and value != '_':
91+
if value and value != self.emptyval:
8792
# LitBank always separates chunks by a vertical bar, e.g. (13)|10)
8893
# Summ-it++v2 does not, e.g. (13)10)
8994
if '|' in value:

0 commit comments

Comments
 (0)