1818class Conll2012 (udapi .block .read .conllu .Conllu ):
1919 """A reader of the Conll2012 files."""
2020
21- def __init__ (self , attributes = 'docname,_,ord,form,_,_,_,_,_,_,_,_,coref' , ** kwargs ):
21+ def __init__ (self , attributes = 'docname,_,ord,form,_,_,_,_,_,_,_,_,coref' , emptyval = '_' , ** kwargs ):
2222 """Create the Conll2012 reader object.
2323
2424 Args:
@@ -29,10 +29,15 @@ def __init__(self, attributes='docname,_,ord,form,_,_,_,_,_,_,_,_,coref', **kwar
2929 word-order number/index (usually called ID).
3030 For Corref-PT-SemEval, use attributes='ord,form,_,_,_,_,coref'.
3131 For Summ-it++v2, use attributes='ord,form,_,_,_,_,_,_,coref'.
32+ For FantasyCoref, use attributes='docname,_,ord,form,_,_,_,_,_,_,_,coref'.
33+ emptyval: a symbol that represents an empty value, especially in the coref column
34+ (default='_' suitable for LitBank, Corref-PT-SemEval, and Summ-it++v2)
35+ For FantasyCoref, use emptyval='-'.
3236 """
3337 super ().__init__ (** kwargs )
3438 self .node_attributes = attributes .split (',' )
3539 self ._docname = 'd'
40+ self .emptyval = emptyval
3641
3742 def parse_comment_line (self , line , root ):
3843 if line .startswith ("#end document" ):
@@ -83,7 +88,7 @@ def parse_node_line(self, line, root, nodes):
8388 logging .warning (f"Mismatch: expected { node .ord = } , but found { int (value ) + 1 } { line = } " )
8489
8590 elif attribute_name == 'coref' :
86- if value and value != '_' :
91+ if value and value != self . emptyval :
8792 # LitBank always separates chunks by a vertical bar, e.g. (13)|10)
8893 # Summ-it++v2 does not, e.g. (13)10)
8994 if '|' in value :
0 commit comments