1313import logging
1414from bisect import bisect_left
1515
16- # cd fr-litbank/brat/coref
17- # cat *.ann | grep '^T' | cut -f2 | cut -f1 -d " " | Sort
18- # 33085 PER
19- # 2518 FAC
20- # 1886 TIME
21- # 1046 LOC
22- # 1046 GPE
23- # 559 NO_PER
24- # 508 VEH
25- # 229 ORG
26- # 115 METALEPSE
27- # 85 HIST
28- # 12 X
29- # 12 TO_DISCUSS
30- # 1 OTHER
31- # 1 None
3216
3317class AddBratAnn (Block ):
3418
35- def __init__ (self , files , offset = 0 , detect_bom = True , keep_mention_id = True , coref_attr = "R" , zone = '' , ** kwargs ):
19+ def __init__ (self , files , zone = '' , offset = 0 , detect_bom = True , keep_mention_id = True ,
20+ coref_attr = "R" , no_type_value = '_Unsorted_' ,
21+ ** kwargs ):
3622 """Args:
3723 files: file names with the coreference annotations (*.ann)
3824 offset: what number to subtract from the character indices in the ann files
@@ -45,6 +31,7 @@ def __init__(self, files, offset=0, detect_bom=True, keep_mention_id=True, coref
4531 self .detect_bom = detect_bom
4632 self .keep_mention_id = keep_mention_id
4733 self .coref_attr = coref_attr
34+ self .no_type_value = no_type_value
4835
4936 def process_document (self , document ):
5037
@@ -71,14 +58,14 @@ def process_document(self, document):
7158 # Let's take the first and last number only.
7259 parts = type_and_range .split ()
7360 ne_type , range_s , range_e = parts [0 ], parts [1 ], parts [- 1 ]
74-
61+
7562 # If form ends with spaces, remove them and adjust range_e
7663 stripped_form = form .rstrip (" " )
7764 if form != stripped_form :
7865 num_spaces = len (form ) - len (stripped_form )
7966 form = stripped_form
8067 range_e = int (range_e ) - num_spaces
81-
68+
8269 mentions [mention_id ] = [ne_type , int (range_s ), int (range_e ), form ]
8370 if self .keep_mention_id :
8471 attrs .append (["mention_id" , mention_id , mention_id ])
@@ -97,19 +84,17 @@ def process_document(self, document):
9784 # Create entity objects for non-singletons.
9885 entity_map = {}
9986 for mention_ids in clusters :
100- #etype = mentions[mention_ids[0]][0]
10187 etype , etype_index = None , 0
102- #for m_id in mention_ids[1:]:
10388 for index , m_id in enumerate (mention_ids ):
104- if mentions [m_id ][0 ] == '_Unsorted_' :
89+ if mentions [m_id ][0 ] == self . no_type_value :
10590 pass
10691 elif etype is None :
10792 etype , etype_index = mentions [m_id ][0 ], index
10893 elif etype != mentions [m_id ][0 ]:
10994 logging .warning (f"Mention type mismatch { mention_ids [etype_index ]} :{ etype } != { m_id } :{ mentions [m_id ][0 ]} . Using the former." )
11095 if etype is None :
11196 etype = "other"
112- entity = document .create_coref_entity (etype = etype . lower () )
97+ entity = document .create_coref_entity (etype = etype )
11398 for m_id in mention_ids :
11499 if m_id in entity_map :
115100 logging .warning (f"Mention { m_id } already in Entity { entity_map [m_id ].eid } , not adding to { entity .eid } " )
@@ -196,8 +181,7 @@ def process_document(self, document):
196181
197182 # Create entities for singletons
198183 if mention_id not in entity_map :
199- etype = 'other' if ne_type == "_Unsorted_" else ne_type .lower ()
200- entity_map [mention_id ] = document .create_coref_entity (etype = etype )
184+ entity_map [mention_id ] = document .create_coref_entity (etype = ne_type )
201185
202186 # Create the Udapi mention object
203187 mention = entity_map [mention_id ].create_mention (words = mwords )
0 commit comments