Skip to content

Commit dd51913

Browse files
committed
read.AddBratAnn should keep the etype
normalization of etype should be done in another dataset-specific block
1 parent 8839a14 commit dd51913

1 file changed

Lines changed: 9 additions & 25 deletions

File tree

udapi/block/read/addbratann.py

Lines changed: 9 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,12 @@
1313
import logging
1414
from bisect import bisect_left
1515

16-
# cd fr-litbank/brat/coref
17-
# cat *.ann | grep '^T' | cut -f2 | cut -f1 -d " " | Sort
18-
# 33085 PER
19-
# 2518 FAC
20-
# 1886 TIME
21-
# 1046 LOC
22-
# 1046 GPE
23-
# 559 NO_PER
24-
# 508 VEH
25-
# 229 ORG
26-
# 115 METALEPSE
27-
# 85 HIST
28-
# 12 X
29-
# 12 TO_DISCUSS
30-
# 1 OTHER
31-
# 1 None
3216

3317
class AddBratAnn(Block):
3418

35-
def __init__(self, files, offset=0, detect_bom=True, keep_mention_id=True, coref_attr="R", zone='', **kwargs):
19+
def __init__(self, files, zone='', offset=0, detect_bom=True, keep_mention_id=True,
20+
coref_attr="R", no_type_value='_Unsorted_',
21+
**kwargs):
3622
"""Args:
3723
files: file names with the coreference annotations (*.ann)
3824
offset: what number to subtract from the character indices in the ann files
@@ -45,6 +31,7 @@ def __init__(self, files, offset=0, detect_bom=True, keep_mention_id=True, coref
4531
self.detect_bom = detect_bom
4632
self.keep_mention_id = keep_mention_id
4733
self.coref_attr = coref_attr
34+
self.no_type_value = no_type_value
4835

4936
def process_document(self, document):
5037

@@ -71,14 +58,14 @@ def process_document(self, document):
7158
# Let's take the first and last number only.
7259
parts = type_and_range.split()
7360
ne_type, range_s, range_e = parts[0], parts[1], parts[-1]
74-
61+
7562
# If form ends with spaces, remove them and adjust range_e
7663
stripped_form = form.rstrip(" ")
7764
if form != stripped_form:
7865
num_spaces = len(form) - len(stripped_form)
7966
form = stripped_form
8067
range_e = int(range_e) - num_spaces
81-
68+
8269
mentions[mention_id] = [ne_type, int(range_s), int(range_e), form]
8370
if self.keep_mention_id:
8471
attrs.append(["mention_id", mention_id, mention_id])
@@ -97,19 +84,17 @@ def process_document(self, document):
9784
# Create entity objects for non-singletons.
9885
entity_map = {}
9986
for mention_ids in clusters:
100-
#etype = mentions[mention_ids[0]][0]
10187
etype, etype_index = None, 0
102-
#for m_id in mention_ids[1:]:
10388
for index, m_id in enumerate(mention_ids):
104-
if mentions[m_id][0] == '_Unsorted_':
89+
if mentions[m_id][0] == self.no_type_value:
10590
pass
10691
elif etype is None:
10792
etype, etype_index = mentions[m_id][0], index
10893
elif etype != mentions[m_id][0]:
10994
logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.")
11095
if etype is None:
11196
etype = "other"
112-
entity = document.create_coref_entity(etype=etype.lower())
97+
entity = document.create_coref_entity(etype=etype)
11398
for m_id in mention_ids:
11499
if m_id in entity_map:
115100
logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}")
@@ -196,8 +181,7 @@ def process_document(self, document):
196181

197182
# Create entities for singletons
198183
if mention_id not in entity_map:
199-
etype = 'other' if ne_type == "_Unsorted_" else ne_type.lower()
200-
entity_map[mention_id] = document.create_coref_entity(etype=etype)
184+
entity_map[mention_id] = document.create_coref_entity(etype=ne_type)
201185

202186
# Create the Udapi mention object
203187
mention = entity_map[mention_id].create_mention(words=mwords)

0 commit comments

Comments
 (0)