Skip to content

Commit 8839a14

Browse files
committed
read.AddBratAnn for French LitBank
adapted from MiniCiep, WIP
1 parent a8f9681 commit 8839a14

1 file changed

Lines changed: 219 additions & 0 deletions

File tree

udapi/block/read/addbratann.py

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
"""Add Brat coreference annotation from *.ann files.
2+
3+
So far, tested on French LitBank data only.
4+
5+
T12 HIST 362 366 qui
6+
T13 HIST 349 362 une aventure
7+
R1431 Coreference Arg1:T12 Arg2:T13
8+
9+
"""
10+
11+
from udapi.core.block import Block
12+
from udapi.core.files import Files
13+
import logging
14+
from bisect import bisect_left
15+
16+
# cd fr-litbank/brat/coref
17+
# cat *.ann | grep '^T' | cut -f2 | cut -f1 -d " " | Sort
18+
# 33085 PER
19+
# 2518 FAC
20+
# 1886 TIME
21+
# 1046 LOC
22+
# 1046 GPE
23+
# 559 NO_PER
24+
# 508 VEH
25+
# 229 ORG
26+
# 115 METALEPSE
27+
# 85 HIST
28+
# 12 X
29+
# 12 TO_DISCUSS
30+
# 1 OTHER
31+
# 1 None
32+
33+
class AddBratAnn(Block):
    """Merge Brat stand-off coreference annotation (*.ann) into the Udapi document.

    Mention lines ("T...") are mapped to Udapi words via the character offsets
    stored in each token's misc["TokenRange"] (pre-filled by UDPipe), and
    coreference relation lines (prefix ``coref_attr``, "R" by default) are
    turned into Udapi coreference entities.
    """

    def __init__(self, files, offset=0, detect_bom=True, keep_mention_id=True, coref_attr="R", zone='', **kwargs):
        """Args:
        files: file names with the coreference annotations (*.ann)
        offset: what number to subtract from the character indices in the ann files
        detect_bom: if True and the current txt file starts with BOM (byte-order mark), add 1 to the offset
        keep_mention_id: if True, store the original Brat mention ID ("T13")
            in mention.other["mention_id"]
        coref_attr: line prefix in the *.ann file that marks coreference
            relation lines (Brat relations start with "R")
        zone: zone of the trees to process
        """
        super().__init__(**kwargs)
        self.zone = zone
        self.files = Files(filenames=files)
        self.offset = offset
        self.detect_bom = detect_bom
        self.keep_mention_id = keep_mention_id
        self.coref_attr = coref_attr

    def process_document(self, document):
        """Read the next *.ann file and add its coreference annotation to `document`."""

        # Read all the important info from the *.ann file.
        # mentions: Brat mention ID -> [ne_type, start_char, end_char, form]
        # attrs: [attr_name, mention_id, attr_value] triples to store in mention.other
        # split_ante: NOTE(review): initialized but never filled in this method
        #   (the final loop over it is therefore a no-op) — presumably WIP.
        # clusters: lists of mention IDs that corefer
        mentions, attrs, split_ante, clusters = {}, [], [], []
        ann_filehandle = self.files.next_filehandle()
        offset = self.offset
        if self.detect_bom:
            # The *.ann character indices may be off by one if the corresponding
            # *.txt file starts with a UTF-8 byte-order mark; detect and compensate.
            txt_filename = self.files.filename.replace("ann", "txt")
            with open(txt_filename, 'rb') as txt_fh:
                raw_bytes = txt_fh.read(3)
                if raw_bytes == b'\xef\xbb\xbf':
                    offset += 1

        for line in ann_filehandle:
            line = line.rstrip()
            if not "\t" in line:
                logging.warning(f"Unexpected line without tabs: {line}")
            elif line.startswith("T"):
                # Mention line, e.g. "T13<TAB>HIST 349 362<TAB>une aventure"
                try:
                    mention_id, type_and_range, form = line.split("\t")
                    # Usually range are two numbers, but can be more, e.g. type_and_range="Abstract 605 653;654 703"
                    # Let's take the first and last number only.
                    parts = type_and_range.split()
                    ne_type, range_s, range_e = parts[0], parts[1], parts[-1]

                    # If form ends with spaces, remove them and adjust range_e
                    # so the character range matches the trimmed form.
                    stripped_form = form.rstrip(" ")
                    if form != stripped_form:
                        num_spaces = len(form) - len(stripped_form)
                        form = stripped_form
                        range_e = int(range_e) - num_spaces

                    mentions[mention_id] = [ne_type, int(range_s), int(range_e), form]
                    if self.keep_mention_id:
                        attrs.append(["mention_id", mention_id, mention_id])
                except Exception as e:
                    logging.warning(f"Unexpected mention line: {line}\n{e}")
            elif line.startswith(self.coref_attr):
                # Relation line, e.g. "R1431<TAB>Coreference Arg1:T12 Arg2:T13"
                # NOTE(review): any line starting with coref_attr whose relation type
                # is not "Coreference" will fail this assert — confirm that the
                # French LitBank data contains no other relation types.
                cor_attr, mention_ids = line.split("\t")
                parts = mention_ids.split()
                assert(parts[0] == "Coreference")
                # Keep just the mention IDs, e.g. ["T12", "T13"].
                clusters.append([p.split(":")[1] for p in parts[1:]])
            elif line.startswith("#"):
                pass  # Let's ignore annotators' comments
            else:
                logging.warning(f"Unexpected line in {self.files.filename}:\n{line}")

        # Create entity objects for non-singletons.
        entity_map = {}
        for mention_ids in clusters:
            # Choose the entity type as the first non-"_Unsorted_" mention type
            # in the cluster; warn about any conflicting types later in the cluster.
            etype, etype_index = None, 0
            for index, m_id in enumerate(mention_ids):
                if mentions[m_id][0] == '_Unsorted_':
                    pass
                elif etype is None:
                    etype, etype_index = mentions[m_id][0], index
                elif etype != mentions[m_id][0]:
                    logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.")
            if etype is None:
                etype = "other"
            entity = document.create_coref_entity(etype=etype.lower())
            for m_id in mention_ids:
                if m_id in entity_map:
                    # A mention may appear in several relation lines; keep the first entity.
                    logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}")
                else:
                    entity_map[m_id] = entity

        # Collect TokenRange (as pre-filled by UDPipe) for each token.
        # starts/ends are parallel to tokens and sorted, so bisect can be used below.
        tokens, starts, ends = [], [], []
        for tree in document.trees:
            for token in tree.token_descendants:
                tokens.append(token)
                range_s, range_e = token.misc["TokenRange"].split(":")
                starts.append(int(range_s))
                ends.append(int(range_e))

        # Create mention objects.
        mention_map = {}
        for mention_id, mention_values in mentions.items():

            # Find Udapi tokens for each mention via binary search on the
            # (offset-corrected) character ranges.
            ne_type, range_s, range_e, form = mention_values
            index_s = bisect_left(starts, range_s - offset)
            # NOTE(review): if range_s - offset is greater than all token starts,
            # bisect_left returns len(starts) and starts[index_s] raises IndexError
            # before the index_s > 0 guard is evaluated — confirm this cannot happen
            # with well-formed data.
            if starts[index_s] != range_s - offset and index_s > 0:
                index_s -= 1
            index_e = bisect_left(ends, range_e - offset)
            mtokens = tokens[index_s : index_e+1]
            token_s, token_e = tokens[index_s], tokens[index_e]

            # Solve cases when the character range crosses Udapi (UDPipe-predicted) token boundaries.
            # If the start token is a multi-word token (MWT),
            # we can still try to find the proper word within the MWT.
            ok_s, ok_e = True, True
            if starts[index_s] != range_s - offset:
                ok_s = False
                if token_s.is_mwt():
                    # Replace the MWT by the suffix of its words that covers the
                    # mention start (walking the words right-to-left).
                    mtokens.pop(0)
                    first_form = form.split()[0]
                    new_start = ends[index_s]
                    for w in reversed(token_s.words):
                        mtokens = [w] + mtokens
                        new_start -= len(w.form)
                        if w.form == first_form or new_start < range_s - offset:
                            ok_s = True
                            break

            # similarly for the end token
            if ends[index_e] != range_e - offset:
                ok_e = False
                if token_e.is_mwt():
                    # Replace the MWT by the prefix of its words that covers the
                    # mention end (walking the words left-to-right).
                    mtokens.pop()
                    last_form = form.split()[-1]
                    new_end = starts[index_e]
                    for w in token_e.words:
                        mtokens.append(w)
                        new_end += len(w.form)
                        if w.form == last_form or new_end > range_e - offset:
                            ok_e = True
                            break

            if not ok_s or not ok_e:
                logging.warning(f"Mention {mention_id} range {range_s}-{offset}:{range_e}-{offset} ({form})"
                                f" crosses token boundaries: {token_s.misc} ({token_s.form}) "
                                f".. {token_e.misc} ({token_e.form})")

            # Project tokens (including MWTs) to words and check forms match.
            # The detokenized udapi_form (respecting no_space_after) should equal
            # the surface form recorded in the *.ann file; warn if not.
            words, udapi_form = [], ""
            for token in mtokens:
                words += token.words
                udapi_form += token.form
                if not token.no_space_after:
                    udapi_form += " "
            udapi_form = udapi_form.rstrip()
            if form != udapi_form:
                logging.warning(f"Mention {mention_id}: ann form '{form}' != Udapi form '{udapi_form}'")

            # Make sure all words of the mention are in the same sentence
            # (Udapi mentions cannot cross sentence boundaries).
            root = words[0].root
            mwords = [words[0]]
            for word in words[1:]:
                if word.root is root:
                    mwords.append(word)
                else:
                    logging.warning(f"Cross-sentence mention. Word {word} not in {root}, thus omitting from the mention.")

            # Create entities for singletons (mentions not covered by any relation line).
            if mention_id not in entity_map:
                etype = 'other' if ne_type == "_Unsorted_" else ne_type.lower()
                entity_map[mention_id] = document.create_coref_entity(etype=etype)

            # Create the Udapi mention object
            mention = entity_map[mention_id].create_mention(words=mwords)
            mention_map[mention_id] = mention

        # Fill-in the additional mention attributes.
        for attr_name, mention_id, attr_value in attrs:
            if mention_id in mention_map:
                mention_map[mention_id].other[attr_name] = attr_value

        # Fill-in split antecedents
        # NOTE(review): split_ante is never populated above, so this loop currently
        # does nothing — presumably kept for a future extension (WIP).
        for arg1, arg2 in split_ante:
            if arg1 in entity_map and arg2 in entity_map:
                if entity_map[arg1] in entity_map[arg2].split_ante:
                    logging.warning(f"Repeated SplitAnte: {arg1=} ({entity_map[arg1].eid}) {arg2=} ({entity_map[arg2].eid})")
                else:
                    entity_map[arg2].split_ante.append(entity_map[arg1])
            else:
                logging.warning(f"{arg1} or {arg2} not indexed in entity_map")

0 commit comments

Comments
 (0)