"""Add Brat coreference annotation from *.ann files.

So far, tested on French LitBank data only.

T lines define mentions (entity type, character offsets and surface form)
and R lines define Coreference relations between pairs of mentions, e.g.:

T12 HIST 362 366 qui
T13 HIST 349 362 une aventure
R1431 Coreference Arg1:T12 Arg2:T13

"""
import logging
from bisect import bisect_left

from udapi.core.block import Block
from udapi.core.files import Files

# cd fr-litbank/brat/coref
# cat *.ann | grep '^T' | cut -f2 | cut -f1 -d " " | sort | uniq -c | sort -rn
# 33085 PER
# 2518 FAC
# 1886 TIME
# 1046 LOC
# 1046 GPE
# 559 NO_PER
# 508 VEH
# 229 ORG
# 115 METALEPSE
# 85 HIST
# 12 X
# 12 TO_DISCUSS
# 1 OTHER
# 1 None

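# A minimal usage sketch (hypothetical: the block path "corefud.AddBratAnn" and the
# file names depend on where this module is installed and on your data). The CoNLL-U
# input is assumed to carry TokenRange=start:end in MISC, as pre-filled by UDPipe:
#
#   udapy read.Conllu files=book.conllu \
#         corefud.AddBratAnn files=book.ann \
#         write.Conllu > book.coref.conllu
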
class AddBratAnn(Block):

    def __init__(self, files, offset=0, detect_bom=True, keep_mention_id=True, coref_attr="R", zone='', **kwargs):
        """Args:
        files: file names with the coreference annotations (*.ann)
        offset: what number to subtract from the character indices in the ann files
        detect_bom: if True and the current txt file starts with BOM (byte-order mark), add 1 to the offset
        keep_mention_id: if True, store the original Brat mention id (e.g. T13) in mention.other["mention_id"]
        coref_attr: prefix of the *.ann lines with coreference relations (default "R")
        """
        super().__init__(**kwargs)
        self.zone = zone
        self.files = Files(filenames=files)
        self.offset = offset
        self.detect_bom = detect_bom
        self.keep_mention_id = keep_mention_id
        self.coref_attr = coref_attr

    def process_document(self, document):

        # Read all the important info from the *.ann file.
        mentions, attrs, split_ante, clusters = {}, [], [], []
        ann_filehandle = self.files.next_filehandle()
        offset = self.offset
        if self.detect_bom:
            txt_filename = self.files.filename.replace("ann", "txt")
            with open(txt_filename, 'rb') as txt_fh:
                raw_bytes = txt_fh.read(3)
                if raw_bytes == b'\xef\xbb\xbf':
                    offset += 1
        for line in ann_filehandle:
            line = line.rstrip()
            if "\t" not in line:
                logging.warning(f"Unexpected line without tabs: {line}")
            elif line.startswith("T"):
                # T13 HIST 349 362 une aventure
                try:
                    mention_id, type_and_range, form = line.split("\t")
                    # Usually the range consists of two numbers, but there can be more,
                    # e.g. type_and_range="Abstract 605 653;654 703".
                    # Let's take the first and the last number only.
                    parts = type_and_range.split()
                    ne_type, range_s, range_e = parts[0], parts[1], parts[-1]

                    # If form ends with spaces, remove them and adjust range_e.
                    stripped_form = form.rstrip(" ")
                    if form != stripped_form:
                        num_spaces = len(form) - len(stripped_form)
                        form = stripped_form
                        range_e = int(range_e) - num_spaces

                    mentions[mention_id] = [ne_type, int(range_s), int(range_e), form]
                    if self.keep_mention_id:
                        attrs.append(["mention_id", mention_id, mention_id])
                except Exception as e:
                    logging.warning(f"Unexpected mention line: {line}\n{e}")
            elif line.startswith(self.coref_attr):
                # R1431 Coreference Arg1:T12 Arg2:T13
                cor_attr, mention_ids = line.split("\t")
                parts = mention_ids.split()
                assert parts[0] == "Coreference"
                clusters.append([p.split(":")[1] for p in parts[1:]])
            elif line.startswith("#"):
                pass  # Let's ignore annotators' comments.
            else:
                logging.warning(f"Unexpected line in {self.files.filename}:\n{line}")

        # Create entity objects for non-singletons.
        entity_map = {}
        for mention_ids in clusters:
            #etype = mentions[mention_ids[0]][0]
            etype, etype_index = None, 0
            #for m_id in mention_ids[1:]:
            for index, m_id in enumerate(mention_ids):
                if mentions[m_id][0] == '_Unsorted_':
                    pass  # The placeholder type should not determine the entity type.
                elif etype is None:
                    etype, etype_index = mentions[m_id][0], index
                elif etype != mentions[m_id][0]:
                    logging.warning(f"Mention type mismatch {mention_ids[etype_index]}:{etype} != {m_id}:{mentions[m_id][0]}. Using the former.")
            if etype is None:
                etype = "other"
            entity = document.create_coref_entity(etype=etype.lower())
            for m_id in mention_ids:
                if m_id in entity_map:
                    logging.warning(f"Mention {m_id} already in Entity {entity_map[m_id].eid}, not adding to {entity.eid}")
                else:
                    entity_map[m_id] = entity

        # Collect TokenRange (as pre-filled by UDPipe) for each token.
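        # Each TokenRange in MISC has the form "start:end" (character offsets into the
        # raw text, as the split below assumes). The parallel lists `starts` and `ends`
        # are kept in ascending document order, so the bisect lookups further down can
        # map the character offsets from the *.ann file onto tokens.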
        tokens, starts, ends = [], [], []
        for tree in document.trees:
            for token in tree.token_descendants:
                tokens.append(token)
                range_s, range_e = token.misc["TokenRange"].split(":")
                starts.append(int(range_s))
                ends.append(int(range_e))

        # Create mention objects.
        mention_map = {}
        for mention_id, mention_values in mentions.items():

            # Find Udapi tokens for each mention.
            ne_type, range_s, range_e, form = mention_values
            # index_s = the first token starting at or after the mention start;
            # if there is no exact match, step back to the token containing that offset.
            index_s = bisect_left(starts, range_s - offset)
            if starts[index_s] != range_s - offset and index_s > 0:
                index_s -= 1
            # index_e = the first token ending at or after the mention end.
            index_e = bisect_left(ends, range_e - offset)
            mtokens = tokens[index_s : index_e+1]
            token_s, token_e = tokens[index_s], tokens[index_e]

            # Solve cases when the character range crosses Udapi (UDPipe-predicted) token boundaries.
            # If the start token is a multi-word token (MWT),
            # we can still try to find the proper word within the MWT.
            ok_s, ok_e = True, True
            if starts[index_s] != range_s - offset:
                ok_s = False
                if token_s.is_mwt():
                    mtokens.pop(0)
                    first_form = form.split()[0]
                    new_start = ends[index_s]
                    for w in reversed(token_s.words):
                        mtokens = [w] + mtokens
                        new_start -= len(w.form)
                        if w.form == first_form or new_start < range_s - offset:
                            ok_s = True
                            break

            # Similarly for the end token.
            if ends[index_e] != range_e - offset:
                ok_e = False
                if token_e.is_mwt():
                    mtokens.pop()
                    last_form = form.split()[-1]
                    new_end = starts[index_e]
                    for w in token_e.words:
                        mtokens.append(w)
                        new_end += len(w.form)
                        if w.form == last_form or new_end > range_e - offset:
                            ok_e = True
                            break

            if not ok_s or not ok_e:
                logging.warning(f"Mention {mention_id} range {range_s}-{offset}:{range_e}-{offset} ({form})"
                                f" crosses token boundaries: {token_s.misc} ({token_s.form}) "
                                f".. {token_e.misc} ({token_e.form})")

            # Project tokens (including MWTs) to words and check forms match.
            words, udapi_form = [], ""
            for token in mtokens:
                words += token.words
                udapi_form += token.form
                if not token.no_space_after:
                    udapi_form += " "
            udapi_form = udapi_form.rstrip()
            if form != udapi_form:
                logging.warning(f"Mention {mention_id}: ann form '{form}' != Udapi form '{udapi_form}'")

            # Make sure all words of the mention are in the same sentence.
            root = words[0].root
            mwords = [words[0]]
            for word in words[1:]:
                if word.root is root:
                    mwords.append(word)
                else:
                    logging.warning(f"Cross-sentence mention. Word {word} is not in {root}, thus omitting it from the mention.")

            # Create entities for singletons.
            if mention_id not in entity_map:
                etype = 'other' if ne_type == "_Unsorted_" else ne_type.lower()
                entity_map[mention_id] = document.create_coref_entity(etype=etype)

            # Create the Udapi mention object.
            mention = entity_map[mention_id].create_mention(words=mwords)
            mention_map[mention_id] = mention

        # Fill in the additional mention attributes.
        for attr_name, mention_id, attr_value in attrs:
            if mention_id in mention_map:
                mention_map[mention_id].other[attr_name] = attr_value

        # Fill in split antecedents.
        for arg1, arg2 in split_ante:
            if arg1 in entity_map and arg2 in entity_map:
                if entity_map[arg1] in entity_map[arg2].split_ante:
                    logging.warning(f"Repeated SplitAnte: {arg1=} ({entity_map[arg1].eid}) {arg2=} ({entity_map[arg2].eid})")
                else:
                    entity_map[arg2].split_ante.append(entity_map[arg1])
            else:
                logging.warning(f"{arg1} or {arg2} not indexed in entity_map")