Skip to content

Commit 585832c

Browse files
committed
AddBratAnn: strip trailing spaces from mentions
The trailing spaces prevented correct offset matching.
1 parent dd51913 commit 585832c

1 file changed

Lines changed: 9 additions & 5 deletions

File tree

udapi/block/read/addbratann.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import logging
1414
from bisect import bisect_left
1515

16+
def _m(range_s, range_e, offset):
17+
return f"{range_s}-{offset}:{range_e}-{offset}" if offset else f"{range_s}:{range_e}"
1618

1719
class AddBratAnn(Block):
1820

@@ -47,7 +49,7 @@ def process_document(self, document):
4749
offset += 1
4850

4951
for line in ann_filehandle:
50-
line = line.rstrip()
52+
line = line.rstrip('\n')
5153
if not "\t" in line:
5254
logging.warning(f"Unexpected line without tabs: {line}")
5355
elif line.startswith("T"):
@@ -57,16 +59,18 @@ def process_document(self, document):
5759
# Usually the range is two numbers, but there can be more, e.g. type_and_range="Abstract 605 653;654 703"
5860
# Let's take the first and last number only.
5961
parts = type_and_range.split()
60-
ne_type, range_s, range_e = parts[0], parts[1], parts[-1]
62+
ne_type, range_s, range_e = parts[0], int(parts[1]), int(parts[-1])
6163

6264
# If form ends with spaces, remove them and adjust range_e
6365
stripped_form = form.rstrip(" ")
6466
if form != stripped_form:
6567
num_spaces = len(form) - len(stripped_form)
68+
logging.debug(f"Stripping {num_spaces} space{'s' if num_spaces>1 else ''} from {mention_id} '{form}' ({_m(range_s,range_e,offset)}->{range_e-num_spaces})")
6669
form = stripped_form
67-
range_e = int(range_e) - num_spaces
70+
range_e = range_e - num_spaces
6871

69-
mentions[mention_id] = [ne_type, int(range_s), int(range_e), form]
72+
73+
mentions[mention_id] = [ne_type, range_s, range_e, form]
7074
if self.keep_mention_id:
7175
attrs.append(["mention_id", mention_id, mention_id])
7276
except Exception as e:
@@ -155,7 +159,7 @@ def process_document(self, document):
155159
break
156160

157161
if not ok_s or not ok_e:
158-
logging.warning(f"Mention {mention_id} range {range_s}-{offset}:{range_e}-{offset} ({form})"
162+
logging.warning(f"Mention {mention_id} range {_m(range_s, range_e, offset)} ({form})"
159163
f" crosses token boundaries: {token_s.misc} ({token_s.form}) "
160164
f".. {token_e.misc} ({token_e.form})")
161165

0 commit comments

Comments
 (0)