Skip to content

Commit 4f996fd

Browse files
committed
read.Text empty_line=keep
1 parent 839491c commit 4f996fd

1 file changed

Lines changed: 18 additions & 1 deletion

File tree

udapi/block/read/text.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,17 @@ class Text(BaseReader):
1616
so that `udpipe.Base` keeps these characters in `SpacesAfter`.
1717
As most blocks do not expect whitespace other than a space to appear
1818
in the processed text, using this feature is at your own risk.
19+
empty_line: how empty lines are handled. Default 'new_sentence' preserves
20+
the original behaviour (empty lines mark sentence boundaries). Use
21+
'keep' to read the entire file content into a single sentence (tree), including
22+
empty lines. Use 'newpar' to behave like 'new_sentence' but also set
23+
`root.newpar = True` on each sentence.
1924
"""
20-
def __init__(self, rstrip='\r\n ', **kwargs):
25+
def __init__(self, rstrip='\r\n ', empty_line='new_sentence', **kwargs):
26+
if empty_line not in {'new_sentence', 'keep', 'newpar'}:
27+
raise ValueError("empty_line must be 'new_sentence', 'keep' or 'newpar'")
2128
self.rstrip = rstrip
29+
self.empty_line = empty_line
2230
super().__init__(**kwargs)
2331

2432
@staticmethod
@@ -32,6 +40,13 @@ def is_multizone_reader():
3240
def read_tree(self, document=None):
3341
if self.filehandle is None:
3442
return None
43+
if self.empty_line == 'keep':
44+
content = self.filehandle.read()
45+
if content == '':
46+
return None
47+
root = Root()
48+
root.text = content
49+
return root
3550
lines = []
3651
line = None
3752
while True:
@@ -54,4 +69,6 @@ def read_tree(self, document=None):
5469

5570
root = Root()
5671
root.text = " ".join(lines)
72+
if self.empty_line == 'newpar':
73+
root.newpar = True
5774
return root

0 commit comments

Comments
 (0)