@@ -16,9 +16,17 @@ class Text(BaseReader):
1616 so that `udpipe.Base` keeps these characters in `SpacesAfter`.
1717 As most blocks do not expect whitespace other than a space to appear
1818 in the processed text, using this feature is at your own risk.
19+ empty_line: how empty lines are handled. Default 'new_sentence' preserves
20+ the current behaviour (empty lines mark sentence boundaries). Use
21+ 'keep' to read the entire file content into a single sentence (tree), including
22+ empty lines. Use 'newpar' to behave like 'new_sentence' but also set
23+ `root.newpar = True` on each sentence.
1924 """
def __init__(self, rstrip='\r \n ', empty_line='new_sentence', **kwargs):
    """Configure the reader and validate the empty-line handling mode.

    Args:
        rstrip: value stored unchanged as ``self.rstrip``; presumably the
            characters stripped from the end of each line read (the code
            that uses it is not part of this method -- confirm there).
        empty_line: one of ``'new_sentence'``, ``'keep'`` or ``'newpar'``;
            stored unchanged as ``self.empty_line``.
        **kwargs: forwarded untouched to the parent class initializer.

    Raises:
        ValueError: if ``empty_line`` is not one of the three accepted modes.
    """
    # Reject an unknown mode up front, before any state is set on the
    # instance or the parent initializer runs.
    accepted_modes = {'new_sentence', 'keep', 'newpar'}
    if empty_line not in accepted_modes:
        raise ValueError("empty_line must be 'new_sentence', 'keep' or 'newpar'")

    self.empty_line = empty_line
    self.rstrip = rstrip
    super().__init__(**kwargs)
2331
2432 @staticmethod
@@ -32,6 +40,13 @@ def is_multizone_reader():
3240 def read_tree (self , document = None ):
3341 if self .filehandle is None :
3442 return None
43+ if self .empty_line == 'keep' :
44+ content = self .filehandle .read ()
45+ if content == '' :
46+ return None
47+ root = Root ()
48+ root .text = content
49+ return root
3550 lines = []
3651 line = None
3752 while True :
@@ -54,4 +69,6 @@ def read_tree(self, document=None):
5469
5570 root = Root ()
5671 root .text = " " .join (lines )
72+ if self .empty_line == 'newpar' :
73+ root .newpar = True
5774 return root
0 commit comments