We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 79bd1af · commit cd295de · Copy full SHA for cd295de
3 files changed
src/layoutparser/io/pdf.py
@@ -53,6 +53,10 @@ def extract_words_for_page(
53
)
54
55
df = pd.DataFrame(tokens)
56
+
57
+ if len(df) == 0:
58
+ return Layout()
59
60
df[["x0", "x1"]] = (
61
df[["x0", "x1"]].clip(lower=0, upper=int(page.width)).astype("float")
62
tests/fixtures/io/empty.pdf
17.8 KB
tests/test_io.py
@@ -78,4 +78,11 @@ def test_pdf():
78
assert attr_name in page_layout.page_data
79
80
assert len(set(ele.type for ele in page_layout)) == 3
81
- # Only three types of font show-up in the file
+ # Only three types of font show-up in the file
82
83
+def test_empty_pdf():
84
+ pdf_layout = load_pdf("tests/fixtures/io/empty.pdf")
85
+ assert len(pdf_layout) == 1 # Only one page
86
87
+ page_layout = pdf_layout[0]
88
+ assert len(page_layout) == 0 # No selectable tokens on the page
0 commit comments