Skip to content

Commit 36e9fe6

Browse files
authored
refactor: add TypeScript types and improve null handling in PDFParser class. Need to address nodeUtil (#403)
1 parent 4b26552 commit 36e9fe6

2 files changed

Lines changed: 52 additions & 30 deletions

File tree

pdfparser.js

Lines changed: 47 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ import nodeUtil from "node:util";
33
import { readFile } from "node:fs/promises";
44
import { EventEmitter } from "node:events";
55
import { Buffer } from "node:buffer";
6+
// eslint-disable-next-line @typescript-eslint/no-unused-vars
7+
import { Readable } from "node:stream";
68

79
import PDFJS from "./lib/pdf.js";
810
import { ParserStream, StringifyStream } from "./lib/parserstream.js";
@@ -78,26 +80,34 @@ export default class PDFParser extends EventEmitter {
7880
}
7981

8082
static #maxBinBufferCount = 10;
83+
/** @type {Record<string, Buffer | null>} */
8184
static #binBuffer = {};
85+
static #instanceCounter = 0;
8286

8387
#password = "";
84-
#context = null; // service context object, only used in Web Service project; null in command line #pdfFilePath = null;
85-
#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started #data = null;
86-
#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache #PDFJS = null;
87-
#data = null; //if file read success, data is PDF content; if failed, data is "err" object #processFieldInfoXML = false;
88+
/** @type {import('./src/types/pdfparser.js').PDFParserContext|null} */
89+
#context = null; // service context object, only used in Web Service project; null in command line
90+
/** @type {string|null} */
91+
#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
92+
/** @type {number|null} */
93+
#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
94+
/** @type {object|null} */
95+
#data = null; //if file read success, data is PDF content; if failed, data is "err" object
96+
/** @type {import('./lib/pdf.js').default|null} */
8897
#PDFJS = null; //will be initialized in constructor
8998
#processFieldInfoXML = false; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
9099

91100
/**
92101
* PDFParser constructor.
93102
* @constructor PDFParser class.
94-
* @param {object} context - The context object (only used in Web Service project); null in command line
103+
* @param {import('./src/types/pdfparser.js').PDFParserContext|null} context - The context object (only used in Web Service project); null in command line
95104
* @param {boolean} needRawText - Whether raw text is needed or not
96105
* @param {string} password - The password for PDF file
97106
* @info Private methods accessible using the [funcName].call(this, ...) syntax
98107
*/
99108
constructor(context, needRawText, password) {
100109
super();
110+
PDFParser.#instanceCounter++;
101111
this.#context = context;
102112
this.#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
103113
this.#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache
@@ -109,20 +119,18 @@ export default class PDFParser extends EventEmitter {
109119
}
110120

111121
/**
112-
* @private
113122
* @param {object} data - The parsed data
114123
*/
115124
#onPDFJSParseDataReady(data) {
116125
if (!data) {
117126
nodeUtil.p2jinfo("PDF parsing completed.");
118127
this.emit("pdfParser_dataReady", this.#data);
119128
} else {
120-
this.#data = { ...this.#data, ...data };
129+
this.#data = { ...(this.#data || {}), ...data };
121130
}
122131
}
123132

124133
/**
125-
* @private
126134
* @param {Error} err - The error object
127135
*/
128136
#onPDFJSParserDataError(err) {
@@ -131,11 +139,16 @@ export default class PDFParser extends EventEmitter {
131139
}
132140

133141
/**
134-
* @private
135-
* @param {Buffer} buffer - The PDF buffer
142+
* @param {Buffer|null} buffer - The PDF buffer
136143
*/
137-
#startParsingPDF(buffer) {
138-
this.#data = {};
144+
#startParsingPDF(buffer = null) {
145+
this.#data = null;
146+
147+
if (!this.#PDFJS) {
148+
this.#onPDFJSParserDataError(new Error("PDFJS parser not initialized"));
149+
return;
150+
}
151+
139152
this.#PDFJS.on("pdfjs_parseDataReady", (data) =>
140153
this.#onPDFJSParseDataReady(data)
141154
);
@@ -155,7 +168,6 @@ export default class PDFParser extends EventEmitter {
155168
}
156169

157170
/**
158-
* @private
159171
* @returns {boolean}
160172
*/
161173
#processBinaryCache() {
@@ -166,7 +178,7 @@ export default class PDFParser extends EventEmitter {
166178

167179
const allKeys = Object.keys(PDFParser.#binBuffer);
168180
if (allKeys.length > PDFParser.#maxBinBufferCount) {
169-
const idx = this.id % PDFParser.#maxBinBufferCount;
181+
const idx = PDFParser.#instanceCounter % PDFParser.#maxBinBufferCount;
170182
const key = allKeys[idx];
171183
PDFParser.#binBuffer[key] = null;
172184
delete PDFParser.#binBuffer[key];
@@ -190,6 +202,9 @@ export default class PDFParser extends EventEmitter {
190202
* @returns {string} The binBufferKey
191203
*/
192204
get binBufferKey() {
205+
if (this.#pdfFilePath === null || this.#pdfFileMTime === null) {
206+
return "";
207+
}
193208
return this.#pdfFilePath + this.#pdfFileMTime;
194209
}
195210

@@ -205,6 +220,7 @@ export default class PDFParser extends EventEmitter {
205220
* Asynchronously load a PDF from a file path.
206221
* @param {string} pdfFilePath - Path of the PDF file
207222
* @param {number} verbosity - Verbosity level
223+
* @returns {Promise<void>} Promise that resolves when PDF is loaded
208224
*/
209225
async loadPDF(pdfFilePath, verbosity) {
210226
nodeUtil.verbosity(verbosity || 0);
@@ -214,7 +230,7 @@ export default class PDFParser extends EventEmitter {
214230

215231
try {
216232
this.#pdfFileMTime = fs.statSync(pdfFilePath).mtimeMs;
217-
if (this.#processFieldInfoXML) {
233+
if (this.#processFieldInfoXML && this.#PDFJS) {
218234
this.#PDFJS.tryLoadFieldInfoXML(pdfFilePath);
219235
}
220236

@@ -253,36 +269,36 @@ export default class PDFParser extends EventEmitter {
253269
* @returns {string} Raw text content
254270
*/
255271
getRawTextContent() {
256-
return this.#PDFJS.getRawTextContent();
272+
return this.#PDFJS?.getRawTextContent() || "";
257273
}
258274

259275
/**
260276
* Retrieve raw text content stream.
261-
* @returns {Stream} Raw text content stream
277+
* @returns {Readable} Raw text content stream
262278
*/
263279
getRawTextContentStream() {
264280
return ParserStream.createContentStream(this.getRawTextContent());
265281
}
266282

267283
/**
268284
* Retrieve all field types.
269-
* @returns {object[]} All field types
285+
* @returns {import('./src/types/pdfparser.js').FieldType[]} All field types
270286
*/
271287
getAllFieldsTypes() {
272-
return this.#PDFJS.getAllFieldsTypes();
288+
return this.#PDFJS?.getAllFieldsTypes() || [];
273289
}
274290

275291
/**
276-
* Retrieve all field types.
277-
* @returns {object[]} All field types
292+
* Retrieve all field data.
293+
* @returns {import('./src/types/pdfparser.js').FieldType[]} All field data
278294
*/
279295
getAllFieldData() {
280-
return this.#PDFJS.getAllFieldData();
296+
return this.#PDFJS?.getAllFieldData() || [];
281297
}
282298

283299
/**
284300
* Retrieve all field types stream.
285-
* @returns {Stream} All field types stream
301+
* @returns {Readable} All field types stream
286302
*/
287303
getAllFieldsTypesStream() {
288304
return ParserStream.createContentStream(this.getAllFieldsTypes());
@@ -293,12 +309,12 @@ export default class PDFParser extends EventEmitter {
293309
* @returns {object} Merged text blocks
294310
*/
295311
getMergedTextBlocksIfNeeded() {
296-
return this.#PDFJS.getMergedTextBlocksIfNeeded();
312+
return this.#PDFJS?.getMergedTextBlocksIfNeeded() || {};
297313
}
298314

299315
/**
300316
* Retrieve merged text blocks stream.
301-
* @returns {Stream} Merged text blocks stream
317+
* @returns {Readable} Merged text blocks stream
302318
*/
303319
getMergedTextBlocksStream() {
304320
return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded());
@@ -309,8 +325,9 @@ export default class PDFParser extends EventEmitter {
309325
* @param {boolean} needRawText - Whether raw text is needed or not
310326
*/
311327
resetPDFJS(needRawText){
312-
this.#PDFJS.destroy();
313-
this.#PDFJS=new PDFJS(needRawText);
328+
this.#PDFJS?.destroy();
329+
this.#PDFJS = new PDFJS(needRawText);
330+
PDFParser.#instanceCounter++;
314331
}
315332

316333
/**
@@ -322,7 +339,7 @@ export default class PDFParser extends EventEmitter {
322339

323340
//context object will be set in Web Service project, but not in command line utility
324341
if (this.#context) {
325-
this.#context.destroy();
342+
this.#context.destroy?.();
326343
this.#context = null;
327344
}
328345

@@ -331,7 +348,8 @@ export default class PDFParser extends EventEmitter {
331348
this.#data = null;
332349
this.#processFieldInfoXML = false; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
333350

334-
this.#PDFJS.destroy();
351+
this.#PDFJS?.destroy();
335352
this.#PDFJS = null;
353+
PDFParser.#instanceCounter--;
336354
}
337355
}

src/types/pdfparser.d.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ export declare class PDFParser extends EventEmitter {
3030
// eslint-disable-next-line @typescript-eslint/naming-convention
3131
static get _PARSER_SIG(): string;
3232

33-
constructor(context?: any, needRawText?: boolean, password?: string);
33+
constructor(context?: PDFParserContext | null, needRawText?: boolean, password?: string);
3434
on<K extends keyof EventMap>(eventName: K, listener: EventMap[K]): this;
3535

3636
readonly data: object | null;
@@ -172,4 +172,8 @@ export declare interface Box {
172172
style?: number;
173173
}
174174

175+
export interface PDFParserContext {
176+
destroy?(): void;
177+
}
178+
175179
export default PDFParser

0 commit comments

Comments
 (0)