@@ -3,6 +3,8 @@ import nodeUtil from "node:util";
33import { readFile } from "node:fs/promises" ;
44import { EventEmitter } from "node:events" ;
55import { Buffer } from "node:buffer" ;
6+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
7+ import { Readable } from "node:stream" ;
68
79import PDFJS from "./lib/pdf.js" ;
810import { ParserStream , StringifyStream } from "./lib/parserstream.js" ;
@@ -78,26 +80,34 @@ export default class PDFParser extends EventEmitter {
7880 }
7981
8082 static #maxBinBufferCount = 10 ;
83+ /** @type {Record<string, Buffer | null> } */
8184 static #binBuffer = { } ;
85+ static #instanceCounter = 0 ;
8286
8387 #password = "" ;
84- #context = null ; // service context object, only used in Web Service project; null in command line #pdfFilePath = null;
85- #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started #data = null;
86- #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache #PDFJS = null;
87- #data = null ; //if file read success, data is PDF content; if failed, data is "err" object #processFieldInfoXML = false;
88+ /** @type {import('./src/types/pdfparser.js').PDFParserContext|null } */
89+ #context = null ; // service context object, only used in Web Service project; null in command line
90+ /** @type {string|null } */
91+ #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started
92+ /** @type {number|null } */
93+ #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache
94+ /** @type {object|null } */
95+ #data = null ; //if file read success, data is PDF content; if failed, data is "err" object
96+ /** @type {import('./lib/pdf.js').default|null } */
8897 #PDFJS = null ; //will be initialized in constructor
8998 #processFieldInfoXML = false ; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
9099
91100 /**
92101 * PDFParser constructor.
93102 * @constructor PDFParser class.
94- * @param {object } context - The context object (only used in Web Service project); null in command line
103+ * @param {import('./src/types/pdfparser.js').PDFParserContext|null } context - The context object (only used in Web Service project); null in command line
95104 * @param {boolean } needRawText - Whether raw text is needed or not
96105 * @param {string } password - The password for PDF file
97106 * @info Private methods accessible using the [funcName].call(this, ...) syntax
98107 */
99108 constructor ( context , needRawText , password ) {
100109 super ( ) ;
110+ PDFParser . #instanceCounter++ ;
101111 this . #context = context ;
102112 this . #pdfFilePath = null ; //current PDF file to load and parse, null means loading/parsing not started this.#pdfFileMTime = null;
103113 this . #pdfFileMTime = null ; // last time the current pdf was modified, used to recognize changes and ignore cache this.#data = null;
@@ -109,20 +119,18 @@ export default class PDFParser extends EventEmitter {
109119 }
110120
111121 /**
112- * @private
113122 * @param {object } data - The parsed data
114123 */
115124 #onPDFJSParseDataReady( data ) {
116125 if ( ! data ) {
117126 nodeUtil . p2jinfo ( "PDF parsing completed." ) ;
118127 this . emit ( "pdfParser_dataReady" , this . #data) ;
119128 } else {
120- this . #data = { ...this . #data, ...data } ;
129+ this . #data = { ...( this . #data || { } ) , ...data } ;
121130 }
122131 }
123132
124133 /**
125- * @private
126134 * @param {Error } err - The error object
127135 */
128136 #onPDFJSParserDataError( err ) {
@@ -131,11 +139,16 @@ export default class PDFParser extends EventEmitter {
131139 }
132140
133141 /**
134- * @private
135- * @param {Buffer } buffer - The PDF buffer
142+ * @param {Buffer|null } buffer - The PDF buffer
136143 */
137- #startParsingPDF( buffer ) {
138- this . #data = { } ;
144+ #startParsingPDF( buffer = null ) {
145+ this . #data = null ;
146+
147+ if ( ! this . #PDFJS) {
148+ this . #onPDFJSParserDataError( new Error ( "PDFJS parser not initialized" ) ) ;
149+ return ;
150+ }
151+
139152 this . #PDFJS. on ( "pdfjs_parseDataReady" , ( data ) =>
140153 this . #onPDFJSParseDataReady( data )
141154 ) ;
@@ -155,7 +168,6 @@ export default class PDFParser extends EventEmitter {
155168 }
156169
157170 /**
158- * @private
159171 * @returns {boolean }
160172 */
161173 #processBinaryCache( ) {
@@ -166,7 +178,7 @@ export default class PDFParser extends EventEmitter {
166178
167179 const allKeys = Object . keys ( PDFParser . #binBuffer) ;
168180 if ( allKeys . length > PDFParser . #maxBinBufferCount) {
169- const idx = this . id % PDFParser . #maxBinBufferCount;
181+ const idx = PDFParser . #instanceCounter % PDFParser . #maxBinBufferCount;
170182 const key = allKeys [ idx ] ;
171183 PDFParser . #binBuffer[ key ] = null ;
172184 delete PDFParser . #binBuffer[ key ] ;
@@ -190,6 +202,9 @@ export default class PDFParser extends EventEmitter {
190202 * @returns {string } The binBufferKey
191203 */
192204 get binBufferKey ( ) {
205+ if ( this . #pdfFilePath === null || this . #pdfFileMTime === null ) {
206+ return "" ;
207+ }
193208 return this . #pdfFilePath + this . #pdfFileMTime;
194209 }
195210
@@ -205,6 +220,7 @@ export default class PDFParser extends EventEmitter {
205220 * Asynchronously load a PDF from a file path.
206221 * @param {string } pdfFilePath - Path of the PDF file
207222 * @param {number } verbosity - Verbosity level
223+ * @returns {Promise<void> } Promise that resolves when PDF is loaded
208224 */
209225 async loadPDF ( pdfFilePath , verbosity ) {
210226 nodeUtil . verbosity ( verbosity || 0 ) ;
@@ -214,7 +230,7 @@ export default class PDFParser extends EventEmitter {
214230
215231 try {
216232 this . #pdfFileMTime = fs . statSync ( pdfFilePath ) . mtimeMs ;
217- if ( this . #processFieldInfoXML) {
233+ if ( this . #processFieldInfoXML && this . #PDFJS ) {
218234 this . #PDFJS. tryLoadFieldInfoXML ( pdfFilePath ) ;
219235 }
220236
@@ -253,36 +269,36 @@ export default class PDFParser extends EventEmitter {
253269 * @returns {string } Raw text content
254270 */
255271 getRawTextContent ( ) {
256- return this . #PDFJS. getRawTextContent ( ) ;
272+ return this . #PDFJS? .getRawTextContent ( ) || "" ;
257273 }
258274
259275 /**
260276 * Retrieve raw text content stream.
261- * @returns {Stream } Raw text content stream
277+ * @returns {Readable } Raw text content stream
262278 */
263279 getRawTextContentStream ( ) {
264280 return ParserStream . createContentStream ( this . getRawTextContent ( ) ) ;
265281 }
266282
267283 /**
268284 * Retrieve all field types.
269- * @returns {object [] } All field types
285+ * @returns {import('./src/types/pdfparser.js').FieldType [] } All field types
270286 */
271287 getAllFieldsTypes ( ) {
272- return this . #PDFJS. getAllFieldsTypes ( ) ;
288+ return this . #PDFJS? .getAllFieldsTypes ( ) || [ ] ;
273289 }
274290
275291 /**
276- * Retrieve all field types .
277- * @returns {object [] } All field types
292+ * Retrieve all field data .
293+ * @returns {import('./src/types/pdfparser.js').FieldType [] } All field data
278294 */
279295 getAllFieldData ( ) {
280- return this . #PDFJS. getAllFieldData ( ) ;
296+ return this . #PDFJS? .getAllFieldData ( ) || [ ] ;
281297 }
282298
283299 /**
284300 * Retrieve all field types stream.
285- * @returns {Stream } All field types stream
301+ * @returns {Readable } All field types stream
286302 */
287303 getAllFieldsTypesStream ( ) {
288304 return ParserStream . createContentStream ( this . getAllFieldsTypes ( ) ) ;
@@ -293,12 +309,12 @@ export default class PDFParser extends EventEmitter {
293309 * @returns {object } Merged text blocks
294310 */
295311 getMergedTextBlocksIfNeeded ( ) {
296- return this . #PDFJS. getMergedTextBlocksIfNeeded ( ) ;
312+ return this . #PDFJS? .getMergedTextBlocksIfNeeded ( ) || { } ;
297313 }
298314
299315 /**
300316 * Retrieve merged text blocks stream.
301- * @returns {Stream } Merged text blocks stream
317+ * @returns {Readable } Merged text blocks stream
302318 */
303319 getMergedTextBlocksStream ( ) {
304320 return ParserStream . createContentStream ( this . getMergedTextBlocksIfNeeded ( ) ) ;
@@ -309,8 +325,9 @@ export default class PDFParser extends EventEmitter {
309325 * @param {boolean } needRawText - Whether raw text is needed or not
310326 */
311327 resetPDFJS ( needRawText ) {
312- this . #PDFJS. destroy ( ) ;
313- this . #PDFJS= new PDFJS ( needRawText ) ;
328+ this . #PDFJS?. destroy ( ) ;
329+ this . #PDFJS = new PDFJS ( needRawText ) ;
330+ PDFParser . #instanceCounter++ ;
314331 }
315332
316333 /**
@@ -322,7 +339,7 @@ export default class PDFParser extends EventEmitter {
322339
323340 //context object will be set in Web Service project, but not in command line utility
324341 if ( this . #context) {
325- this . #context. destroy ( ) ;
342+ this . #context. destroy ?. ( ) ;
326343 this . #context = null ;
327344 }
328345
@@ -331,7 +348,8 @@ export default class PDFParser extends EventEmitter {
331348 this . #data = null ;
332349 this . #processFieldInfoXML = false ; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
333350
334- this . #PDFJS. destroy ( ) ;
351+ this . #PDFJS? .destroy ( ) ;
335352 this . #PDFJS = null ;
353+ PDFParser . #instanceCounter-- ;
336354 }
337355}
0 commit comments