ganze project

This commit is contained in:
2025-12-15 19:52:56 +01:00
commit bc9b07ca4e
521 changed files with 361138 additions and 0 deletions

96
node_modules/pdf-parse/dist/node/cjs/index.cjs generated vendored Normal file
View File

@@ -0,0 +1,96 @@
"use strict";
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __export = (target, all) => {
for (var name in all)
__defProp(target, name, { get: all[name], enumerable: true });
};
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") {
for (let key of __getOwnPropNames(from))
if (!__hasOwnProp.call(to, key) && key !== except)
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
}
return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
// If the importer is in node compatibility mode or this is not an ESM
// file that has been converted to a CommonJS file using a Babel-
// compatible transform (i.e. "__esModule" has not been set), then set
// "default" to the CommonJS "module.exports" for node compatibility.
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
mod
));
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// src/node/index.ts
// Export table for this bundle: each public name is registered as a lazy
// getter, so `getHeader` (a function declaration further down, hoisted) can be
// referenced here before its definition appears in the file.
var index_exports = {};
__export(index_exports, {
getHeader: () => getHeader
});
// Publish the table as the CommonJS exports object (marked with "__esModule").
module.exports = __toCommonJS(index_exports);
// src/node/getHeader.ts
var http = __toESM(require("node:http"), 1);
var https = __toESM(require("node:https"), 1);
/**
 * Send one HTTP or HTTPS request without a body and buffer the whole response.
 *
 * @param u - Target URL; `https:` selects the https module, anything else uses http.
 * @param method - HTTP method to send (this file uses "HEAD" and "GET").
 * @param headers - Optional request headers.
 * @returns Promise resolving to `{ status, headers, buffer }`: `status` is the
 * response status code (0 when Node reports none), `headers` has multi-valued
 * headers joined with "," and missing values as "", and `buffer` is the
 * concatenated body or undefined when no body data arrived. The promise rejects
 * when the request or the response stream reports an error.
 */
async function nodeRequest(u, method, headers) {
  return new Promise((resolve, reject) => {
    const reqFn = u.protocol === "https:" ? https.request : http.request;
    const req = reqFn(u, { method, headers }, (res) => {
      const headersObj = {};
      for (const [k, v] of Object.entries(res.headers)) {
        headersObj[k] = Array.isArray(v) ? v.join(",") : v ?? "";
      }
      const chunks = [];
      res.on("data", (c) => chunks.push(Buffer.from(c)));
      // A connection that drops after the headers arrived is reported as an
      // 'error' on the response object and 'end' never fires; without this
      // listener the promise would stay pending forever.
      res.on("error", (err) => reject(err));
      res.on("end", () => {
        const buffer = chunks.length ? Buffer.concat(chunks) : void 0;
        resolve({ status: res.statusCode ?? 0, headers: headersObj, buffer });
      });
    });
    req.on("error", (err) => reject(err));
    req.end();
  });
}
async function getHeader(url, check = false) {
try {
const u = typeof url === "string" ? new URL(url) : url;
const headResp = await nodeRequest(u, "HEAD");
const size = headResp.headers["content-length"] ? parseInt(headResp.headers["content-length"], 10) : void 0;
let magic = null;
if (check) {
const rangeResp = await nodeRequest(u, "GET", { Range: "bytes=0-4" });
if (rangeResp.status >= 200 && rangeResp.status < 300 && rangeResp.buffer) {
const headerStr = rangeResp.buffer.slice(0, 4).toString("utf8");
magic = headerStr.startsWith("%PDF");
} else {
magic = false;
}
}
return {
ok: headResp.status >= 200 && headResp.status < 300,
status: headResp.status,
size,
magic,
headers: headResp.headers
};
} catch (error) {
return {
ok: false,
status: void 0,
size: void 0,
magic: false,
headers: {},
error: new Error(String(error))
};
}
}
// Annotate the CommonJS export names for ESM import in node:
// The leading `0 &&` short-circuits, so this assignment never executes; the
// statement exists only so the export names appear literally in the source.
0 && (module.exports = {
getHeader
});
//# sourceMappingURL=index.cjs.map

7
node_modules/pdf-parse/dist/node/cjs/index.cjs.map generated vendored Normal file
View File

@@ -0,0 +1,7 @@
{
"version": 3,
"sources": ["../../../src/node/index.ts", "../../../src/node/getHeader.ts"],
"sourcesContent": ["export type * from './getHeader.js';\nexport * from './getHeader.js';\n", "import * as http from 'node:http';\nimport * as https from 'node:https';\n\n/**\n * Result information from getHeader.\n * @public\n */\nexport interface HeaderResult {\n\tok: boolean;\n\tstatus?: number;\n\tsize?: number;\n\tmagic: boolean | null;\n\theaders?: Record<string, string>;\n\terror?: Error;\n}\n\ninterface RequestResult {\n\tstatus: number;\n\theaders: Record<string, string>;\n\tbuffer?: Buffer;\n}\n\nasync function nodeRequest(u: URL, method: string, headers?: Record<string, string>): Promise<RequestResult> {\n\treturn new Promise((resolve, reject) => {\n\t\tconst reqFn = u.protocol === 'https:' ? https.request : http.request;\n\t\tconst req = reqFn(u, { method, headers }, (res) => {\n\t\t\tconst headersObj: Record<string, string> = {};\n\t\t\tfor (const [k, v] of Object.entries(res.headers)) {\n\t\t\t\theadersObj[k] = Array.isArray(v) ? v.join(',') : (v ?? '');\n\t\t\t}\n\n\t\t\tconst chunks: Buffer[] = [];\n\t\t\tres.on('data', (c) => chunks.push(Buffer.from(c)));\n\t\t\tres.on('end', () => {\n\t\t\t\tconst buffer = chunks.length ? Buffer.concat(chunks) : undefined;\n\t\t\t\tresolve({ status: res.statusCode ?? 0, headers: headersObj, buffer });\n\t\t\t});\n\t\t});\n\n\t\treq.on('error', (err) => reject(err));\n\t\treq.end();\n\t});\n}\n\n/**\n * Perform an HTTP HEAD request to retrieve the file size and verify existence;\n * when `check` is true, fetch a small range and inspect the magic number to confirm the URL points to a valid PDF.\n * If the server does not support range requests, `isPdf` will be set to `false`.\n * @param url - The URL of the PDF file to check. Can be a string or URL object.\n * @param check - When `true`, download a small byte range (first 4 bytes) to validate the file signature by checking for '%PDF' magic bytes. 
Default: `false`.\n * @returns - A Promise that resolves to a HeaderResult object containing the response status, size, headers, and PDF validation result.\n * @public\n */\nexport async function getHeader(url: string | URL, check: boolean = false): Promise<HeaderResult> {\n\ttry {\n\t\tconst u = typeof url === 'string' ? new URL(url) : url;\n\n\t\tconst headResp = await nodeRequest(u, 'HEAD');\n\t\tconst size = headResp.headers['content-length'] ? parseInt(headResp.headers['content-length'], 10) : undefined;\n\n\t\tlet magic: boolean | null = null;\n\t\tif (check) {\n\t\t\tconst rangeResp = await nodeRequest(u, 'GET', { Range: 'bytes=0-4' });\n\t\t\tif (rangeResp.status >= 200 && rangeResp.status < 300 && rangeResp.buffer) {\n\t\t\t\tconst headerStr = rangeResp.buffer.slice(0, 4).toString('utf8');\n\t\t\t\tmagic = headerStr.startsWith('%PDF');\n\t\t\t} else {\n\t\t\t\tmagic = false;\n\t\t\t}\n\t\t}\n\n\t\treturn {\n\t\t\tok: headResp.status >= 200 && headResp.status < 300,\n\t\t\tstatus: headResp.status,\n\t\t\tsize,\n\t\t\tmagic,\n\t\t\theaders: headResp.headers,\n\t\t};\n\t} catch (error) {\n\t\treturn {\n\t\t\tok: false,\n\t\t\tstatus: undefined,\n\t\t\tsize: undefined,\n\t\t\tmagic: false,\n\t\t\theaders: {},\n\t\t\terror: new Error(String(error)),\n\t\t};\n\t}\n}\n"],
"mappings": ";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACAA,WAAsB;AACtB,YAAuB;AAqBvB,eAAe,YAAY,GAAQ,QAAgB,SAA0D;AAC5G,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACvC,UAAM,QAAQ,EAAE,aAAa,WAAiB,gBAAe;AAC7D,UAAM,MAAM,MAAM,GAAG,EAAE,QAAQ,QAAQ,GAAG,CAAC,QAAQ;AAClD,YAAM,aAAqC,CAAC;AAC5C,iBAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,IAAI,OAAO,GAAG;AACjD,mBAAW,CAAC,IAAI,MAAM,QAAQ,CAAC,IAAI,EAAE,KAAK,GAAG,IAAK,KAAK;AAAA,MACxD;AAEA,YAAM,SAAmB,CAAC;AAC1B,UAAI,GAAG,QAAQ,CAAC,MAAM,OAAO,KAAK,OAAO,KAAK,CAAC,CAAC,CAAC;AACjD,UAAI,GAAG,OAAO,MAAM;AACnB,cAAM,SAAS,OAAO,SAAS,OAAO,OAAO,MAAM,IAAI;AACvD,gBAAQ,EAAE,QAAQ,IAAI,cAAc,GAAG,SAAS,YAAY,OAAO,CAAC;AAAA,MACrE,CAAC;AAAA,IACF,CAAC;AAED,QAAI,GAAG,SAAS,CAAC,QAAQ,OAAO,GAAG,CAAC;AACpC,QAAI,IAAI;AAAA,EACT,CAAC;AACF;AAWA,eAAsB,UAAU,KAAmB,QAAiB,OAA8B;AACjG,MAAI;AACH,UAAM,IAAI,OAAO,QAAQ,WAAW,IAAI,IAAI,GAAG,IAAI;AAEnD,UAAM,WAAW,MAAM,YAAY,GAAG,MAAM;AAC5C,UAAM,OAAO,SAAS,QAAQ,gBAAgB,IAAI,SAAS,SAAS,QAAQ,gBAAgB,GAAG,EAAE,IAAI;AAErG,QAAI,QAAwB;AAC5B,QAAI,OAAO;AACV,YAAM,YAAY,MAAM,YAAY,GAAG,OAAO,EAAE,OAAO,YAAY,CAAC;AACpE,UAAI,UAAU,UAAU,OAAO,UAAU,SAAS,OAAO,UAAU,QAAQ;AAC1E,cAAM,YAAY,UAAU,OAAO,MAAM,GAAG,CAAC,EAAE,SAAS,MAAM;AAC9D,gBAAQ,UAAU,WAAW,MAAM;AAAA,MACpC,OAAO;AACN,gBAAQ;AAAA,MACT;AAAA,IACD;AAEA,WAAO;AAAA,MACN,IAAI,SAAS,UAAU,OAAO,SAAS,SAAS;AAAA,MAChD,QAAQ,SAAS;AAAA,MACjB;AAAA,MACA;AAAA,MACA,SAAS,SAAS;AAAA,IACnB;AAAA,EACD,SAAS,OAAO;AACf,WAAO;AAAA,MACN,IAAI;AAAA,MACJ,QAAQ;AAAA,MACR,MAAM;AAAA,MACN,OAAO;AAAA,MACP,SAAS,CAAC;AAAA,MACV,OAAO,IAAI,MAAM,OAAO,KAAK,CAAC;AAAA,IAC/B;AAAA,EACD;AACD;",
"names": []
}

25
node_modules/pdf-parse/dist/node/cjs/index.d.cts generated vendored Normal file
View File

@@ -0,0 +1,25 @@
/**
* Perform an HTTP HEAD request to retrieve the file size and verify existence;
* when `check` is true, fetch a small range and inspect the magic number to confirm the URL points to a valid PDF.
* If the range request does not return a successful (2xx) response with a body, `magic` will be set to `false`.
* @param url - The URL of the PDF file to check. Can be a string or URL object.
* @param check - When `true`, download a small byte range to validate the file signature by checking that the first 4 bytes are the '%PDF' magic bytes. Default: `false`.
* @returns - A Promise that resolves to a HeaderResult object containing the response status, size, headers, and PDF validation result.
* @public
*/
export declare function getHeader(url: string | URL, check?: boolean): Promise<HeaderResult>;
/**
* Result information from getHeader.
* @public
*/
export declare interface HeaderResult {
/** `true` when the HEAD request returned a 2xx status. */
ok: boolean;
/** Status code of the HEAD response; undefined when the request failed. */
status?: number;
/** `content-length` of the HEAD response parsed as an integer, when the header is present. */
size?: number;
/** `null` when no signature check was requested; `true` when the fetched bytes start with `%PDF`; `false` otherwise (including on errors). */
magic: boolean | null;
/** Headers of the HEAD response; empty when the request failed. */
headers?: Record<string, string>;
/** Set when getHeader caught an exception while performing the requests. */
error?: Error;
}
export { }

23
node_modules/pdf-parse/dist/node/esm/getHeader.d.ts generated vendored Normal file
View File

@@ -0,0 +1,23 @@
/**
* Result information from getHeader.
* @public
*/
export interface HeaderResult {
/** `true` when the HEAD request returned a 2xx status. */
ok: boolean;
/** Status code of the HEAD response; undefined when the request failed. */
status?: number;
/** `content-length` of the HEAD response parsed as an integer, when the header is present. */
size?: number;
/** `null` when no signature check was requested; `true` when the fetched bytes start with `%PDF`; `false` otherwise (including on errors). */
magic: boolean | null;
/** Headers of the HEAD response; empty when the request failed. */
headers?: Record<string, string>;
/** Set when getHeader caught an exception while performing the requests. */
error?: Error;
}
/**
* Perform an HTTP HEAD request to retrieve the file size and verify existence;
* when `check` is true, fetch a small range and inspect the magic number to confirm the URL points to a valid PDF.
* If the range request does not return a successful (2xx) response with a body, `magic` will be set to `false`.
* @param url - The URL of the PDF file to check. Can be a string or URL object.
* @param check - When `true`, download a small byte range to validate the file signature by checking that the first 4 bytes are the '%PDF' magic bytes. Default: `false`.
* @returns - A Promise that resolves to a HeaderResult object containing the response status, size, headers, and PDF validation result.
* @public
*/
export declare function getHeader(url: string | URL, check?: boolean): Promise<HeaderResult>;
//# sourceMappingURL=getHeader.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"getHeader.d.ts","sourceRoot":"","sources":["../../../src/node/getHeader.ts"],"names":[],"mappings":"AAGA;;;GAGG;AACH,MAAM,WAAW,YAAY;IAC5B,EAAE,EAAE,OAAO,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,OAAO,GAAG,IAAI,CAAC;IACtB,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACjC,KAAK,CAAC,EAAE,KAAK,CAAC;CACd;AA8BD;;;;;;;;GAQG;AACH,wBAAsB,SAAS,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,EAAE,KAAK,GAAE,OAAe,GAAG,OAAO,CAAC,YAAY,CAAC,CAmChG"}

66
node_modules/pdf-parse/dist/node/esm/getHeader.js generated vendored Normal file
View File

@@ -0,0 +1,66 @@
import * as http from 'node:http';
import * as https from 'node:https';
/**
 * Send one HTTP or HTTPS request without a body and buffer the whole response.
 *
 * @param u - Target URL; `https:` selects the https module, anything else uses http.
 * @param method - HTTP method to send (this file uses 'HEAD' and 'GET').
 * @param headers - Optional request headers.
 * @returns Promise resolving to `{ status, headers, buffer }`: `status` is the
 * response status code (0 when Node reports none), `headers` has multi-valued
 * headers joined with ',' and missing values as '', and `buffer` is the
 * concatenated body or undefined when no body data arrived. The promise rejects
 * when the request or the response stream reports an error.
 */
async function nodeRequest(u, method, headers) {
    return new Promise((resolve, reject) => {
        const reqFn = u.protocol === 'https:' ? https.request : http.request;
        const req = reqFn(u, { method, headers }, (res) => {
            const headersObj = {};
            for (const [k, v] of Object.entries(res.headers)) {
                headersObj[k] = Array.isArray(v) ? v.join(',') : (v ?? '');
            }
            const chunks = [];
            res.on('data', (c) => chunks.push(Buffer.from(c)));
            // A connection that drops after the headers arrived is reported as an
            // 'error' on the response object and 'end' never fires; without this
            // listener the promise would stay pending forever.
            res.on('error', (err) => reject(err));
            res.on('end', () => {
                const buffer = chunks.length ? Buffer.concat(chunks) : undefined;
                resolve({ status: res.statusCode ?? 0, headers: headersObj, buffer });
            });
        });
        req.on('error', (err) => reject(err));
        req.end();
    });
}
/**
* Perform an HTTP HEAD request to retrieve the file size and verify existence;
* when `check` is true, fetch a small range and inspect the magic number to confirm the URL points to a valid PDF.
* If the server does not support range requests, `isPdf` will be set to `false`.
* @param url - The URL of the PDF file to check. Can be a string or URL object.
* @param check - When `true`, download a small byte range (first 4 bytes) to validate the file signature by checking for '%PDF' magic bytes. Default: `false`.
* @returns - A Promise that resolves to a HeaderResult object containing the response status, size, headers, and PDF validation result.
* @public
*/
export async function getHeader(url, check = false) {
try {
const u = typeof url === 'string' ? new URL(url) : url;
const headResp = await nodeRequest(u, 'HEAD');
const size = headResp.headers['content-length'] ? parseInt(headResp.headers['content-length'], 10) : undefined;
let magic = null;
if (check) {
const rangeResp = await nodeRequest(u, 'GET', { Range: 'bytes=0-4' });
if (rangeResp.status >= 200 && rangeResp.status < 300 && rangeResp.buffer) {
const headerStr = rangeResp.buffer.slice(0, 4).toString('utf8');
magic = headerStr.startsWith('%PDF');
}
else {
magic = false;
}
}
return {
ok: headResp.status >= 200 && headResp.status < 300,
status: headResp.status,
size,
magic,
headers: headResp.headers,
};
}
catch (error) {
return {
ok: false,
status: undefined,
size: undefined,
magic: false,
headers: {},
error: new Error(String(error)),
};
}
}
//# sourceMappingURL=getHeader.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"getHeader.js","sourceRoot":"","sources":["../../../src/node/getHeader.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,KAAK,KAAK,MAAM,YAAY,CAAC;AAqBpC,KAAK,UAAU,WAAW,CAAC,CAAM,EAAE,MAAc,EAAE,OAAgC;IAClF,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACtC,MAAM,KAAK,GAAG,CAAC,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC;QACrE,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,EAAE,CAAC,GAAG,EAAE,EAAE;YACjD,MAAM,UAAU,GAA2B,EAAE,CAAC;YAC9C,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;gBAClD,UAAU,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;YAC5D,CAAC;YAED,MAAM,MAAM,GAAa,EAAE,CAAC;YAC5B,GAAG,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACnD,GAAG,CAAC,EAAE,CAAC,KAAK,EAAE,GAAG,EAAE;gBAClB,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;gBACjE,OAAO,CAAC,EAAE,MAAM,EAAE,GAAG,CAAC,UAAU,IAAI,CAAC,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,CAAC,CAAC;YACvE,CAAC,CAAC,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC;QACtC,GAAG,CAAC,GAAG,EAAE,CAAC;IACX,CAAC,CAAC,CAAC;AACJ,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,SAAS,CAAC,GAAiB,EAAE,QAAiB,KAAK;IACxE,IAAI,CAAC;QACJ,MAAM,CAAC,GAAG,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC;QAEvD,MAAM,QAAQ,GAAG,MAAM,WAAW,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;QAC9C,MAAM,IAAI,GAAG,QAAQ,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,gBAAgB,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;QAE/G,IAAI,KAAK,GAAmB,IAAI,CAAC;QACjC,IAAI,KAAK,EAAE,CAAC;YACX,MAAM,SAAS,GAAG,MAAM,WAAW,CAAC,CAAC,EAAE,KAAK,EAAE,EAAE,KAAK,EAAE,WAAW,EAAE,CAAC,CAAC;YACtE,IAAI,SAAS,CAAC,MAAM,IAAI,GAAG,IAAI,SAAS,CAAC,
MAAM,GAAG,GAAG,IAAI,SAAS,CAAC,MAAM,EAAE,CAAC;gBAC3E,MAAM,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;gBAChE,KAAK,GAAG,SAAS,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;YACtC,CAAC;iBAAM,CAAC;gBACP,KAAK,GAAG,KAAK,CAAC;YACf,CAAC;QACF,CAAC;QAED,OAAO;YACN,EAAE,EAAE,QAAQ,CAAC,MAAM,IAAI,GAAG,IAAI,QAAQ,CAAC,MAAM,GAAG,GAAG;YACnD,MAAM,EAAE,QAAQ,CAAC,MAAM;YACvB,IAAI;YACJ,KAAK;YACL,OAAO,EAAE,QAAQ,CAAC,OAAO;SACzB,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QAChB,OAAO;YACN,EAAE,EAAE,KAAK;YACT,MAAM,EAAE,SAAS;YACjB,IAAI,EAAE,SAAS;YACf,KAAK,EAAE,KAAK;YACZ,OAAO,EAAE,EAAE;YACX,KAAK,EAAE,IAAI,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;SAC/B,CAAC;IACH,CAAC;AACF,CAAC"}

3
node_modules/pdf-parse/dist/node/esm/index.d.ts generated vendored Normal file
View File

@@ -0,0 +1,3 @@
export type * from './getHeader.js';
export * from './getHeader.js';
//# sourceMappingURL=index.d.ts.map

1
node_modules/pdf-parse/dist/node/esm/index.d.ts.map generated vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/node/index.ts"],"names":[],"mappings":"AAAA,mBAAmB,gBAAgB,CAAC;AACpC,cAAc,gBAAgB,CAAC"}

2
node_modules/pdf-parse/dist/node/esm/index.js generated vendored Normal file
View File

@@ -0,0 +1,2 @@
export * from './getHeader.js';
//# sourceMappingURL=index.js.map

1
node_modules/pdf-parse/dist/node/esm/index.js.map generated vendored Normal file
View File

@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/node/index.ts"],"names":[],"mappings":"AACA,cAAc,gBAAgB,CAAC"}

2
node_modules/pdf-parse/dist/pdf-parse/cjs/index.cjs generated vendored Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

898
node_modules/pdf-parse/dist/pdf-parse/cjs/index.d.cts generated vendored Normal file
View File

@@ -0,0 +1,898 @@
import type { DocumentInitParameters } from 'pdfjs-dist/types/src/display/api.js';
import type { ImageKind } from 'pdfjs-dist/legacy/build/pdf.mjs';
import { Metadata } from 'pdfjs-dist/types/src/display/metadata.js';
import type { PDFDataRangeTransport } from 'pdfjs-dist/types/src/display/api.js';
import type { PDFWorker } from 'pdfjs-dist/types/src/display/api.js';
import { VerbosityLevel } from 'pdfjs-dist/legacy/build/pdf.mjs';
/**
* Error used to indicate that an operation was aborted (for example by an AbortSignal).
* Extends the built-in `Error`.
*
* @public
*/
export declare class AbortException extends Error {
/**
* Create a new AbortException.
* @param message - Optional error message.
* @param cause - Optional underlying cause; typed `unknown`, so any thrown value can be passed.
*/
constructor(message?: string, cause?: unknown);
}
/**
* @public
* Consolidated date information gathered from different PDF sources.
* The PDF 'Info' dictionary contains CreationDate / ModDate and
* the XMP/XAP metadata can contain several timestamps as well. This
* structure collects those values (if present) as JavaScript Date objects
* or null when the property exists but cannot be parsed.
* Every field is optional.
*/
export declare type DateNode = {
// Timestamps from the PDF 'Info' dictionary.
CreationDate?: Date | null;
ModDate?: Date | null;
// Timestamps from XMP metadata.
XmpCreateDate?: Date | null;
XmpModifyDate?: Date | null;
XmpMetadataDate?: Date | null;
// Timestamps from XAP metadata.
XapCreateDate?: Date | null;
XapModifyDate?: Date | null;
XapMetadataDate?: Date | null;
};
/**
* @public
* EmbeddedImage
* - Normalized representation of an embedded image extracted from the PDF.
* - `data`: Raw image bytes (e.g. PNG/JPEG) as Uint8Array. Use this for file writing or binary processing.
* - `dataUrl`: Data URL (e.g. "data:image/png;base64,...") for directly embedding in <img> src.
* Both `data` and `dataUrl` are declared as required, so consumers can pick whichever form is more convenient.
* - `name`: Resource name for the image.
* - `width` / `height`: Dimensions in pixels.
* - `kind`: ImageKindValue indicating the pixel format (e.g. GRAYSCALE_1BPP / RGB_24BPP / RGBA_32BPP).
*/
export declare interface EmbeddedImage {
data: Uint8Array;
dataUrl: string;
name: string;
width: number;
height: number;
kind: ImageKindValue;
}
/**
* Error thrown when the PDF structure/contents are malformed and cannot be parsed.
*
* This is raised for low-level format problems detected while reading PDF objects,
* i.e. errors that occur while parsing the PDF data.
*
* @public
*/
export declare class FormatError extends Error {
/**
* Create a new FormatError.
* @param message - Optional message describing the format problem.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, cause?: unknown);
}
/**
* Normalize arbitrary thrown values into an Error instance used by the library.
*
* Known Error instances with specific names are mapped to the library's
* typed exceptions in order to preserve type information and any additional
* fields (for example `details`, `status`, etc.). If the value is not an
* Error it is converted to a generic Error containing the stringified value.
*
* The parameter is typed `unknown`, so the value caught by a `catch` clause can be passed directly.
*
* @public
* @param error - The thrown value to normalize.
* @returns An Error instance representing the provided value.
*/
export declare function getException(error: unknown): Error;
/**
* @public
* ImageKindKey
* - Union of the property names of the pdf.js `ImageKind` object (e.g. "GRAYSCALE_1BPP", "RGB_24BPP", "RGBA_32BPP").
*/
export declare type ImageKindKey = keyof typeof ImageKind;
/**
* @public
* ImageKindValue
* - Union of the value types of the pdf.js `ImageKind` object, looked up through `ImageKindKey` (numeric values, e.g. 1, 2, 3).
*/
export declare type ImageKindValue = (typeof ImageKind)[ImageKindKey];
/**
* @public
* ImageResult
* Helper container for extracted images grouped per page.
*/
export declare class ImageResult {
// One entry per page, each holding that page's embedded images.
pages: Array<PageImages>;
// NOTE(review): presumably the total number of pages in the document — confirm against the implementation.
total: number;
// NOTE(review): presumably looks up the image called `name` on page `num`; the `null` in the return type suggests "not found" — confirm.
getPageImage(num: number, name: string): EmbeddedImage | null;
constructor(total: number);
}
/**
* @public
* Aggregated information about a PDF document returned by getInfo().
* The object contains high-level metadata, outline/bookmark structure,
* per-page extracted hyperlinks and utility helpers for parsing dates.
*/
export declare class InfoResult {
// NOTE(review): presumably the total number of pages in the document — confirm against the implementation.
total: number;
/**
* The PDF 'Info' dictionary. Typical fields include title, author, subject,
* Creator, Producer and Creation/Modification dates. The exact structure is
* determined by the PDF and as returned by PDF.js.
*/
info?: any;
// Metadata object of the pdf.js `Metadata` type, when available.
metadata?: Metadata;
/**
* An array of document fingerprint strings provided by PDF.js. Useful
* for caching, de-duplication or identifying a document across runs.
*/
fingerprints?: Array<string | null>;
/**
* Permission flags for the document as returned by PDF.js (or null).
* These flags indicate capabilities such as printing, copying and
* other restrictions imposed by the PDF security settings.
*/
permission?: number[] | null;
/**
* Optional document outline (bookmarks). When present this is the
* hierarchical navigation structure which viewers use for quick access.
*/
outline?: Array<OutlineNode> | null;
// Per-page link extraction results.
pages: Array<PageLinkResult>;
/**
* Collects dates from different sources (Info dictionary and XMP/XAP metadata)
* and returns them as a DateNode where available. This helps callers compare
* and choose the most relevant timestamp (for example a creation date vs XMP date).
*/
getDateNode(): DateNode;
/**
* Try to parse an ISO-8601 date string from XMP/XAP metadata. If the
* value is falsy or cannot be parsed, undefined is returned to indicate
* absence or unparsable input.
*/
private parseISODateString;
constructor(total: number);
}
/**
* Error thrown when the parsed data is not a valid PDF document.
*
* Use this exception to signal that the input cannot be interpreted as a PDF
* (corrupt file, invalid header, etc.). Extends the built-in `Error`.
*
* @public
*/
export declare class InvalidPDFException extends Error {
/**
* Create a new InvalidPDFException.
* @param message - Optional error message.
* @param cause - Optional underlying cause (preserved on modern runtimes).
*/
constructor(message?: string, cause?: unknown);
}
/**
* A line between two points (`from` and `to`); extends `Shape`.
* NOTE(review): `Shape` and `Point` are declared outside this excerpt and no
* implementation is visible here. The member notes below are inferred from
* names and signatures only — confirm against the implementation.
*/
export declare class Line extends Shape {
from: Point;
to: Point;
// One of the `LineDirection` enum values.
direction: LineDirection;
length: number;
intersections: Array<Point>;
gaps: Array<Line>;
constructor(from: Point, to: Point);
private init;
private _valid;
get valid(): boolean;
get normalized(): Line;
addGap(line: Line): void;
containsPoint(p: Point): boolean;
addIntersectionPoint(point: Point): void;
// May return `undefined` — presumably when the two lines do not intersect; TODO confirm.
intersection(line: Line): Point | undefined;
// Declared to return `this`, so calls can be chained.
transform(matrix: Array<number>): this;
}
/**
* Direction classification used as the type of `Line.direction`.
* Numeric enum: `None` = 0, `Horizontal` = 1, `Vertical` = 2.
*/
export declare enum LineDirection {
None = 0,
Horizontal = 1,
Vertical = 2
}
/**
* Container holding two collections of `Line` objects (`hLines` and `vLines`)
* with methods that produce `TableData` / `Table` values.
* NOTE(review): presumably stores the horizontal and vertical rulings used for
* table detection — inferred from member names only; confirm against the implementation.
*/
export declare class LineStore {
hLines: Array<Line>;
vLines: Array<Line>;
add(line: Line): void;
addRectangle(rect: Rectangle): void;
getTableData(): Array<TableData>;
getTables(): Array<Table>;
normalize(): void;
normalizeHorizontal(): void;
normalizeVertical(): void;
private fillTable;
private tryFill;
// NOTE(review): "marge" is presumably a misspelling of "merge"; the spelling is part of the declared member names, so it is left untouched here.
private margeHorizontalLines;
private margeVerticalLines;
}
/**
* @public
* LoadParameters
* PDF loading parameters. Extends the pdf.js `DocumentInitParameters` type;
* every field declared here is optional.
*/
export declare interface LoadParameters extends DocumentInitParameters {
/**
* The URL of the PDF.
* Default: `undefined`.
*/
url?: string | URL | undefined;
/**
* Binary PDF data.
* Use TypedArrays (e.g., `Uint8Array`) to improve memory usage. If PDF data is BASE64-encoded, use `atob()` to convert it to a binary string first.
* **NOTE**: If TypedArrays are used, they will generally be transferred to the worker thread, reducing main-thread memory usage but taking ownership of the array.
* Default: `undefined`.
*/
data?: string | number[] | ArrayBuffer | TypedArray | undefined;
/**
* Basic authentication headers.
* Default: `undefined`.
*/
httpHeaders?: Object | undefined;
/**
* Indicates whether cross-site Access-Control requests should be made using credentials (e.g., cookies or auth headers).
* Default: `false`.
*/
withCredentials?: boolean | undefined;
/**
* For decrypting password-protected PDFs.
* Default: `undefined`.
*/
password?: string | undefined;
/**
* The PDF file length. Used for progress reports and range requests.
* Default: `undefined`.
*/
length?: number | undefined;
/**
* Allows using a custom range transport implementation.
* Default: `undefined`.
*/
range?: PDFDataRangeTransport | undefined;
/**
* Maximum number of bytes fetched per range request.
* Default: `65536` (`2^16`).
*/
rangeChunkSize?: number | undefined;
/**
* The worker used for loading and parsing PDF data.
* Default: `undefined`.
*/
worker?: PDFWorker | undefined;
/**
* Controls logging level; use constants from `VerbosityLevel`.
* Default: `undefined`.
*/
verbosity?: number | undefined;
/**
* Base URL of the document, used to resolve relative URLs in annotations and outline items.
* Default: `undefined`.
*/
docBaseUrl?: string | undefined;
/**
* URL where predefined Adobe CMaps are located. Include trailing slash.
* Default: `undefined`.
*/
cMapUrl?: string | undefined;
/**
* Specifies if Adobe CMaps are binary-packed.
* Default: `true`.
*/
cMapPacked?: boolean | undefined;
/**
* Factory for reading built-in CMap files.
* Default: `{DOMCMapReaderFactory}`.
*/
CMapReaderFactory?: Object | undefined;
/**
* URL where predefined ICC profiles are located. Include trailing slash.
* Default: `undefined`.
*/
iccUrl?: string | undefined;
/**
* If `true`, non-embedded fonts fall back to system fonts.
* Default: `true` in browsers, `false` in Node.js (unless `disableFontFace === true`, then always `false`).
*/
useSystemFonts?: boolean | undefined;
/**
* URL for standard font files. Include trailing slash.
* Default: `undefined`.
*/
standardFontDataUrl?: string | undefined;
/**
* Factory for reading standard font files.
* Default: `{DOMStandardFontDataFactory}`.
*/
StandardFontDataFactory?: Object | undefined;
/**
* URL for WebAssembly files. Include trailing slash.
* Default: `undefined`.
*/
wasmUrl?: string | undefined;
/**
* Factory for reading WASM files.
* Default: `{DOMWasmFactory}`.
*/
WasmFactory?: Object | undefined;
/**
* Enable `fetch()` in worker thread for CMap/font/WASM files. If `true`, factory options are ignored.
* Default: `true` in browsers, `false` in Node.js.
*/
useWorkerFetch?: boolean | undefined;
/**
* Attempt to use WebAssembly for better performance (e.g., image decoding).
* Default: `true`.
*/
useWasm?: boolean | undefined;
/**
* Reject promises (e.g., `getTextContent`) on parse errors instead of recovering partially.
* Default: `false`.
*/
stopAtErrors?: boolean | undefined;
/**
* Max image size in total pixels (`width * height`). Use `-1` for no limit.
* Default: `-1`.
*/
maxImageSize?: number | undefined;
/**
* Whether evaluating strings as JS is allowed (for PDF function performance).
* Default: `true`.
*/
isEvalSupported?: boolean | undefined;
/**
* Whether `OffscreenCanvas` can be used in worker.
* Default: `true` in browsers, `false` in Node.js.
*/
isOffscreenCanvasSupported?: boolean | undefined;
/**
* Whether `ImageDecoder` can be used in worker.
* Default: `true` in browsers, `false` in Node.js.
* **NOTE**: Temporarily disabled in Chromium due to bugs:
* - Crashes with BMP decoder on huge images ([issue 374807001](https://issues.chromium.org/issues/374807001))
* - Broken JPEGs with custom color profiles ([issue 378869810](https://issues.chromium.org/issues/378869810))
*/
isImageDecoderSupported?: boolean | undefined;
/**
* Used to determine when to resize images (via `OffscreenCanvas`). Use `-1` to use a slower fallback algorithm.
* Default: `undefined`.
*/
canvasMaxAreaInBytes?: number | undefined;
/**
* Disable `@font-face`/Font Loading API; use built-in glyph renderer instead.
* Default: `false` in browsers, `true` in Node.js.
*/
disableFontFace?: boolean | undefined;
/**
* Include extra (non-rendering) font properties when exporting font data from worker. Increases memory usage.
* Default: `false`.
*/
fontExtraProperties?: boolean | undefined;
/**
* Render XFA forms if present.
* Default: `false`.
*/
enableXfa?: boolean | undefined;
/**
* Explicit document context for creating elements and loading resources. Defaults to current document.
* Default: `undefined`.
*/
ownerDocument?: HTMLDocument | undefined;
/**
* Disable range requests for PDF loading.
* Default: `false`.
*/
disableRange?: boolean | undefined;
/**
* Disable streaming PDF data.
* Default: `false`.
*/
disableStream?: boolean | undefined;
/**
* Disable pre-fetching of PDF data. Requires `disableStream: true` to work fully.
* Default: `false`.
*/
disableAutoFetch?: boolean | undefined;
/**
* Enable debugging hooks (see `web/debugger.js`).
* Default: `false`.
*/
pdfBug?: boolean | undefined;
/**
* Factory for creating canvases.
* Default: `{DOMCanvasFactory}`.
*/
CanvasFactory?: Object | undefined;
/**
* Factory for creating SVG filters during rendering.
* Default: `{DOMFilterFactory}`.
*/
FilterFactory?: Object | undefined;
/**
* Enable hardware acceleration for rendering.
* Default: `false`.
*/
enableHWA?: boolean | undefined;
}
export { Metadata }
/**
* @public
* Node representing a single item in the PDF outline (bookmarks).
* This mirrors the structure returned by PDF.js' getOutline() API.
*/
export declare interface OutlineNode {
title: string;
bold: boolean;
italic: boolean;
color: Uint8ClampedArray;
dest: string | Array<any> | null;
url: string | null;
unsafeUrl?: string;
newWindow?: boolean;
count?: number;
// NOTE(review): presumably the nested child outline nodes (typed `any` in this declaration) — confirm against PDF.js.
items: Array<any>;
}
/**
* @public
* PageImages
* - Represents all embedded images found on a single PDF page.
* - `pageNumber`: 1-based page index.
* - `images`: Array of EmbeddedImage objects for this page.
*/
export declare interface PageImages {
pageNumber: number;
images: EmbeddedImage[];
}
/**
* @public
* Per-page link extraction result.
* - pageNumber: the physical page index (1-based) within the PDF document.
* - pageLabel: optional printed page label shown by PDF viewers (e.g. "iii", "1", "A-1");
* this can differ from the physical page number and may be undefined
* when the document does not provide labels.
* - links: array of text -> URL mappings that were found/overlaid on the page.
* - width/height: page dimensions in PDF units for the viewport used.
*/
export declare type PageLinkResult = {
pageNumber: number;
pageLabel?: string | null;
links: Array<{
text: string;
url: string;
}>;
width: number;
height: number;
};
/**
* @public
* PageTableResult
* Tables associated with one page.
*/
export declare interface PageTableResult {
// NOTE(review): presumably the page number — confirm whether it is 1-based.
num: number;
// `TableArray` is declared outside this excerpt.
tables: TableArray[];
}
/**
* @public
* PageTextResult
* Text associated with one page.
*/
export declare interface PageTextResult {
// NOTE(review): presumably the page number — confirm whether it is 1-based.
num: number;
text: string;
}
/**
* @public
* ParseParameters
* Options to control parsing behavior and output formatting.
* Every option is optional; the documented default applies when it is left out.
*/
export declare interface ParseParameters {
/**
* Array of page numbers to parse.
* When provided, only these pages will be parsed and returned in the same order.
* Example: [1, 3, 5]. Parse only one page: [7].
* Default: `undefined`.
*/
partial?: Array<number>;
/**
* Parse the first N pages (pages 1..N).
* Ignored when `partial` is provided. If both `first` and `last` are set, they define
* an explicit inclusive page range (first..last) and this "first N" semantics is ignored.
* Default: `undefined`.
*/
first?: number;
/**
* Parse the last N pages (pages total-N+1..total).
* Ignored when `partial` is provided. If both `first` and `last` are set, they define
* an explicit inclusive page range (first..last) and this "last N" semantics is ignored.
* Default: `undefined`.
*/
last?: number;
/**
* Collect per-page metadata such as embedded links, title, pageLabel, and dimensions;
* ISBN, DOI, abstract, and references are work in progress when getInfo() is used.
* Default: `false`.
*/
parsePageInfo?: boolean;
/**
* Attempt to detect and include hyperlink annotations (e.g. URLs) associated with text.
* Detected links are formatted as Markdown inline links (for example: [text](https://example.com)).
* Default: `false`.
*/
parseHyperlinks?: boolean;
/**
* Enforce logical line breaks by inserting a newline when the vertical distance
* between text items exceeds `lineThreshold`.
* Useful to preserve paragraph/line structure when text items are emitted as separate segments.
* Default: `true`.
*/
lineEnforce?: boolean;
/**
* Threshold to decide whether nearby text items belong to different lines.
* Larger values make the parser more likely to start a new line between items.
* Default: `4.6`.
*/
lineThreshold?: number;
/**
* String inserted between text items on the same line when a sufficiently large horizontal gap is detected.
* Typically used to emulate a cell/column separator (for example, "\\t" for tabs).
* Default: `'\t'`.
*/
cellSeparator?: string;
/**
* Horizontal distance threshold to decide when two text items on the same baseline should be treated as separate cells.
* A larger value produces fewer (wider) cells; a smaller value creates more cell breaks.
* Default: `7`.
*/
cellThreshold?: number;
/**
* Optional string appended at the end of each page's extracted text to mark page boundaries.
* Supports placeholders `page_number` and `total_number` which are substituted accordingly.
* If omitted or empty, no page boundary marker is added.
* Default: `'\n-- page_number of total_number --'`.
* NOTE(review): "omitted -> no marker" and the non-empty default above contradict each other;
* confirm against the implementation which one applies when the option is left out.
*/
pageJoiner?: string;
/**
* Optional string used to join text items when returning a page's text.
* If provided, this value is used instead of the default empty-string joining behavior.
* Default: `undefined`.
*/
itemJoiner?: string;
/**
* Minimum image dimension (in pixels) for width or height.
* When set, images whose width OR height is below or equal to this value will be ignored by `getImage()`.
* Useful for excluding tiny decorative or tracking images.
* Default: `80`.
* Disable: `0`.
*/
imageThreshold?: number;
/**
* Screenshot scale factor: use 1 for the original size, 1.5 for a 50% larger image, etc.
* Default: `1`.
*/
scale?: number;
/**
* Desired screenshot width in pixels.
* When set, the scale option is ignored.
* Default: `undefined`.
*/
desiredWidth?: number;
/**
* Applies to both getImage() and getScreenshot(): include the image as a base64 data URL string.
* Default: `true`.
*/
imageDataUrl?: boolean;
/**
* Applies to both getImage() and getScreenshot(): include the image as a binary buffer.
* Default: `true`.
*/
imageBuffer?: boolean;
/**
* Include marked content items in the items array of TextContent to capture PDF "marked content".
* Enables tags (MCID, role/props) and structural/accessibility information useful for mapping text ↔ structure.
* For plain text extraction it's usually false (trade-off: larger output).
* Default: `false`.
*/
includeMarkedContent?: boolean;
/**
* When true, text normalization is NOT performed in the worker thread.
* For plain text extraction, normalizing in the worker (false) is usually recommended.
* Default: `false`.
*/
disableNormalization?: boolean;
}
/**
* Error indicating a PDF file requires a password or the provided password is incorrect.
*
* Extends the built-in `Error`; this declaration adds only the constructor.
*
* @public
*/
export declare class PasswordException extends Error {
/**
* Create a new PasswordException.
* @param message - Optional error message.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, cause?: unknown);
}
export { PDFDataRangeTransport }
/**
* @public
* Loads PDF documents and exposes helpers for text, image, table, metadata, and screenshot extraction.
*/
export declare class PDFParse {
/** Private, untyped in this declaration - presumably the `LoadParameters` given to the constructor (confirm). */
private readonly options;
/** Private, untyped in this declaration - presumably the loaded PDF.js document handle (confirm). */
private doc;
/** Progress counters (`loaded` / `total`); their unit is not stated in this declaration. */
progress: {
loaded: number;
total: number;
};
/**
* Create a new parser with `LoadParameters`.
* Converts Node.js `Buffer` data to `Uint8Array` automatically and ensures a default verbosity level.
* @param options - Initialization parameters.
*/
constructor(options: LoadParameters);
/**
* Asynchronous teardown of the parser.
* NOTE(review): presumably releases the loaded document - confirm in the implementation.
*/
destroy(): Promise<void>;
/** Static flag; presumably `true` when running under Node.js (detection logic not visible here). */
static get isNodeJS(): boolean;
/**
* Configure the PDF.js worker.
* @param workerSrc - Optional worker source.
* @returns A string - presumably the worker source in effect afterwards (TODO confirm).
*/
static setWorker(workerSrc?: string): string;
/**
* Load document-level metadata (info, outline, permissions, page labels) and optionally gather per-page link details.
* @param params - Parse options; set `parsePageInfo` to collect per-page metadata described in `ParseParameters`.
* @returns Aggregated document metadata in an `InfoResult`.
*/
getInfo(params?: ParseParameters): Promise<InfoResult>;
private getPageLinks;
/**
* Extract plain text for each requested page, optionally enriching hyperlinks and enforcing line or cell separators.
* @param params - Parse options controlling pagination, link handling, and line/cell thresholds.
* @returns A `TextResult` containing page-wise text and a concatenated document string.
*/
getText(params?: ParseParameters): Promise<TextResult>;
private load;
private shouldParse;
private getPageText;
private getHyperlinks;
/**
* Extract embedded images from requested pages.
*
* Behavior notes:
* - Pages are selected according to ParseParameters (partial, first, last).
* - Images smaller than `params.imageThreshold` (width OR height) are skipped.
* - Returned ImageResult contains per-page PageImages; each image entry includes:
* - data: Uint8Array (present when params.imageBuffer === true)
* - dataUrl: string (present when params.imageDataUrl === true)
* - width, height, kind, name
* - Works in both Node.js (canvas.toBuffer) and browser (canvas.toDataURL) environments.
*
* @param params - ParseParameters controlling page selection, thresholds and output format.
* @returns Promise<ImageResult> with extracted images grouped by page.
*/
getImage(params?: ParseParameters): Promise<ImageResult>;
private convertToRGBA;
private resolveEmbeddedImage;
/**
* Render pages to raster screenshots.
*
* Behavior notes:
* - Pages are selected according to ParseParameters (partial, first, last).
* - Use params.scale for zoom; if params.desiredWidth is specified it takes precedence.
* - Each ScreenshotResult page contains:
* - data: Uint8Array (when params.imageBuffer === true)
* - dataUrl: string (when params.imageDataUrl === true)
* - pageNumber, width, height, scale
* - Works in both Node.js (canvas.toBuffer) and browser (canvas.toDataURL) environments.
*
* @param parseParams - ParseParameters controlling page selection and render options.
* @returns Promise<ScreenshotResult> with rendered page images.
*/
getScreenshot(parseParams?: ParseParameters): Promise<ScreenshotResult>;
/**
* Detect and extract tables from pages by analysing vector drawing operators, then populate cells with text.
*
* Behavior notes:
* - Scans operator lists for rectangles/lines that form table grids (uses PathGeometry and LineStore).
* - Normalizes detected geometry and matches positioned text to table cells.
* - Honors ParseParameters for page selection.
*
* @param params - ParseParameters controlling which pages to analyse (partial/first/last).
* @returns Promise<TableResult> containing discovered tables per page.
*/
getTable(params?: ParseParameters): Promise<TableResult>;
private getPathGeometry;
private getPageTables;
private fillPageTables;
}
export { PDFWorker }
/**
* A `Shape` described by an `x` / `y` coordinate pair.
*/
export declare class Point extends Shape {
x: number;
y: number;
constructor(x: number, y: number);
/** Compares this point with another; presumably within `Shape.tolerance` - confirm in the implementation. */
equal(point: Point): boolean;
/** Applies `matrix` (a number array) and returns this same instance type (`this`). */
transform(matrix: Array<number>): this;
}
/**
* A `Shape` described by an origin point (`from`) plus `width` and `height`.
*/
export declare class Rectangle extends Shape {
from: Point;
width: number;
height: number;
constructor(from: Point, width: number, height: number);
/** Read-only point; presumably the corner opposite `from` (from + width/height) - confirm. */
get to(): Point;
/** Returns `Line` objects for this rectangle; presumably its edges - confirm. */
getLines(): Line[];
/** Applies `matrix` (a number array) and returns this same instance type (`this`). */
transform(matrix: Array<number>): this;
}
/**
 * Represents an HTTP/network response error encountered while fetching PDF data.
 *
 * The `status` and `missing` properties mirror values that may be provided
 * by the underlying PDF library's network layer.
 *
 * @public
 */
export declare class ResponseException extends Error {
    /**
     * Numeric HTTP/status code as passed to the constructor;
     * `undefined` when none was supplied.
     */
    status?: number;
    /**
     * Value describing missing resources as passed to the constructor;
     * `undefined` when none was supplied.
     */
    missing?: unknown;
    /**
     * Create a new ResponseException.
     * @param message - Optional error message.
     * @param status - Optional numeric HTTP/status code.
     * @param missing - Optional field describing missing resources.
     * @param cause - Optional underlying cause.
     */
    constructor(message?: string, status?: number, missing?: unknown, cause?: unknown);
}
/**
* @public
* SafeParseParameters
* `ParseParameters` in which `lineThreshold`, `cellThreshold` and `scale` are required;
* every other option keeps its optional declaration.
*/
export declare type SafeParseParameters = Required<Pick<ParseParameters, 'lineThreshold' | 'cellThreshold' | 'scale'>> & ParseParameters;
/**
* @public
* Screenshot
* One rendered page image together with its page number, pixel size and scale.
* NOTE(review): `data` and `dataUrl` are both declared as required here; confirm how they are
* populated when `imageBuffer` / `imageDataUrl` are turned off in `ParseParameters`.
*/
export declare interface Screenshot {
data: Uint8Array;
dataUrl: string;
pageNumber: number;
width: number;
height: number;
scale: number;
}
/**
* @public
* ScreenshotResult
* Container for rendered page screenshots.
*/
export declare class ScreenshotResult {
/** One `Screenshot` entry per rendered page. */
pages: Array<Screenshot>;
/** Count supplied to the constructor; presumably the document's total page count - confirm. */
total: number;
constructor(total: number);
}
/**
* Turns `ParseParameters` into `SafeParseParameters`, i.e. a value in which
* `lineThreshold`, `cellThreshold` and `scale` are guaranteed to be present.
* NOTE(review): presumably fills in the documented defaults - the body is not visible here.
* @param params - Caller-supplied parse options.
* @returns The options with the required fields set.
*/
export declare function setDefaultParseParameters(params: ParseParameters): SafeParseParameters;
/**
* Abstract base class of the geometry types declared in this file (`Point` and `Rectangle` extend it).
*/
export declare abstract class Shape {
/** Shared numeric tolerance; presumably used for coordinate comparisons - confirm. */
static tolerance: number;
/** Must be implemented by subclasses; applies `matrix` and returns the instance (`this`). */
abstract transform(matrix: Array<number>): this;
/** Takes two number arrays (`p`, `m`) and returns a number array; presumably point x matrix - confirm layout. */
static applyTransform(p: Array<number>, m: Array<number>): Array<number>;
}
/**
* Table built from `Line` objects, kept in two lists: `hLines` and `vLines`
* (presumably horizontal and vertical grid lines).
* NOTE(review): only the declaration is visible here; the member notes below are inferred
* from names and signatures and should be confirmed against the implementation.
*/
export declare class Table {
hLines: Array<Line>;
vLines: Array<Line>;
/** Starts a table from a single line. */
constructor(line: Line);
get isValid(): boolean;
get rowPivots(): Array<number>;
get colPivots(): Array<number>;
/** Offers another line to the table; the boolean presumably reports whether it was accepted. */
add(line: Line): boolean;
private intersection;
private getSameHorizontal;
private getSameVertical;
private mergeHorizontalLines;
private mergeVerticalLines;
normalize(): void;
verticalExists(line: Line, y1: number, y2: number): boolean;
horizontalExists(line: Line, x1: number, x2: number): boolean;
private findBottomLineIndex;
private findVerticalLineIndexs;
private getRow;
/** Produces the `TableData` representation of this table. */
toData(): TableData;
}
/** A table as an array of rows, each row being an array of string cells. */
export declare type TableArray = Array<Array<string>>;
/**
* One cell of a `TableData` row (not exported).
* Carries two corner points (`minXY`, `maxXY`), its `width` / `height`,
* optional `colspan` / `rowspan`, and the cell's text as an array of strings.
*/
declare type TableCell = {
minXY: Point;
maxXY: Point;
width: number;
height: number;
colspan?: number;
rowspan?: number;
text: Array<string>;
};
/**
* Cell grid of a table (not exported): bounded by `minXY` / `maxXY` and stored as `rows` of `TableCell`s.
* NOTE(review): member notes are inferred from signatures; confirm against the implementation.
*/
declare class TableData {
minXY: Point;
maxXY: Point;
rows: Array<TableRow>;
private rowPivots;
private colPivots;
constructor(minXY: Point, maxXY: Point, rowPivots: Array<number>, colPivots: Array<number>);
/** Looks up the cell for a coordinate; `undefined` when there is none. */
findCell(x: number, y: number): TableCell | undefined;
get cellCount(): number;
get rowCount(): number;
check(): boolean;
/** Returns the table as rows of strings (same shape as `TableArray`). */
toArray(): string[][];
}
/**
* @public
* TableResult
* Container for tables extracted from a document.
*/
export declare class TableResult {
/** Per-page table results. */
pages: Array<PageTableResult>;
/** Tables as `TableArray`s; presumably the tables of all pages combined - confirm. */
mergedTables: TableArray[];
/** Count supplied to the constructor; presumably the document's total page count - confirm. */
total: number;
constructor(total: number);
}
/** One table row: an array of `TableCell`s (not exported). */
declare type TableRow = Array<TableCell>;
/**
* @public
* TextResult
* Container for extracted text: per-page entries plus one document-level string.
*/
export declare class TextResult {
/** Per-page text results. */
pages: Array<PageTextResult>;
/** Document-level text string. */
text: string;
/** Count supplied to the constructor; presumably the document's total page count - confirm. */
total: number;
/** Returns text for page `num` as a string; behavior for an unknown page is not visible here. */
getPageText(num: number): string;
constructor(total: number);
}
/**
* Union of the numeric typed-array classes listed below.
* `BigInt64Array` and `BigUint64Array` are not part of the union.
*/
export declare type TypedArray = Int8Array | Uint8Array | Uint8ClampedArray | Int16Array | Uint16Array | Int32Array | Uint32Array | Float32Array | Float64Array;
/**
 * Generic wrapper for errors where the library cannot classify the cause.
 *
 * The `details` property may contain additional information provided by the
 * underlying PDF library.
 *
 * @public
 */
export declare class UnknownErrorException extends Error {
    /**
     * Additional information as passed to the constructor;
     * `undefined` when none was supplied.
     */
    details?: unknown;
    /**
     * Create a new UnknownErrorException.
     * @param message - Optional error message.
     * @param details - Optional additional details from the PDF library.
     * @param cause - Optional underlying cause.
     */
    constructor(message?: string, details?: unknown, cause?: unknown);
}
export { VerbosityLevel }
export { }

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,107 @@
/**
* Error thrown when the parsed data is not a valid PDF document.
*
* Use this exception to signal that the input cannot be interpreted as a PDF
* (corrupt file, invalid header, etc.).
*
* Extends the built-in `Error`; this declaration adds only the constructor.
*
* @public
*/
export declare class InvalidPDFException extends Error {
/**
* Create a new InvalidPDFException.
* @param message - Optional error message.
* @param cause - Optional underlying cause (preserved on modern runtimes).
*/
constructor(message?: string, cause?: unknown);
}
/**
* Error indicating a PDF file requires a password or the provided password is incorrect.
*
* Extends the built-in `Error`; this declaration adds only the constructor.
*
* @public
*/
export declare class PasswordException extends Error {
/**
* Create a new PasswordException.
* @param message - Optional error message.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, cause?: unknown);
}
/**
* Error thrown when the PDF structure/contents are malformed and cannot be parsed.
*
* This is raised for low-level format problems detected while reading or
* parsing PDF objects.
*
* @public
*/
export declare class FormatError extends Error {
/**
* Create a new FormatError.
* @param message - Optional message describing the format problem.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, cause?: unknown);
}
/**
 * Generic wrapper for errors where the library cannot classify the cause.
 *
 * The `details` property may contain additional information provided by the
 * underlying PDF library.
 *
 * @public
 */
export declare class UnknownErrorException extends Error {
    /**
     * Additional information as passed to the constructor;
     * `undefined` when none was supplied.
     */
    details?: unknown;
    /**
     * Create a new UnknownErrorException.
     * @param message - Optional error message.
     * @param details - Optional additional details from the PDF library.
     * @param cause - Optional underlying cause.
     */
    constructor(message?: string, details?: unknown, cause?: unknown);
}
/**
 * Represents an HTTP/network response error encountered while fetching PDF data.
 *
 * The `status` and `missing` properties mirror values that may be provided
 * by the underlying PDF library's network layer.
 *
 * @public
 */
export declare class ResponseException extends Error {
    /**
     * Numeric HTTP/status code as passed to the constructor;
     * `undefined` when none was supplied.
     */
    status?: number;
    /**
     * Value describing missing resources as passed to the constructor;
     * `undefined` when none was supplied.
     */
    missing?: unknown;
    /**
     * Create a new ResponseException.
     * @param message - Optional error message.
     * @param status - Optional numeric HTTP/status code.
     * @param missing - Optional field describing missing resources.
     * @param cause - Optional underlying cause.
     */
    constructor(message?: string, status?: number, missing?: unknown, cause?: unknown);
}
/**
* Error used to indicate that an operation was aborted (for example by an AbortSignal).
*
* Extends the built-in `Error`; this declaration adds only the constructor.
*
* @public
*/
export declare class AbortException extends Error {
/**
* Create a new AbortException.
* @param message - Optional error message.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, cause?: unknown);
}
/**
* Normalize arbitrary thrown values into an Error instance used by the library.
*
* Known Error instances with specific names are mapped to the library's
* typed exceptions in order to preserve type information and any additional
* fields (for example `details`, `status`, etc.). An Error whose name is not
* one of the mapped names is returned unchanged. If the value is not an
* Error it is converted to a generic Error containing the stringified value.
*
* @public
* @param error - The thrown value to normalize.
* @returns An Error instance representing the provided value.
*/
export declare function getException(error: unknown): Error;
//# sourceMappingURL=Exception.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Exception.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/Exception.ts"],"names":[],"mappings":"AAEA;;;;;;;GAOG;AACH,qBAAa,mBAAoB,SAAQ,KAAK;IAC7C;;;;OAIG;gBACS,OAAO,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,OAAO;CAkB7C;AAED;;;;GAIG;AACH,qBAAa,iBAAkB,SAAQ,KAAK;IAC3C;;;;OAIG;gBACS,OAAO,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,OAAO;CAa7C;AAED;;;;;;;GAOG;AACH,qBAAa,WAAY,SAAQ,KAAK;IACrC;;;;OAIG;gBACS,OAAO,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,OAAO;CAa7C;AAED;;;;;;;GAOG;AACH,qBAAa,qBAAsB,SAAQ,KAAK;IAC/C;;;;;OAKG;gBACS,OAAO,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,EAAE,KAAK,CAAC,EAAE,OAAO;CAchE;AAED;;;;;;;GAOG;AACH,qBAAa,iBAAkB,SAAQ,KAAK;IAC3C;;;;;;OAMG;gBACS,OAAO,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,EAAE,KAAK,CAAC,EAAE,OAAO;CAejF;AAED;;;;GAIG;AACH,qBAAa,cAAe,SAAQ,KAAK;IACxC;;;;OAIG;gBACS,OAAO,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,OAAO;CAY7C;AAED;;;;;;;;;;;GAWG;AACH,wBAAgB,YAAY,CAAC,KAAK,EAAE,OAAO,GAAG,KAAK,CAyBlD"}

217
node_modules/pdf-parse/dist/pdf-parse/esm/Exception.js generated vendored Normal file
View File

@@ -0,0 +1,217 @@
/* biome-ignore-all lint/suspicious/noExplicitAny: underline-type */
/**
* Error thrown when the parsed data is not a valid PDF document.
*
* Use this exception to signal that the input cannot be interpreted as a PDF
* (corrupt file, invalid header, etc.).
*
* @public
*/
export class InvalidPDFException extends Error {
/**
* Create a new InvalidPDFException.
* @param message - Optional error message.
* @param cause - Optional underlying cause (preserved on modern runtimes).
*/
constructor(message, cause) {
if (cause !== undefined) {
// Use modern ErrorOptions to attach cause when supported
super(message ?? 'Invalid PDF', { cause });
}
else {
super(message ?? 'Invalid PDF');
}
this.name = 'InvalidPDFException';
// Fix TS/ES prototype chain (required)
Object.setPrototypeOf(this, InvalidPDFException.prototype);
// preserve native stack trace where available
if (typeof Error.captureStackTrace === 'function') {
Error.captureStackTrace(this, InvalidPDFException);
}
// If you need to support older TS/targets that don't accept ErrorOptions,
// replace the above super(...) with super(...); and uncomment:
// if (cause !== undefined) (this as any).cause = cause;
}
}
/**
* Error indicating a PDF file requires a password or the provided password is incorrect.
*
* @public
*/
export class PasswordException extends Error {
/**
* Create a new PasswordException.
* @param message - Optional error message.
* @param cause - Optional underlying cause.
*/
constructor(message, cause) {
if (cause !== undefined) {
super(message ?? 'Password required or incorrect', { cause });
}
else {
super(message ?? 'Password required or incorrect');
}
this.name = 'PasswordException';
Object.setPrototypeOf(this, PasswordException.prototype);
if (typeof Error.captureStackTrace === 'function') {
Error.captureStackTrace(this, PasswordException);
}
// Fallback for older targets: if needed use (this as any).cause = cause;
}
}
/**
* Error thrown when the PDF structure/contents are malformed and cannot be parsed.
*
* This is raised for low-level format problems detected while reading PDF objects.
* Errors caused during parsing PDF data.
*
* @public
*/
export class FormatError extends Error {
/**
* Create a new FormatError.
* @param message - Optional message describing the format problem.
* @param cause - Optional underlying cause.
*/
constructor(message, cause) {
if (cause !== undefined) {
super(message ?? 'PDF format error', { cause });
}
else {
super(message ?? 'PDF format error');
}
this.name = 'FormatError';
Object.setPrototypeOf(this, FormatError.prototype);
if (typeof Error.captureStackTrace === 'function') {
Error.captureStackTrace(this, FormatError);
}
// Fallback for older targets: if needed use (this as any).cause = cause;
}
}
/**
* Generic wrapper for errors where the library cannot classify the cause.
*
* The `details` property may contain additional information provided by the
* underlying PDF library.
*
* @public
*/
export class UnknownErrorException extends Error {
/**
* Create a new UnknownErrorException.
* @param message - Optional error message.
* @param details - Optional additional details from the PDF library.
* @param cause - Optional underlying cause.
*/
constructor(message, details, cause) {
if (cause !== undefined) {
super(message ?? 'Unknown error', { cause });
}
else {
super(message ?? 'Unknown error');
}
this.name = 'UnknownErrorException';
Object.setPrototypeOf(this, UnknownErrorException.prototype);
if (typeof Error.captureStackTrace === 'function') {
Error.captureStackTrace(this, UnknownErrorException);
}
// additional info field from pdf.mjs
this.details = details;
}
}
/**
* Represents an HTTP/network response error encountered while fetching PDF data.
*
* The `status` and `missing` properties mirror values that may be provided
* by the underlying PDF library's network layer.
*
* @public
*/
export class ResponseException extends Error {
/**
* Create a new ResponseException.
* @param message - Optional error message.
* @param status - Optional numeric HTTP/status code.
* @param missing - Optional field describing missing resources.
* @param cause - Optional underlying cause.
*/
constructor(message, status, missing, cause) {
if (cause !== undefined) {
super(message ?? 'Response error', { cause });
}
else {
super(message ?? 'Response error');
}
this.name = 'ResponseException';
Object.setPrototypeOf(this, ResponseException.prototype);
if (typeof Error.captureStackTrace === 'function') {
Error.captureStackTrace(this, ResponseException);
}
// fields from pdf.mjs
this.status = status;
this.missing = missing;
}
}
/**
* Error used to indicate that an operation was aborted (for example by an AbortSignal).
*
* @public
*/
export class AbortException extends Error {
/**
* Create a new AbortException.
* @param message - Optional error message.
* @param cause - Optional underlying cause.
*/
constructor(message, cause) {
if (cause !== undefined) {
super(message ?? 'Operation aborted', { cause });
}
else {
super(message ?? 'Operation aborted');
}
this.name = 'AbortException';
Object.setPrototypeOf(this, AbortException.prototype);
if (typeof Error.captureStackTrace === 'function') {
Error.captureStackTrace(this, AbortException);
}
}
}
/**
 * Turn any thrown value into an Error instance the library works with.
 *
 * An Error whose `name` matches one of the library's exception names is
 * re-created as the matching typed exception, with the original error passed
 * on as `cause` and its extra fields (`details`, `status`, `missing`) carried
 * over. Any other Error is handed back untouched. A value that is not an
 * Error becomes a plain Error holding the stringified value.
 *
 * @public
 * @param error - The thrown value to normalize.
 * @returns An Error instance representing the provided value.
 */
export function getException(error) {
    // Anything that is not an Error is wrapped in a generic one.
    if (!(error instanceof Error)) {
        return new Error(String(error));
    }
    const kind = error.name;
    if (kind === 'InvalidPDFException') {
        return new InvalidPDFException(error.message, error);
    }
    if (kind === 'PasswordException') {
        return new PasswordException(error.message, error);
    }
    if (kind === 'FormatError') {
        return new FormatError(error.message, error);
    }
    if (kind === 'UnknownErrorException') {
        // carry over `details` from the original when it has one
        return new UnknownErrorException(error.message, error.details, error);
    }
    if (kind === 'ResponseException') {
        return new ResponseException(error.message, error.status, error.missing, error);
    }
    if (kind === 'AbortException') {
        return new AbortException(error.message, error);
    }
    // No mapping for this name: keep the original error (and its stack).
    return error;
}
//# sourceMappingURL=Exception.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Exception.js","sourceRoot":"","sources":["../../../src/pdf-parse/Exception.ts"],"names":[],"mappings":"AAAA,oEAAoE;AAEpE;;;;;;;GAOG;AACH,MAAM,OAAO,mBAAoB,SAAQ,KAAK;IAC7C;;;;OAIG;IACH,YAAY,OAAgB,EAAE,KAAe;QAC5C,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACzB,yDAAyD;YACzD,KAAK,CAAC,OAAO,IAAI,aAAa,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;QAC5C,CAAC;aAAM,CAAC;YACP,KAAK,CAAC,OAAO,IAAI,aAAa,CAAC,CAAC;QACjC,CAAC;QACD,IAAI,CAAC,IAAI,GAAG,qBAAqB,CAAC;QAClC,uCAAuC;QACvC,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,mBAAmB,CAAC,SAAS,CAAC,CAAC;QAC3D,8CAA8C;QAC9C,IAAI,OAAQ,KAAa,CAAC,iBAAiB,KAAK,UAAU,EAAE,CAAC;YAC3D,KAAa,CAAC,iBAAiB,CAAC,IAAI,EAAE,mBAAmB,CAAC,CAAC;QAC7D,CAAC;QACD,0EAA0E;QAC1E,+DAA+D;QAC/D,wDAAwD;IACzD,CAAC;CACD;AAED;;;;GAIG;AACH,MAAM,OAAO,iBAAkB,SAAQ,KAAK;IAC3C;;;;OAIG;IACH,YAAY,OAAgB,EAAE,KAAe;QAC5C,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,CAAC,OAAO,IAAI,gCAAgC,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;QAC/D,CAAC;aAAM,CAAC;YACP,KAAK,CAAC,OAAO,IAAI,gCAAgC,CAAC,CAAC;QACpD,CAAC;QACD,IAAI,CAAC,IAAI,GAAG,mBAAmB,CAAC;QAChC,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,iBAAiB,CAAC,SAAS,CAAC,CAAC;QACzD,IAAI,OAAQ,KAAa,CAAC,iBAAiB,KAAK,UAAU,EAAE,CAAC;YAC3D,KAAa,CAAC,iBAAiB,CAAC,IAAI,EAAE,iBAAiB,CAAC,CAAC;QAC3D,CAAC;QACD,yEAAyE;IAC1E,CAAC;CACD;AAED;;;;;;;GAOG;AACH,MAAM,OAAO,WAAY,SAAQ,KAAK;IACrC;;;;OAIG;IACH,YAAY,OAAgB,EAAE,KAAe;QAC5C,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,CAAC,OAAO,IAAI,kBAAkB,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;QACjD,CAAC;aAAM,CAAC;YACP,KAAK,CAAC,OAAO,IAAI,kBAAkB,CAAC,CAAC;QACtC,CAAC;QACD,IAAI,CAAC,IAAI,GAAG,aAAa,CAAC;QAC1B,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,WAAW,CAAC,SAAS,CAAC,CAAC;QACnD,IAAI,OAAQ,KAAa,CAAC,iBAAiB,KAAK,UAAU,EAAE,CAAC;YAC3D,KAAa,CAAC,iBAAiB,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;QACrD,CAAC;QACD,yEAAyE;IAC1E,CAAC;CACD;AAED;;;;;;;GAOG;AACH,MAAM,OAAO,qBAAsB,SAAQ,KAAK;IAC/C;;;;;OAKG;IACH,YAAY,OAAgB,EAAE,OAAiB,EAAE,KAAe;QAC/D,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,CAAC,OAAO,IAAI,eAAe,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;QAC9C,CAAC;aAAM,CAAC;YACP,KAAK,CAAC,OAAO,IAAI,eAAe,CAAC,CAAC;QACnC,CAAC;QACD,IAAI,CAAC,IAAI,
GAAG,uBAAuB,CAAC;QACpC,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,qBAAqB,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,OAAQ,KAAa,CAAC,iBAAiB,KAAK,UAAU,EAAE,CAAC;YAC3D,KAAa,CAAC,iBAAiB,CAAC,IAAI,EAAE,qBAAqB,CAAC,CAAC;QAC/D,CAAC;QACD,qCAAqC;QACpC,IAAY,CAAC,OAAO,GAAG,OAAO,CAAC;IACjC,CAAC;CACD;AAED;;;;;;;GAOG;AACH,MAAM,OAAO,iBAAkB,SAAQ,KAAK;IAC3C;;;;;;OAMG;IACH,YAAY,OAAgB,EAAE,MAAe,EAAE,OAAiB,EAAE,KAAe;QAChF,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,CAAC,OAAO,IAAI,gBAAgB,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;QAC/C,CAAC;aAAM,CAAC;YACP,KAAK,CAAC,OAAO,IAAI,gBAAgB,CAAC,CAAC;QACpC,CAAC;QACD,IAAI,CAAC,IAAI,GAAG,mBAAmB,CAAC;QAChC,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,iBAAiB,CAAC,SAAS,CAAC,CAAC;QACzD,IAAI,OAAQ,KAAa,CAAC,iBAAiB,KAAK,UAAU,EAAE,CAAC;YAC3D,KAAa,CAAC,iBAAiB,CAAC,IAAI,EAAE,iBAAiB,CAAC,CAAC;QAC3D,CAAC;QACD,sBAAsB;QACrB,IAAY,CAAC,MAAM,GAAG,MAAM,CAAC;QAC7B,IAAY,CAAC,OAAO,GAAG,OAAO,CAAC;IACjC,CAAC;CACD;AAED;;;;GAIG;AACH,MAAM,OAAO,cAAe,SAAQ,KAAK;IACxC;;;;OAIG;IACH,YAAY,OAAgB,EAAE,KAAe;QAC5C,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,CAAC,OAAO,IAAI,mBAAmB,EAAE,EAAE,KAAK,EAAE,CAAC,CAAC;QAClD,CAAC;aAAM,CAAC;YACP,KAAK,CAAC,OAAO,IAAI,mBAAmB,CAAC,CAAC;QACvC,CAAC;QACD,IAAI,CAAC,IAAI,GAAG,gBAAgB,CAAC;QAC7B,MAAM,CAAC,cAAc,CAAC,IAAI,EAAE,cAAc,CAAC,SAAS,CAAC,CAAC;QACtD,IAAI,OAAQ,KAAa,CAAC,iBAAiB,KAAK,UAAU,EAAE,CAAC;YAC3D,KAAa,CAAC,iBAAiB,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;QACxD,CAAC;IACF,CAAC;CACD;AAED;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,YAAY,CAAC,KAAc;IAC1C,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;QAC5B,qDAAqD;QACrD,QAAQ,KAAK,CAAC,IAAI,EAAE,CAAC;YACpB,KAAK,qBAAqB;gBACzB,OAAO,IAAI,mBAAmB,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YACtD,KAAK,mBAAmB;gBACvB,OAAO,IAAI,iBAAiB,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YACpD,KAAK,aAAa;gBACjB,OAAO,IAAI,WAAW,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAC9C,KAAK,uBAAuB;gBAC3B,0CAA0C;gBAC1C,OAAO,IAAI,qBAAqB,CAAC,KAAK,CAAC,OAAO,EAAG,KAAa,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YAChF,KAAK,mBAAmB;gBACvB,OAAO,IAAI,iBAAiB,CAAC,KAAK,CAAC,OAAO,EAAG,KAAa,CAAC,MAAM,EAAG,KAAa,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YACnG,KAAK,gBAAgB;gBACpB,OAAO,IAA
I,cAAc,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;YACjD,+BAA+B;YAC/B;gBACC,OAAO,KAAK,CAAC;QACf,CAAC;IACF,CAAC;IAED,sCAAsC;IACtC,OAAO,IAAI,KAAK,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;AACjC,CAAC"}

View File

@@ -0,0 +1,55 @@
import type { ImageKind } from 'pdfjs-dist/legacy/build/pdf.mjs';
/**
* @public
* ImageKindKey
* - Represents the keys of the ImageKind enum (e.g. "GRAYSCALE_1BPP", "RGB_24BPP", "RGBA_32BPP").
* - Derived from the `ImageKind` value that is imported (type-only) from pdfjs-dist.
*/
export type ImageKindKey = keyof typeof ImageKind;
/**
* @public
* ImageKindValue
* - Represents the numeric values of the ImageKind enum (e.g. 1, 2, 3).
* - Computed by indexing `typeof ImageKind` with every `ImageKindKey`.
*/
export type ImageKindValue = (typeof ImageKind)[ImageKindKey];
/**
* @public
* ImageResult
* Helper container for extracted images grouped per page.
*/
export declare class ImageResult {
/** Per-page image groups. */
pages: Array<PageImages>;
/** Count supplied to the constructor. */
total: number;
/**
* Find an image by page number and resource name.
* @param num - Page number compared against `PageImages.pageNumber`.
* @param name - Image name compared against `EmbeddedImage.name`.
* @returns The first matching image, or `null` when there is none.
*/
getPageImage(num: number, name: string): EmbeddedImage | null;
constructor(total: number);
}
/**
* @public
* PageImages
* All embedded images found on a single PDF page.
*/
export interface PageImages {
/** 1-based page index. */
pageNumber: number;
/** EmbeddedImage objects for this page. */
images: EmbeddedImage[];
}
/**
* @public
* EmbeddedImage
* - Normalized representation of an embedded image extracted from the PDF.
* - `data`: Raw image bytes (e.g. PNG/JPEG) as Uint8Array. Use this for file writing or binary processing.
* - `dataUrl`: Data URL (e.g. "data:image/png;base64,...") for directly embedding in <img> src.
* Storing both lets consumers choose the most convenient form; consider omitting one to save memory.
* - `name`: Resource name for the image.
* - `width` / `height`: Dimensions in pixels.
* - `kind`: ImageKindValue indicating the pixel format (e.g. GRAYSCALE_1BPP / RGB_24BPP / RGBA_32BPP).
*
* NOTE(review): `data` and `dataUrl` are both declared as required here, although either can
* apparently be switched off via `imageBuffer` / `imageDataUrl` - confirm what is stored then.
*/
export interface EmbeddedImage {
data: Uint8Array;
dataUrl: string;
name: string;
width: number;
height: number;
kind: ImageKindValue;
}
//# sourceMappingURL=ImageResult.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"ImageResult.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/ImageResult.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,iCAAiC,CAAC;AAEjE;;;;GAIG;AACH,MAAM,MAAM,YAAY,GAAG,MAAM,OAAO,SAAS,CAAC;AAElD;;;;GAIG;AACH,MAAM,MAAM,cAAc,GAAG,CAAC,OAAO,SAAS,CAAC,CAAC,YAAY,CAAC,CAAC;AAE9D;;;;GAIG;AACH,qBAAa,WAAW;IACvB,KAAK,EAAE,KAAK,CAAC,UAAU,CAAC,CAAM;IAC9B,KAAK,EAAE,MAAM,CAAK;IAEX,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,aAAa,GAAG,IAAI;gBAaxD,KAAK,EAAE,MAAM;CAGzB;AAED;;;;;;GAMG;AACH,MAAM,WAAW,UAAU;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,aAAa,EAAE,CAAC;CACxB;AAED;;;;;;;;;;GAUG;AACH,MAAM,WAAW,aAAa;IAE7B,IAAI,EAAE,UAAU,CAAC;IAGjB,OAAO,EAAE,MAAM,CAAC;IAGhB,IAAI,EAAE,MAAM,CAAC;IAGb,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IAGf,IAAI,EAAE,cAAc,CAAC;CACrB"}

View File

@@ -0,0 +1,25 @@
/**
* @public
* ImageResult
* Helper container for extracted images grouped per page.
*/
export class ImageResult {
pages = [];
total = 0;
getPageImage(num, name) {
for (const pageData of this.pages) {
if (pageData.pageNumber === num) {
for (const img of pageData.images) {
if (img.name === name) {
return img;
}
}
}
}
return null;
}
constructor(total) {
this.total = total;
}
}
//# sourceMappingURL=ImageResult.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"ImageResult.js","sourceRoot":"","sources":["../../../src/pdf-parse/ImageResult.ts"],"names":[],"mappings":"AAgBA;;;;GAIG;AACH,MAAM,OAAO,WAAW;IACvB,KAAK,GAAsB,EAAE,CAAC;IAC9B,KAAK,GAAW,CAAC,CAAC;IAEX,YAAY,CAAC,GAAW,EAAE,IAAY;QAC5C,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACnC,IAAI,QAAQ,CAAC,UAAU,KAAK,GAAG,EAAE,CAAC;gBACjC,KAAK,MAAM,GAAG,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;oBACnC,IAAI,GAAG,CAAC,IAAI,KAAK,IAAI,EAAE,CAAC;wBACvB,OAAO,GAAG,CAAC;oBACZ,CAAC;gBACF,CAAC;YACF,CAAC;QACF,CAAC;QACD,OAAO,IAAI,CAAC;IACb,CAAC;IAED,YAAY,KAAa;QACxB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACpB,CAAC;CACD"}

View File

@@ -0,0 +1,104 @@
import type { Metadata } from 'pdfjs-dist/types/src/display/metadata.js';
export type { Metadata } from 'pdfjs-dist/types/src/display/metadata.js';
/**
* @public
* Node representing a single item in the PDF outline (bookmarks).
* This mirrors the structure returned by PDF.js' getOutline() API.
*
* Only `unsafeUrl`, `newWindow` and `count` are optional; `dest` and `url`
* are always present but may be `null`.
*/
export interface OutlineNode {
title: string;
bold: boolean;
italic: boolean;
color: Uint8ClampedArray;
dest: string | Array<any> | null;
url: string | null;
unsafeUrl?: string;
newWindow?: boolean;
count?: number;
/** Nested entries; typed as `any` here - presumably child outline nodes, confirm against PDF.js. */
items: Array<any>;
}
/**
* @public
* Consolidated date information gathered from different PDF sources.
* The PDF 'Info' dictionary contains CreationDate / ModDate and
* the XMP/XAP metadata can contain several timestamps as well. This
* structure collects those values (if present) as JavaScript Date objects
* or null when the property exists but cannot be parsed.
* Every field is optional, so a source that provides no timestamp simply
* leaves its fields out.
*/
export type DateNode = {
CreationDate?: Date | null;
ModDate?: Date | null;
XmpCreateDate?: Date | null;
XmpModifyDate?: Date | null;
XmpMetadataDate?: Date | null;
XapCreateDate?: Date | null;
XapModifyDate?: Date | null;
XapMetadataDate?: Date | null;
};
/**
* @public
* Per-page link extraction result.
* - pageNumber: the physical page index (1-based) within the PDF document.
* - pageLabel: optional printed page label shown by PDF viewers (e.g. "iii", "1", "A-1");
* this can differ from the physical page number and may be undefined
* when the document does not provide labels.
* - links: array of text->URL mappings that were found/overlaid on the page.
* `text` is the annotation's overlaid text, or an empty string when it has none.
* - width/height: page dimensions in PDF units for the viewport used
* (PDFParse.getPageLinks() reads them from a viewport created with scale 1).
*/
export type PageLinkResult = {
pageNumber: number;
pageLabel?: string | null;
links: Array<{
text: string;
url: string;
}>;
width: number;
height: number;
};
/**
* @public
* Aggregated information about a PDF document returned by getInfo().
* The object contains high-level metadata, outline/bookmark structure,
* per-page extracted hyperlinks and utility helpers for parsing dates.
*/
export declare class InfoResult {
/** Total number of pages in the PDF document (count of physical pages). */
total: number;
/**
* The PDF 'Info' dictionary. Typical fields include title, author, subject,
* Creator, Producer and Creation/Modification dates. The exact structure is
* determined by the PDF and as returned by PDF.js.
*/
info?: any;
/**
* Low-level document metadata object (XMP). Use this to access extended
* properties that are not present in the Info dictionary.
*/
metadata?: Metadata;
/**
* An array of document fingerprint strings provided by PDF.js. Useful
* for caching, de-duplication or identifying a document across runs.
*/
fingerprints?: Array<string | null>;
/**
* Permission flags for the document as returned by PDF.js (or null).
* These flags indicate capabilities such as printing, copying and
* other restrictions imposed by the PDF security settings.
*/
permission?: number[] | null;
/**
* Optional document outline (bookmarks). When present this is the
* hierarchical navigation structure which viewers use for quick access.
*/
outline?: Array<OutlineNode> | null;
/** Results of per-page hyperlink extraction. Empty array by default. */
pages: Array<PageLinkResult>;
/**
* Collects dates from different sources (Info dictionary and XMP/XAP metadata)
* and returns them as a DateNode where available. This helps callers compare
* and choose the most relevant timestamp (for example a creation date vs XMP date).
* When no XMP metadata is set, only the Info-based dates are returned.
*/
getDateNode(): DateNode;
/**
* Try to parse an ISO-8601 date string from XMP/XAP metadata. If the
* value is falsy or cannot be parsed, undefined is returned to indicate
* absence or unparsable input.
*/
private parseISODateString;
/**
* @param total - Page count stored on `total`.
*/
constructor(total: number);
}
//# sourceMappingURL=InfoResult.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"InfoResult.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/InfoResult.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,0CAA0C,CAAC;AAWzE,YAAY,EAAE,QAAQ,EAAE,MAAM,0CAA0C,CAAC;AAEzE;;;;GAIG;AACH,MAAM,WAAW,WAAW;IAE3B,KAAK,EAAE,MAAM,CAAC;IAGd,IAAI,EAAE,OAAO,CAAC;IAGd,MAAM,EAAE,OAAO,CAAC;IAGhB,KAAK,EAAE,iBAAiB,CAAC;IAMzB,IAAI,EAAE,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC;IAGjC,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC;IAGnB,SAAS,CAAC,EAAE,MAAM,CAAC;IAGnB,SAAS,CAAC,EAAE,OAAO,CAAC;IAGpB,KAAK,CAAC,EAAE,MAAM,CAAC;IAIf,KAAK,EAAE,KAAK,CAAC,GAAG,CAAC,CAAC;CAClB;AAED;;;;;;;GAOG;AACH,MAAM,MAAM,QAAQ,GAAG;IACtB,YAAY,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IAC3B,OAAO,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IACtB,aAAa,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IAC5B,aAAa,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IAC5B,eAAe,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IAC9B,aAAa,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IAC5B,aAAa,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;IAC5B,eAAe,CAAC,EAAE,IAAI,GAAG,IAAI,CAAC;CAC9B,CAAC;AAEF;;;;;;;;;GASG;AACH,MAAM,MAAM,cAAc,GAAG;IAE5B,UAAU,EAAE,MAAM,CAAC;IAInB,SAAS,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;IAI1B,KAAK,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAG5C,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CACf,CAAC;AAEF;;;;;GAKG;AACH,qBAAa,UAAU;IAEtB,KAAK,EAAE,MAAM,CAAC;IAEd;;;;OAIG;IAEH,IAAI,CAAC,EAAE,GAAG,CAAC;IAIX,QAAQ,CAAC,EAAE,QAAQ,CAAC;IAEpB;;;OAGG;IACH,YAAY,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,IAAI,CAAC,CAAC;IAEpC;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC;IAE7B;;;OAGG;IACH,OAAO,CAAC,EAAE,KAAK,CAAC,WAAW,CAAC,GAAG,IAAI,CAAC;IAGpC,KAAK,EAAE,KAAK,CAAC,cAAc,CAAC,CAAM;IAElC;;;;OAIG;IACI,WAAW,IAAI,QAAQ;IAuD9B;;;;OAIG;IACH,OAAO,CAAC,kBAAkB;gBAWd,KAAK,EAAE,MAAM;CAGzB"}

116
node_modules/pdf-parse/dist/pdf-parse/esm/InfoResult.js generated vendored Normal file
View File

@@ -0,0 +1,116 @@
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
// XMP/XAP metadata keys that may hold timestamps, in the order they are read.
const XMP_DATE_PROPERTIES = [
    'xmp:createdate',
    'xmp:modifydate',
    'xmp:metadatadate',
    'xap:createdate',
    'xap:modifydate',
    'xap:metadatadate',
];
/**
 * @public
 * Document-level summary produced by getInfo(): Info dictionary, XMP metadata,
 * fingerprints, permissions, outline and per-page hyperlink results, plus a
 * helper that gathers the dates found in those sources.
 */
export class InfoResult {
    /** Total number of pages in the PDF document (count of physical pages). */
    total;
    /**
     * The PDF 'Info' dictionary as returned by PDF.js. Typical fields include
     * title, author, subject, Creator, Producer and creation/modification dates.
     */
    info;
    /**
     * Low-level document metadata object (XMP). Gives access to extended
     * properties that are not part of the Info dictionary.
     */
    metadata;
    /**
     * Document fingerprint strings provided by PDF.js; handy for caching,
     * de-duplication or recognising a document across runs.
     */
    fingerprints;
    /**
     * Permission flags as returned by PDF.js (or null), describing capabilities
     * such as printing or copying under the PDF security settings.
     */
    permission;
    /**
     * Optional document outline (bookmarks): the hierarchical navigation
     * structure viewers use for quick access.
     */
    outline;
    /** Per-page hyperlink extraction results. Empty array by default. */
    pages = [];

    /**
     * @param total - Page count stored on `total`.
     */
    constructor(total) {
        this.total = total;
    }

    /**
     * Gather the dates available from the Info dictionary and from XMP/XAP
     * metadata into a single DateNode-shaped object.
     *
     * Info dates are only added when the dictionary holds a truthy value; they
     * are converted with PDF.js' PDFDateString.toDateObject(). When metadata is
     * present, every XMP/XAP field is assigned, holding `undefined` when the
     * property is missing or cannot be parsed.
     * @returns The collected dates.
     */
    getDateNode() {
        const dates = {};

        const creation = this.info?.CreationDate;
        if (creation) {
            dates.CreationDate = pdfjs.PDFDateString.toDateObject(creation);
        }
        const modification = this.info?.ModDate;
        if (modification) {
            dates.ModDate = pdfjs.PDFDateString.toDateObject(modification);
        }

        // Without XMP metadata only the Info-based dates can be reported.
        const metadata = this.metadata;
        if (!metadata) {
            return dates;
        }

        // Result field for each XMP_DATE_PROPERTIES entry, aligned by index.
        const dateFields = [
            'XmpCreateDate',
            'XmpModifyDate',
            'XmpMetadataDate',
            'XapCreateDate',
            'XapModifyDate',
            'XapMetadataDate',
        ];
        XMP_DATE_PROPERTIES.forEach((property, index) => {
            dates[dateFields[index]] = this.parseISODateString(metadata.get(property));
        });
        return dates;
    }

    /**
     * Parse an ISO-8601 style date string taken from XMP/XAP metadata.
     * @param isoDateString - Raw metadata value.
     * @returns A Date when `Date.parse` accepts the value; `undefined` when the
     * value is falsy or unparsable.
     */
    parseISODateString(isoDateString) {
        if (isoDateString) {
            const timestamp = Date.parse(isoDateString);
            return Number.isNaN(timestamp) ? undefined : new Date(timestamp);
        }
        return undefined;
    }
}
//# sourceMappingURL=InfoResult.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"InfoResult.js","sourceRoot":"","sources":["../../../src/pdf-parse/InfoResult.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,iCAAiC,CAAC;AAGzD,MAAM,mBAAmB,GAAG;IAC3B,gBAAgB;IAChB,gBAAgB;IAChB,kBAAkB;IAClB,gBAAgB;IAChB,gBAAgB;IAChB,kBAAkB;CAClB,CAAC;AA2FF;;;;;GAKG;AACH,MAAM,OAAO,UAAU;IACtB,uEAAuE;IACvE,KAAK,CAAS;IAEd;;;;OAIG;IACH,2EAA2E;IAC3E,IAAI,CAAO;IAEX,wEAAwE;IACxE,0DAA0D;IAC1D,QAAQ,CAAY;IAEpB;;;OAGG;IACH,YAAY,CAAwB;IAEpC;;;;OAIG;IACH,UAAU,CAAmB;IAE7B;;;OAGG;IACH,OAAO,CAA6B;IAEpC,sEAAsE;IACtE,KAAK,GAA0B,EAAE,CAAC;IAElC;;;;OAIG;IACI,WAAW;QACjB,MAAM,MAAM,GAAa,EAAE,CAAC;QAE5B,kFAAkF;QAClF,2EAA2E;QAC3E,MAAM,YAAY,GAAI,IAAI,CAAC,IAAY,EAAE,YAAY,CAAC;QAEtD,IAAI,YAAY,EAAE,CAAC;YAClB,MAAM,CAAC,YAAY,GAAG,KAAK,CAAC,aAAa,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;QACtE,CAAC;QAED,2EAA2E;QAC3E,MAAM,OAAO,GAAI,IAAI,CAAC,IAAY,EAAE,OAAO,CAAC;QAE5C,IAAI,OAAO,EAAE,CAAC;YACb,MAAM,CAAC,OAAO,GAAG,KAAK,CAAC,aAAa,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;QAC5D,CAAC;QAED,mEAAmE;QACnE,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACpB,OAAO,MAAM,CAAC;QACf,CAAC;QAED,sEAAsE;QACtE,iEAAiE;QACjE,iCAAiC;QACjC,KAAK,MAAM,IAAI,IAAI,mBAAmB,EAAE,CAAC;YACxC,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;YACvC,MAAM,IAAI,GAAG,IAAI,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC;YAE5C,QAAQ,IAAI,EAAE,CAAC;gBACd,KAAK,mBAAmB,CAAC,CAAC,CAAC;oBAC1B,MAAM,CAAC,aAAa,GAAG,IAAI,CAAC;oBAC5B,MAAM;gBACP,KAAK,mBAAmB,CAAC,CAAC,CAAC;oBAC1B,MAAM,CAAC,aAAa,GAAG,IAAI,CAAC;oBAC5B,MAAM;gBACP,KAAK,mBAAmB,CAAC,CAAC,CAAC;oBAC1B,MAAM,CAAC,eAAe,GAAG,IAAI,CAAC;oBAC9B,MAAM;gBACP,KAAK,mBAAmB,CAAC,CAAC,CAAC;oBAC1B,MAAM,CAAC,aAAa,GAAG,IAAI,CAAC;oBAC5B,MAAM;gBACP,KAAK,mBAAmB,CAAC,CAAC,CAAC;oBAC1B,MAAM,CAAC,aAAa,GAAG,IAAI,CAAC;oBAC5B,MAAM;gBACP,KAAK,mBAAmB,CAAC,CAAC,CAAC;oBAC1B,MAAM,CAAC,eAAe,GAAG,IAAI,CAAC;oBAC9B,MAAM;YACR,CAAC;QACF,CAAC;QAED,OAAO,MAAM,CAAC;IACf,CAAC;IAED;;;;OAIG;IACK,kBAAkB,CAAC,aAAqB;QAC/C,IAAI,CAAC,aAAa;YAAE,OAAO,SAAS,CAAC;QAErC,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QAC7C,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,UAAU,CAAC,EAAE,
CAAC;YAC/B,OAAO,IAAI,IAAI,CAAC,UAAU,CAAC,CAAC;QAC7B,CAAC;QAED,OAAO,SAAS,CAAC;IAClB,CAAC;IAED,YAAY,KAAa;QACxB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACpB,CAAC;CACD"}

View File

@@ -0,0 +1,212 @@
/** biome-ignore-all lint/complexity/noBannedTypes: for underline types */
import type { DocumentInitParameters, PDFDataRangeTransport, PDFWorker } from 'pdfjs-dist/types/src/display/api.js';
export type { PDFDataRangeTransport, PDFWorker };
/**
* @public
* LoadParameters
* PDF loading parameters.
* Extends PDF.js' `DocumentInitParameters`; `PDFParse` keeps the object passed to its
* constructor and hands it to `pdfjs.getDocument()` when the document is first loaded.
*/
export interface LoadParameters extends DocumentInitParameters {
/**
* The URL of the PDF.
* Default: `undefined`.
*/
url?: string | URL | undefined;
/**
* Binary PDF data.
* Use TypedArrays (e.g., `Uint8Array`) to improve memory usage. If PDF data is BASE64-encoded, use `atob()` to convert it to a binary string first.
* **NOTE**: If TypedArrays are used, they will generally be transferred to the worker thread, reducing main-thread memory usage but taking ownership of the array.
* **NOTE**: The `PDFParse` constructor replaces a Node.js `Buffer` value with a new `Uint8Array` built from it.
* Default: `undefined`.
*/
data?: string | number[] | ArrayBuffer | TypedArray | undefined;
/**
* Basic authentication headers.
* Default: `undefined`.
*/
httpHeaders?: Object | undefined;
/**
* Indicates whether cross-site Access-Control requests should be made using credentials (e.g., cookies or auth headers).
* Default: `false`.
*/
withCredentials?: boolean | undefined;
/**
* For decrypting password-protected PDFs.
* Default: `undefined`.
*/
password?: string | undefined;
/**
* The PDF file length. Used for progress reports and range requests.
* Default: `undefined`.
*/
length?: number | undefined;
/**
* Allows using a custom range transport implementation.
* Default: `undefined`.
*/
range?: PDFDataRangeTransport | undefined;
/**
* Maximum number of bytes fetched per range request.
* Default: `65536` (`2^16`).
*/
rangeChunkSize?: number | undefined;
/**
* The worker used for loading and parsing PDF data.
* Default: `undefined`.
*/
worker?: PDFWorker | undefined;
/**
* Controls logging level; use constants from `VerbosityLevel`.
* Default: `undefined`.
* **NOTE**: The `PDFParse` constructor sets this to `VerbosityLevel.ERRORS` when it is left `undefined`.
*/
verbosity?: number | undefined;
/**
* Base URL of the document, used to resolve relative URLs in annotations and outline items.
* Default: `undefined`.
*/
docBaseUrl?: string | undefined;
/**
* URL where predefined Adobe CMaps are located. Include trailing slash.
* Default: `undefined`.
*/
cMapUrl?: string | undefined;
/**
* Specifies if Adobe CMaps are binary-packed.
* Default: `true`.
*/
cMapPacked?: boolean | undefined;
/**
* Factory for reading built-in CMap files.
* Default: `{DOMCMapReaderFactory}`.
*/
CMapReaderFactory?: Object | undefined;
/**
* URL where predefined ICC profiles are located. Include trailing slash.
* Default: `undefined`.
*/
iccUrl?: string | undefined;
/**
* If `true`, non-embedded fonts fall back to system fonts.
* Default: `true` in browsers, `false` in Node.js (unless `disableFontFace === true`, then always `false`).
*/
useSystemFonts?: boolean | undefined;
/**
* URL for standard font files. Include trailing slash.
* Default: `undefined`.
*/
standardFontDataUrl?: string | undefined;
/**
* Factory for reading standard font files.
* Default: `{DOMStandardFontDataFactory}`.
*/
StandardFontDataFactory?: Object | undefined;
/**
* URL for WebAssembly files. Include trailing slash.
* Default: `undefined`.
*/
wasmUrl?: string | undefined;
/**
* Factory for reading WASM files.
* Default: `{DOMWasmFactory}`.
*/
WasmFactory?: Object | undefined;
/**
* Enable `fetch()` in worker thread for CMap/font/WASM files. If `true`, factory options are ignored.
* Default: `true` in browsers, `false` in Node.js.
*/
useWorkerFetch?: boolean | undefined;
/**
* Attempt to use WebAssembly for better performance (e.g., image decoding).
* Default: `true`.
*/
useWasm?: boolean | undefined;
/**
* Reject promises (e.g., `getTextContent`) on parse errors instead of recovering partially.
* Default: `false`.
*/
stopAtErrors?: boolean | undefined;
/**
* Max image size in total pixels (`width * height`). Use `-1` for no limit.
* Default: `-1`.
*/
maxImageSize?: number | undefined;
/**
* Whether evaluating strings as JS is allowed (for PDF function performance).
* Default: `true`.
*/
isEvalSupported?: boolean | undefined;
/**
* Whether `OffscreenCanvas` can be used in worker.
* Default: `true` in browsers, `false` in Node.js.
*/
isOffscreenCanvasSupported?: boolean | undefined;
/**
* Whether `ImageDecoder` can be used in worker.
* Default: `true` in browsers, `false` in Node.js.
* **NOTE**: Temporarily disabled in Chromium due to bugs:
* - Crashes with BMP decoder on huge images ([issue 374807001](https://issues.chromium.org/issues/374807001))
* - Broken JPEGs with custom color profiles ([issue 378869810](https://issues.chromium.org/issues/378869810))
*/
isImageDecoderSupported?: boolean | undefined;
/**
* Used to determine when to resize images (via `OffscreenCanvas`). Use `-1` to use a slower fallback algorithm.
* Default: `undefined`.
*/
canvasMaxAreaInBytes?: number | undefined;
/**
* Disable `@font-face`/Font Loading API; use built-in glyph renderer instead.
* Default: `false` in browsers, `true` in Node.js.
*/
disableFontFace?: boolean | undefined;
/**
* Include extra (non-rendering) font properties when exporting font data from worker. Increases memory usage.
* Default: `false`.
*/
fontExtraProperties?: boolean | undefined;
/**
* Render XFA forms if present.
* Default: `false`.
*/
enableXfa?: boolean | undefined;
/**
* Explicit document context for creating elements and loading resources. Defaults to current document.
* Default: `undefined`.
*/
ownerDocument?: HTMLDocument | undefined;
/**
* Disable range requests for PDF loading.
* Default: `false`.
*/
disableRange?: boolean | undefined;
/**
* Disable streaming PDF data.
* Default: `false`.
*/
disableStream?: boolean | undefined;
/**
* Disable pre-fetching of PDF data. Requires `disableStream: true` to work fully.
* Default: `false`.
*/
disableAutoFetch?: boolean | undefined;
/**
* Enable debugging hooks (see `web/debugger.js`).
* Default: `false`.
*/
pdfBug?: boolean | undefined;
/**
* Factory for creating canvases.
* Default: `{DOMCanvasFactory}`.
*/
CanvasFactory?: Object | undefined;
/**
* Factory for creating SVG filters during rendering.
* Default: `{DOMFilterFactory}`.
*/
FilterFactory?: Object | undefined;
/**
* Enable hardware acceleration for rendering.
* Default: `false`.
*/
enableHWA?: boolean | undefined;
}
/**
* Union of the typed-array classes accepted as binary PDF data (see `LoadParameters.data`).
*/
export type TypedArray = Int8Array | Uint8Array | Uint8ClampedArray | Int16Array | Uint16Array | Int32Array | Uint32Array | Float32Array | Float64Array;
//# sourceMappingURL=LoadParameters.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LoadParameters.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/LoadParameters.ts"],"names":[],"mappings":"AAAA,0EAA0E;AAE1E,OAAO,KAAK,EAAE,sBAAsB,EAAE,qBAAqB,EAAE,SAAS,EAAE,MAAM,qCAAqC,CAAC;AAEpH,YAAY,EAAE,qBAAqB,EAAE,SAAS,EAAE,CAAC;AAEjD;;;;GAIG;AACH,MAAM,WAAW,cAAe,SAAQ,sBAAsB;IAC7D;;;OAGG;IACH,GAAG,CAAC,EAAE,MAAM,GAAG,GAAG,GAAG,SAAS,CAAC;IAC/B;;;;;OAKG;IACH,IAAI,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,WAAW,GAAG,UAAU,GAAG,SAAS,CAAC;IAChE;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IACjC;;;OAGG;IACH,eAAe,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACtC;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC9B;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC5B;;;OAGG;IACH,KAAK,CAAC,EAAE,qBAAqB,GAAG,SAAS,CAAC;IAC1C;;;OAGG;IACH,cAAc,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IACpC;;;OAGG;IACH,MAAM,CAAC,EAAE,SAAS,GAAG,SAAS,CAAC;IAC/B;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC/B;;;OAGG;IACH,UAAU,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAChC;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC7B;;;OAGG;IACH,UAAU,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACjC;;;OAGG;IACH,iBAAiB,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IACvC;;;OAGG;IACH,MAAM,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC5B;;;OAGG;IACH,cAAc,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACrC;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IACzC;;;OAGG;IACH,uBAAuB,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC7C;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC7B;;;OAGG;IACH,WAAW,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IACjC;;;OAGG;IACH,cAAc,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACrC;;;OAGG;IACH,OAAO,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IAC9B;;;OAGG;IACH,YAAY,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACnC;;;OAGG;IACH,YAAY,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAClC;;;OAGG;IACH,eAAe,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACtC;;;OAGG;IACH,0BAA0B,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACjD;;;;;;OAMG;IACH,uBAAuB,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IAC9C;;;OAGG;IACH,oBAAoB,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IAC1C;;;OAGG;IACH,eAAe,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACtC;;;OAGG;IACH,mBAAmB,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IAC1C;;;OAGG;IACH,SAAS,CAAC,EAAE,OA
AO,GAAG,SAAS,CAAC;IAChC;;;OAGG;IACH,aAAa,CAAC,EAAE,YAAY,GAAG,SAAS,CAAC;IACzC;;;OAGG;IACH,YAAY,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACnC;;;OAGG;IACH,aAAa,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACpC;;;OAGG;IACH,gBAAgB,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACvC;;;OAGG;IACH,MAAM,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IAC7B;;;OAGG;IACH,aAAa,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IACnC;;;OAGG;IACH,aAAa,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC;IACnC;;;OAGG;IACH,SAAS,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;CAChC;AAED,MAAM,MAAM,UAAU,GACnB,SAAS,GACT,UAAU,GACV,iBAAiB,GACjB,UAAU,GACV,WAAW,GACX,UAAU,GACV,WAAW,GACX,YAAY,GACZ,YAAY,CAAC"}

View File

@@ -0,0 +1,3 @@
/** biome-ignore-all lint/complexity/noBannedTypes: for underline types */
// Compiled output of a type-only module: nothing is exported at runtime.
export {};
//# sourceMappingURL=LoadParameters.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LoadParameters.js","sourceRoot":"","sources":["../../../src/pdf-parse/LoadParameters.ts"],"names":[],"mappings":"AAAA,0EAA0E"}

View File

@@ -0,0 +1,95 @@
import { ImageResult } from './ImageResult.js';
import { InfoResult } from './InfoResult.js';
import type { LoadParameters } from './LoadParameters.js';
import { type ParseParameters } from './ParseParameters.js';
import { ScreenshotResult } from './ScreenshotResult.js';
import { TableResult } from './TableResult.js';
import { TextResult } from './TextResult.js';
/**
* @public
* Loads PDF documents and exposes helpers for text, image, table, metadata, and screenshot extraction.
*/
export declare class PDFParse {
/** Load parameters given to the constructor; passed to `pdfjs.getDocument()` on first load. */
private readonly options;
/** Loaded PDF.js document; created on the first load and cleared by `destroy()`. */
private doc;
/**
* Most recent progress object reported by the PDF.js loading task.
* Starts as `{ loaded: -1, total: 0 }` before any progress has been reported.
*/
progress: {
loaded: number;
total: number;
};
/**
* Create a new parser with `LoadParameters`.
* Converts Node.js `Buffer` data to `Uint8Array` automatically and ensures a default verbosity level.
* Both adjustments are written onto the `options` object that was passed in.
* @param options - Initialization parameters.
*/
constructor(options: LoadParameters);
/**
* Destroy the loaded PDF.js document, if there is one, and drop the cached reference.
* Does nothing when no document has been loaded.
*/
destroy(): Promise<void>;
/**
* Environment check based on the global `process` object: true when `process` stringifies to
* `[object process]`, `process.versions.nw` is not set, and the process is not an Electron
* process whose `process.type` is defined and different from `'browser'`.
*/
static get isNodeJS(): boolean;
/**
* Publish the PDF.js module on `globalThis.pdfjs` when that global is undefined and, when
* `workerSrc` is provided, store it in `pdfjs.GlobalWorkerOptions.workerSrc`.
* @param workerSrc - Optional worker script location to configure.
* @returns The worker source configured after the call.
*/
static setWorker(workerSrc?: string): string;
/**
* Load document-level metadata (info, outline, permissions, page labels) and optionally gather per-page link details.
* @param params - Parse options; set `parsePageInfo` to collect per-page metadata described in `ParseParameters`.
* @returns Aggregated document metadata in an `InfoResult`.
*/
getInfo(params?: ParseParameters): Promise<InfoResult>;
/** Collects the `Link` annotations of one page together with the page size at scale 1. */
private getPageLinks;
/**
* Extract plain text for each requested page, optionally enriching hyperlinks and enforcing line or cell separators.
* @param params - Parse options controlling pagination, link handling, and line/cell thresholds.
* @returns A `TextResult` containing page-wise text and a concatenated document string.
*/
getText(params?: ParseParameters): Promise<TextResult>;
/** Loads the document on first use and reuses it afterwards; load errors are converted with `getException`. */
private load;
/** Decides whether a page is selected by the `partial` / `first` / `last` parse options. */
private shouldParse;
/** Builds the text of a single page from its PDF.js text content items. */
private getPageText;
/** Maps overlaid link text to the viewport rectangles and URLs of the page's `Link` annotations. */
private getHyperlinks;
/**
* Extract embedded images from requested pages.
*
* Behavior notes:
* - Pages are selected according to ParseParameters (partial, first, last).
* - Images smaller than `params.imageThreshold` (width OR height) are skipped.
* - Returned ImageResult contains per-page PageImages; each image entry includes:
* - data: Uint8Array (present when params.imageBuffer === true)
* - dataUrl: string (present when params.imageDataUrl === true)
* - width, height, kind, name
* - Works in both Node.js (canvas.toBuffer) and browser (canvas.toDataURL) environments.
*
* @param params - ParseParameters controlling page selection, thresholds and output format.
* @returns Promise<ImageResult> with extracted images grouped by page.
*/
getImage(params?: ParseParameters): Promise<ImageResult>;
private convertToRGBA;
private resolveEmbeddedImage;
/**
* Render pages to raster screenshots.
*
* Behavior notes:
* - Pages are selected according to ParseParameters (partial, first, last).
* - Use params.scale for zoom; if params.desiredWidth is specified it takes precedence.
* - Each ScreenshotResult page contains:
* - data: Uint8Array (when params.imageBuffer === true)
* - dataUrl: string (when params.imageDataUrl === true)
* - pageNumber, width, height, scale
* - Works in both Node.js (canvas.toBuffer) and browser (canvas.toDataURL) environments.
*
* @param parseParams - ParseParameters controlling page selection and render options.
* @returns Promise<ScreenshotResult> with rendered page images.
*/
getScreenshot(parseParams?: ParseParameters): Promise<ScreenshotResult>;
/**
* Detect and extract tables from pages by analysing vector drawing operators, then populate cells with text.
*
* Behavior notes:
* - Scans operator lists for rectangles/lines that form table grids (uses PathGeometry and LineStore).
* - Normalizes detected geometry and matches positioned text to table cells.
* - Honors ParseParameters for page selection.
*
* @param params - ParseParameters controlling which pages to analyse (partial/first/last).
* @returns Promise<TableResult> containing discovered tables per page.
*/
getTable(params?: ParseParameters): Promise<TableResult>;
private getPathGeometry;
private getPageTables;
private fillPageTables;
}
//# sourceMappingURL=PDFParse.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"PDFParse.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/PDFParse.ts"],"names":[],"mappings":"AAQA,OAAO,EAAE,WAAW,EAAmB,MAAM,kBAAkB,CAAC;AAChE,OAAO,EAAE,UAAU,EAAuB,MAAM,iBAAiB,CAAC;AAClE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAC1D,OAAO,EAAE,KAAK,eAAe,EAA6B,MAAM,sBAAsB,CAAC;AAEvF,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AACzD,OAAO,EAAwB,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACrE,OAAO,EAA0B,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAErE;;;GAGG;AACH,qBAAa,QAAQ;IACpB,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAiB;IACzC,OAAO,CAAC,GAAG,CAA+B;IACnC,QAAQ,EAAE;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAA4B;IAE9E;;;;OAIG;gBACS,OAAO,EAAE,cAAc;IAYtB,OAAO;IAQpB,WAAkB,QAAQ,IAAI,OAAO,CAYpC;WAEa,SAAS,CAAC,SAAS,CAAC,EAAE,MAAM,GAAG,MAAM;IAsBnD;;;;OAIG;IACU,OAAO,CAAC,MAAM,GAAE,eAAoB,GAAG,OAAO,CAAC,UAAU,CAAC;YA6BzD,YAAY;IA2B1B;;;;OAIG;IACU,OAAO,CAAC,MAAM,GAAE,eAAoB,GAAG,OAAO,CAAC,UAAU,CAAC;YA6BzD,IAAI;IAkBlB,OAAO,CAAC,WAAW;YAwCL,WAAW;YAiFX,aAAa;IAkC3B;;;;;;;;;;;;;;OAcG;IACU,QAAQ,CAAC,MAAM,GAAE,eAAoB,GAAG,OAAO,CAAC,WAAW,CAAC;IA0GzE,OAAO,CAAC,aAAa;IAuErB,OAAO,CAAC,oBAAoB;IAqD5B;;;;;;;;;;;;;;OAcG;IACU,aAAa,CAAC,WAAW,GAAE,eAAoB,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAoGxF;;;;;;;;;;OAUG;IACU,QAAQ,CAAC,MAAM,GAAE,eAAoB,GAAG,OAAO,CAAC,WAAW,CAAC;IA4CzE,OAAO,CAAC,eAAe;YAmBT,aAAa;YAwJb,cAAc;CA0C5B"}

827
node_modules/pdf-parse/dist/pdf-parse/esm/PDFParse.js generated vendored Normal file
View File

@@ -0,0 +1,827 @@
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
import { getException } from './Exception.js';
import { Line, LineStore, Point, Rectangle } from './geometry/index.js';
import { ImageResult } from './ImageResult.js';
import { InfoResult } from './InfoResult.js';
import { setDefaultParseParameters } from './ParseParameters.js';
import { PathGeometry } from './PathGeometry.js';
import { ScreenshotResult } from './ScreenshotResult.js';
import { TableResult } from './TableResult.js';
import { TextResult } from './TextResult.js';
/**
* @public
* Loads PDF documents and exposes helpers for text, image, table, metadata, and screenshot extraction.
*/
export class PDFParse {
options;
doc;
progress = { loaded: -1, total: 0 };
/**
* Create a new parser with `LoadParameters`.
* Converts Node.js `Buffer` data to `Uint8Array` automatically and ensures a default verbosity level.
* @param options - Initialization parameters.
*/
constructor(options) {
if (options.verbosity === undefined) {
options.verbosity = pdfjs.VerbosityLevel.ERRORS;
}
if (typeof Buffer !== 'undefined' && options.data instanceof Buffer) {
options.data = new Uint8Array(options.data);
}
this.options = options;
}
async destroy() {
if (this.doc) {
await this.doc.destroy();
this.doc = undefined;
}
}
// biome-ignore-start lint/suspicious/noExplicitAny: unsupported underline type
static get isNodeJS() {
const isNodeJS = typeof process === 'object' &&
`${process}` === '[object process]' &&
!process.versions.nw &&
!(process.versions.electron &&
typeof process.type !== 'undefined' &&
process.type !== 'browser');
return isNodeJS;
}
static setWorker(workerSrc) {
if (typeof globalThis.pdfjs === 'undefined') {
globalThis.pdfjs = pdfjs;
}
if (pdfjs?.GlobalWorkerOptions === null)
return '';
if (workerSrc !== undefined) {
pdfjs.GlobalWorkerOptions.workerSrc = workerSrc;
return pdfjs.GlobalWorkerOptions.workerSrc;
}
// if (!PDFParse.isNodeJS) {
// pdfjs.GlobalWorkerOptions.workerSrc =
// 'https://cdn.jsdelivr.net/npm/pdf-parse@latest/dist/browser/pdf.worker.min.mjs';
// return pdfjs.GlobalWorkerOptions.workerSrc;
// }
return pdfjs.GlobalWorkerOptions.workerSrc;
}
// biome-ignore-end lint/suspicious/noExplicitAny: unsupported underline type
/**
* Load document-level metadata (info, outline, permissions, page labels) and optionally gather per-page link details.
* @param params - Parse options; set `parsePageInfo` to collect per-page metadata described in `ParseParameters`.
* @returns Aggregated document metadata in an `InfoResult`.
*/
async getInfo(params = {}) {
const doc = await this.load();
const result = new InfoResult(doc.numPages);
const { info, metadata } = await doc.getMetadata();
result.info = info;
result.metadata = metadata;
result.fingerprints = doc.fingerprints;
result.outline = await doc.getOutline();
result.permission = await doc.getPermissions();
const pageLabels = await doc.getPageLabels();
if (params.parsePageInfo) {
for (let i = 1; i <= result.total; i++) {
if (this.shouldParse(i, result.total, params)) {
const page = await doc.getPage(i);
const pageLinkResult = await this.getPageLinks(page);
pageLinkResult.pageLabel = pageLabels?.[page.pageNumber];
result.pages.push(pageLinkResult);
page.cleanup();
}
}
}
return result;
}
async getPageLinks(page) {
const viewport = page.getViewport({ scale: 1 });
const result = {
pageNumber: page.pageNumber,
links: [],
width: viewport.width,
height: viewport.height,
};
// biome-ignore lint/suspicious/noExplicitAny: <unsupported underline type>
const annotations = (await page.getAnnotations({ intent: 'display' })) || [];
for (const i of annotations) {
if (i.subtype !== 'Link')
continue;
const url = i.url ?? i.unsafeUrl;
if (!url)
continue;
const text = i.overlaidText || '';
result.links.push({ url, text });
}
return result;
}
/**
* Extract plain text for each requested page, optionally enriching hyperlinks and enforcing line or cell separators.
* @param params - Parse options controlling pagination, link handling, and line/cell thresholds.
* @returns A `TextResult` containing page-wise text and a concatenated document string.
*/
async getText(params = {}) {
const doc = await this.load();
const result = new TextResult(doc.numPages);
for (let i = 1; i <= result.total; i++) {
if (this.shouldParse(i, result.total, params)) {
const page = await doc.getPage(i);
const text = await this.getPageText(page, params, result.total);
result.pages.push({
text: text,
num: i,
});
page.cleanup();
}
}
for (const page of result.pages) {
if (params.pageJoiner) {
let pageNumber = params.pageJoiner.replace('page_number', `${page.num}`);
pageNumber = pageNumber.replace('total_number', `${result.total}`);
result.text += `${page.text}\n${pageNumber}\n\n`;
}
else {
result.text += `${page.text}\n\n`;
}
}
return result;
}
async load() {
try {
if (this.doc === undefined) {
const loadingTask = pdfjs.getDocument(this.options);
loadingTask.onProgress = (progress) => {
this.progress = progress;
};
this.doc = await loadingTask.promise;
}
return this.doc;
}
catch (error) {
throw getException(error);
}
}
shouldParse(currentPage, totalPage, params) {
params.partial = params?.partial ?? [];
params.first = params?.first ?? 0;
params.last = params?.last ?? 0;
// parse specific pages
if (params.partial.length > 0) {
if (params.partial.includes(currentPage)) {
return true;
}
return false;
}
// parse pagest beetween first..last
if (params.first > 0 && params.last > 0) {
if (currentPage >= params.first && currentPage <= params.last) {
return true;
}
return false;
}
// parse first x page
if (params.first > 0) {
if (currentPage <= params.first) {
return true;
}
return false;
}
// parse last x page
if (params.last > 0) {
if (currentPage > totalPage - params.last) {
return true;
}
return false;
}
return true;
}
/**
* Build the text of a single page from its PDF.js text content items.
* Items are appended in the order PDF.js returns them; depending on the parse
* options, hyperlinks are rewritten as `[text](url)`, extra line breaks are
* inserted between vertically separated items, and a cell separator is
* prefixed to horizontally separated items on the same line.
* @param page - PDF.js page proxy.
* @param parseParams - Parse options; defaults are filled in by setDefaultParseParameters().
* @param total - Total page count (not used in this method body).
* @returns The page text, joined with `itemJoiner` when set, otherwise concatenated directly.
*/
async getPageText(page, parseParams, total) {
const viewport = page.getViewport({ scale: 1 });
const params = setDefaultParseParameters(parseParams);
const textContent = await page.getTextContent({
includeMarkedContent: !!params.includeMarkedContent,
disableNormalization: !!params.disableNormalization,
});
// Link lookup keyed by overlaid text; stays empty unless hyperlink parsing is requested.
let links = new Map();
if (params.parseHyperlinks) {
links = await this.getHyperlinks(page, viewport);
}
const strBuf = [];
// End x and y of the previously appended item, in viewport coordinates.
let lastX;
let lastY;
// Tallest item height seen since the last line break.
let lineHeight = 0;
for (const item of textContent.items) {
// Skip entries without a `str` property (e.g. marked-content markers).
if (!('str' in item))
continue;
// NOTE(review): both operands are the same expression, so `??` has no effect here.
const tm = item.transform ?? item.transform;
const [x, y] = viewport.convertToViewportPoint(tm[4], tm[5]);
if (params.parseHyperlinks) {
// A link applies when it was registered under exactly this text and the
// item's position lies inside the link rectangle.
const posArr = links.get(item.str) || [];
const hit = posArr.find((l) => x >= l.rect.left && x <= l.rect.right && y >= l.rect.top && y <= l.rect.bottom);
if (hit) {
item.str = `[${item.str}](${hit.url})`;
}
}
if (params.lineEnforce) {
// Vertical jump larger than the threshold: consider inserting a line break.
if (lastY !== undefined && Math.abs(lastY - y) > params.lineThreshold) {
const lastItem = strBuf.length ? strBuf[strBuf.length - 1] : undefined;
const isCurrentItemHasNewLine = item.str.startsWith('\n') || (item.str.trim() === '' && item.hasEOL);
// Only when the buffer is non-empty, does not already end with a newline,
// and the current item does not bring its own line break.
if (lastItem?.endsWith('\n') === false && !isCurrentItemHasNewLine) {
const ydiff = Math.abs(lastY - y);
// Require the jump to exceed the current line height (with 1 unit of slack).
if (ydiff - 1 > lineHeight) {
strBuf.push('\n');
lineHeight = 0;
}
}
}
}
if (params.cellSeparator) {
// Same line (small vertical difference) but a wide horizontal gap: mark a cell boundary.
if (lastY !== undefined && Math.abs(lastY - y) < params.lineThreshold) {
if (lastX !== undefined && Math.abs(lastX - x) > params.cellThreshold) {
item.str = `${params.cellSeparator}${item.str}`;
}
}
}
strBuf.push(item.str);
lastX = x + item.width;
lastY = y;
lineHeight = Math.max(lineHeight, item.height);
if (item.hasEOL) {
strBuf.push('\n');
}
// A line break ends the current line, so the height tracking starts over.
if (item.hasEOL || item.str.endsWith('\n')) {
lineHeight = 0;
}
}
if (params.itemJoiner) {
return strBuf.join(params.itemJoiner);
}
return strBuf.join('');
}
async getHyperlinks(page, viewport) {
const result = new Map();
// biome-ignore lint/suspicious/noExplicitAny: <unsupported underline type>
const annotations = (await page.getAnnotations({ intent: 'display' })) || [];
for (const i of annotations) {
if (i.subtype !== 'Link')
continue;
const url = i.url ?? i.unsafeUrl;
if (!url)
continue;
const text = i.overlaidText;
if (!text)
continue;
const rectVp = viewport.convertToViewportRectangle(i.rect);
const left = Math.min(rectVp[0], rectVp[2]) - 0.5;
const top = Math.min(rectVp[1], rectVp[3]) - 0.5;
const right = Math.max(rectVp[0], rectVp[2]) + 0.5;
const bottom = Math.max(rectVp[1], rectVp[3]) + 0.5;
const pos = { rect: { left, top, right, bottom }, url, text, used: false };
const el = result.get(text);
if (el) {
el.push(pos);
}
else {
result.set(text, [pos]);
}
}
return result;
}
/**
* Extract embedded images from requested pages.
*
* Behavior notes:
* - Pages are selected according to ParseParameters (partial, first, last).
* - Images smaller than `params.imageThreshold` (width OR height) are skipped.
* - Returned ImageResult contains per-page PageImages; each image entry includes:
* - data: Uint8Array (present when params.imageBuffer === true)
* - dataUrl: string (present when params.imageDataUrl === true)
* - width, height, kind, name
* - Works in both Node.js (canvas.toBuffer) and browser (canvas.toDataURL) environments.
*
* @param params - ParseParameters controlling page selection, thresholds and output format.
* @returns Promise<ImageResult> with extracted images grouped by page.
*/
async getImage(params = {}) {
const doc = await this.load();
const result = new ImageResult(doc.numPages);
setDefaultParseParameters(params);
for (let i = 1; i <= result.total; i++) {
if (this.shouldParse(i, result.total, params)) {
const page = await doc.getPage(i);
const ops = await page.getOperatorList();
const pageImages = { pageNumber: i, images: [] };
result.pages.push(pageImages);
for (let j = 0; j < ops.fnArray.length; j++) {
if (ops.fnArray[j] === pdfjs.OPS.paintInlineImageXObject || ops.fnArray[j] === pdfjs.OPS.paintImageXObject) {
const name = ops.argsArray[j][0];
const isCommon = page.commonObjs.has(name);
const imgPromise = isCommon
? this.resolveEmbeddedImage(page.commonObjs, name)
: this.resolveEmbeddedImage(page.objs, name);
const { width, height, kind, data } = await imgPromise;
if (params.imageThreshold) {
if (params.imageThreshold >= width || params.imageThreshold >= height) {
continue;
}
}
// biome-ignore lint/suspicious/noExplicitAny: <underlying library does not contain valid typedefs>
const canvasFactory = doc.canvasFactory;
const canvasAndContext = canvasFactory.create(width, height);
const context = canvasAndContext.context;
let imgData = null;
if (kind === pdfjs.ImageKind.RGBA_32BPP) {
imgData = context.createImageData(width, height);
imgData.data.set(data);
}
else {
imgData = context.createImageData(width, height);
this.convertToRGBA({
src: data,
dest: new Uint32Array(imgData.data.buffer),
width,
height,
kind,
});
}
context.putImageData(imgData, 0, 0);
// Browser and Node.js compatibility
let buffer = new Uint8Array();
let dataUrl = '';
if (typeof canvasAndContext.canvas.toBuffer === 'function') {
// Node.js environment (canvas package)
// biome-ignore lint/suspicious/noExplicitAny: <underline lib not support>
let nodeBuffer;
if (params.imageBuffer) {
nodeBuffer = canvasAndContext.canvas.toBuffer('image/png');
buffer = new Uint8Array(nodeBuffer);
}
if (params.imageDataUrl) {
if (nodeBuffer) {
dataUrl = `data:image/png;base64,${nodeBuffer.toString('base64')}`;
}
else {
nodeBuffer = canvasAndContext.canvas.toBuffer('image/png');
buffer = new Uint8Array(nodeBuffer);
dataUrl = `data:image/png;base64,${nodeBuffer.toString('base64')}`;
}
}
}
else {
// Browser environment
if (params.imageBuffer) {
const imageData = canvasAndContext.context.getImageData(0, 0, canvasAndContext.canvas.width, canvasAndContext.canvas.height);
buffer = new Uint8Array(imageData.data);
}
if (params.imageDataUrl) {
dataUrl = canvasAndContext.canvas.toDataURL('image/png');
}
}
pageImages.images.push({
data: buffer,
dataUrl,
name,
height,
width,
kind,
});
}
}
}
}
return result;
}
convertToRGBA({ src, dest, width, height, kind, }) {
if (kind === pdfjs.ImageKind.RGB_24BPP) {
// RGB 24-bit per pixel
for (let i = 0, j = 0; i < src.length; i += 3, j++) {
const r = src[i];
const g = src[i + 1];
const b = src[i + 2];
dest[j] = (255 << 24) | (b << 16) | (g << 8) | r;
}
}
else if (kind === pdfjs.ImageKind.GRAYSCALE_1BPP) {
// Grayscale 1-bit per pixel
let pixelIndex = 0;
for (let i = 0; i < src.length; i++) {
const byte = src[i];
for (let bit = 7; bit >= 0; bit--) {
if (pixelIndex >= width * height)
break;
const isWhite = ((byte >> bit) & 1) === 1;
const gray = isWhite ? 255 : 0;
dest[pixelIndex++] = (255 << 24) | (gray << 16) | (gray << 8) | gray;
}
}
}
else if (kind === undefined || kind === null) {
// Unknown or undefined kind - try to infer from data length
const bytesPerPixel = src.length / (width * height);
if (Math.abs(bytesPerPixel - 3) < 0.1) {
// Likely RGB 24BPP
for (let i = 0, j = 0; i < src.length; i += 3, j++) {
const r = src[i];
const g = src[i + 1];
const b = src[i + 2];
dest[j] = (255 << 24) | (b << 16) | (g << 8) | r;
}
}
else if (Math.abs(bytesPerPixel - 4) < 0.1) {
// Likely RGBA 32BPP
for (let i = 0, j = 0; i < src.length; i += 4, j++) {
const r = src[i];
const g = src[i + 1];
const b = src[i + 2];
const a = src[i + 3];
dest[j] = (a << 24) | (b << 16) | (g << 8) | r;
}
}
else if (Math.abs(bytesPerPixel - 1) < 0.1) {
// Likely grayscale 8BPP
for (let i = 0; i < src.length; i++) {
const gray = src[i];
dest[i] = (255 << 24) | (gray << 16) | (gray << 8) | gray;
}
}
else {
throw new Error(`convertToRGBA: Cannot infer image format. kind: ${kind}, bytesPerPixel: ${bytesPerPixel}, width: ${width}, height: ${height}, dataLength: ${src.length}`);
}
}
else {
throw new Error(`convertToRGBA: Unsupported image kind: ${kind}. Available kinds: GRAYSCALE_1BPP=${pdfjs.ImageKind.GRAYSCALE_1BPP}, RGB_24BPP=${pdfjs.ImageKind.RGB_24BPP}, RGBA_32BPP=${pdfjs.ImageKind.RGBA_32BPP}`);
}
}
resolveEmbeddedImage(pdfObjects, name) {
return new Promise((resolve, reject) => {
// biome-ignore lint/suspicious/noExplicitAny: <underlying library does not contain valid typedefs>
pdfObjects.get(name, (imgData) => {
if (imgData) {
// Check different possible data sources
let dataBuff;
if (imgData.data instanceof Uint8Array) {
dataBuff = imgData.data;
}
else if (imgData.data instanceof Uint8ClampedArray) {
dataBuff = new Uint8Array(imgData.data);
}
else if (imgData.data?.buffer) {
// Typed array with buffer
dataBuff = new Uint8Array(imgData.data.buffer);
}
else if (imgData.bitmap) {
// Some browsers might use bitmap
// biome-ignore lint/suspicious/noExplicitAny: <underlying library does not contain valid typedefs>
const canvasFactory = this.doc.canvasFactory;
const canvasAndContext = canvasFactory.create(imgData.bitmap.width, imgData.bitmap.height);
canvasAndContext.context.drawImage(imgData.bitmap, 0, 0);
const imageData = canvasAndContext.context.getImageData(0, 0, imgData.bitmap.width, imgData.bitmap.height);
dataBuff = new Uint8Array(imageData.data.buffer);
}
else if (ArrayBuffer.isView(imgData.data)) {
// Generic typed array
dataBuff = new Uint8Array(imgData.data.buffer, imgData.data.byteOffset, imgData.data.byteLength);
}
if (!dataBuff) {
reject(new Error(`Image object ${name}: data field is empty or invalid. Available fields: ${Object.keys(imgData).join(', ')}`));
return;
}
if (dataBuff.length === 0) {
reject(new Error(`Image object ${name}: data buffer is empty (length: 0)`));
return;
}
resolve({ width: imgData.width, height: imgData.height, kind: imgData.kind, data: dataBuff });
}
else {
reject(new Error(`Image object ${name} not found`));
}
});
});
}
/**
* Render pages to raster screenshots.
*
* Behavior notes:
* - Pages are selected according to ParseParameters (partial, first, last).
* - Use params.scale for zoom; if params.desiredWidth is specified it takes precedence.
* - Each ScreenshotResult page contains:
* - data: Uint8Array (when params.imageBuffer === true)
* - dataUrl: string (when params.imageDataUrl === true)
* - pageNumber, width, height, scale
* - Works in both Node.js (canvas.toBuffer) and browser (canvas.toDataURL) environments.
*
* @param parseParams - ParseParameters controlling page selection and render options.
* @returns Promise<ScreenshotResult> with rendered page images.
*/
async getScreenshot(parseParams = {}) {
//const base = new URL('../../node_modules/pdfjs-dist/', import.meta.url);
//this.options.cMapUrl = new URL('cmaps/', base).href;
//this.options.cMapPacked = true;
//this.options.standardFontDataUrl = new URL('legacy/build/standard_fonts/', base).href;
const params = setDefaultParseParameters(parseParams);
const doc = await this.load();
const result = new ScreenshotResult(doc.numPages);
if (this.doc === undefined) {
throw new Error('PDF document not loaded');
}
for (let i = 1; i <= result.total; i++) {
if (this.shouldParse(i, result.total, params)) {
const page = await this.doc.getPage(i);
let viewport = page.getViewport({ scale: params.scale });
if (params.desiredWidth) {
viewport = page.getViewport({ scale: 1 });
// desiredWidth
const scale = params.desiredWidth / viewport.width;
viewport = page.getViewport({ scale: scale });
}
// biome-ignore lint/suspicious/noExplicitAny: <underlying library does not contain valid typedefs>
const canvasFactory = this.doc.canvasFactory;
const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
const renderContext = {
canvasContext: canvasAndContext.context,
viewport,
canvas: canvasAndContext.canvas,
};
const renderTask = page.render(renderContext);
await renderTask.promise;
// Convert the canvas to an image buffer.
let data = new Uint8Array();
let dataUrl = '';
if (typeof canvasAndContext.canvas.toBuffer === 'function') {
// Node.js environment (canvas package)
// biome-ignore lint/suspicious/noExplicitAny: <underline lib not support>
let nodeBuffer;
if (params.imageBuffer) {
nodeBuffer = canvasAndContext.canvas.toBuffer('image/png');
data = new Uint8Array(nodeBuffer);
}
if (params.imageDataUrl) {
if (nodeBuffer) {
dataUrl = `data:image/png;base64,${nodeBuffer.toString('base64')}`;
}
else {
nodeBuffer = canvasAndContext.canvas.toBuffer('image/png');
data = new Uint8Array(nodeBuffer);
dataUrl = `data:image/png;base64,${nodeBuffer.toString('base64')}`;
}
}
}
else {
// Browser environment
if (params.imageBuffer) {
const imageData = canvasAndContext.context.getImageData(0, 0, canvasAndContext.canvas.width, canvasAndContext.canvas.height);
data = new Uint8Array(imageData.data);
}
if (params.imageDataUrl) {
dataUrl = canvasAndContext.canvas.toDataURL('image/png');
//const base64 = dataUrl.split(',')[1];
//const binaryString = atob(base64);
//data = new Uint8Array(binaryString.length);
//for (let i = 0; i < binaryString.length; i++) {
// data[i] = binaryString.charCodeAt(i);
//}
}
}
result.pages.push({
data,
dataUrl,
pageNumber: i,
width: viewport.width,
height: viewport.height,
scale: viewport.scale,
});
page.cleanup();
}
}
return result;
}
/**
* Detect and extract tables from pages by analysing vector drawing operators, then populate cells with text.
*
* Behavior notes:
* - Scans operator lists for rectangles/lines that form table grids (uses PathGeometry and LineStore).
* - Normalizes detected geometry and matches positioned text to table cells.
* - Honors ParseParameters for page selection.
*
* @param params - ParseParameters controlling which pages to analyse (partial/first/last).
* @returns Promise<TableResult> containing discovered tables per page.
*/
async getTable(params = {}) {
const doc = await this.load();
const result = new TableResult(doc.numPages);
if (this.doc === undefined) {
throw new Error('PDF document not loaded');
}
for (let i = 1; i <= result.total; i++) {
if (this.shouldParse(i, result.total, params)) {
const page = await this.doc.getPage(i);
//const viewport = page.getViewport({ scale: 1 });
//viewport.convertToViewportPoint(0, 0);
const store = await this.getPageTables(page);
//const store = await this.getPageGeometry(page);
store.normalize();
const tableDataArr = store.getTableData();
await this.fillPageTables(page, tableDataArr);
const pageTableResult = { num: i, tables: [] };
for (const table of tableDataArr) {
//if (table.cellCount < 3) continue
pageTableResult.tables.push(table.toArray());
//const pageTableResult: PageTableResult = { num: i, tables: table.toArray() };
//pageTableResult.tables.push(table.toData())
}
result.pages.push(pageTableResult);
page.cleanup();
}
}
// for (const table of Table.AllTables) {
// if (table.cellCount < 3) continue
// const str = table.toString()
// console.log(str)
// }
return result;
}
getPathGeometry(mm) {
const width = mm[2] - mm[0];
const height = mm[3] - mm[1];
if (mm[0] === Infinity) {
return PathGeometry.undefined;
}
if (width > 5 && height > 5) {
return PathGeometry.rectangle;
}
else if (width > 5 && height === 0) {
return PathGeometry.hline;
}
else if (width === 0 && height > 5) {
return PathGeometry.vline;
}
return PathGeometry.undefined;
}
async getPageTables(page) {
const lineStore = new LineStore();
const viewport = page.getViewport({ scale: 1 });
let transformMatrix = [1, 0, 0, 1, 0, 0];
const transformStack = [];
const opList = await page.getOperatorList();
for (let i = 0; i < opList.fnArray.length; i++) {
const fn = opList.fnArray[i];
const args = opList.argsArray[i];
const op = args?.[0] ?? 0;
const mm = args?.[2] ?? [Infinity, Infinity, -Infinity, -Infinity];
//const minMax = new Float32Array([Infinity, Infinity, -Infinity, -Infinity]);
if (fn === pdfjs.OPS.constructPath) {
if (op === pdfjs.OPS.fill) {
//debugger;
}
if (op !== pdfjs.OPS.stroke) {
continue;
}
const pg = this.getPathGeometry(mm);
if (pg === PathGeometry.rectangle) {
const rect = new Rectangle(new Point(mm[0], mm[1]), mm[2] - mm[0], mm[3] - mm[1]);
rect.transform(transformMatrix);
rect.transform(viewport.transform);
lineStore.addRectangle(rect);
}
else if (pg === PathGeometry.hline || pg === PathGeometry.vline) {
const from = new Point(mm[0], mm[1]);
const to = new Point(mm[2], mm[3]);
const line = new Line(from, to);
line.transform(transformMatrix);
line.transform(viewport.transform);
lineStore.add(line);
}
else {
//debugger;
}
// if (op === pdfjs.OPS.rectangle) {
// debugger;
// } else if (op === pdfjs.OPS.moveTo) {
// debugger;
// } else if (op === pdfjs.OPS.lineTo) {
// debugger;
// } else if (op === pdfjs.OPS.endPath) {
// const combinedMatrix = pdfjs.Util.transform(viewport.transform, transformMatrix);
// // while (args[1].length) {
// // const drawOp = args[1].shift();
// // debugger;
// // }
// } else {
// //debugger;
// }
}
else if (fn === pdfjs.OPS.setLineWidth) {
//debugger;
}
else if (fn === pdfjs.OPS.save) {
transformStack.push(transformMatrix);
}
else if (fn === pdfjs.OPS.restore) {
const restoredMatrix = transformStack.pop();
if (restoredMatrix) {
transformMatrix = restoredMatrix;
}
}
else if (fn === pdfjs.OPS.transform) {
//transformMatrix = this.transform_fn(transformMatrix, args);
transformMatrix = pdfjs.Util.transform(transformMatrix, args);
}
}
return lineStore;
}
// private async getPageGeometry(page: PDFPageProxy): Promise<LineStore> {
// const lineStore: LineStore = new LineStore();
// const opList = await page.getOperatorList();
// const viewport = page.getViewport({ scale: 1 });
// let transformMatrix = [1, 0, 0, 1, 0, 0];
// const transformStack: Array<Array<number>> = [];
// let current_x: number = 0;
// let current_y: number = 0;
// for (let j = 0; j < opList.fnArray.length; j++) {
// const fn = opList.fnArray[j];
// const args = opList.argsArray[j];
// if (fn === pdfjs.OPS.constructPath) {
// while (args[0].length) {
// const op = args[0].shift();
// const combinedMatrix = pdfjs.Util.transform(viewport.transform, transformMatrix);
// if (op === pdfjs.OPS.rectangle) {
// const x = args[1].shift();
// const y = args[1].shift();
// const width = args[1].shift();
// const height = args[1].shift();
// if (Math.min(width, height) <= 2) {
// // TODO remove
// debugger;
// }
// const rect = new Rectangle(new Point(x, y), width, height);
// rect.transform(combinedMatrix);
// //rect.transform(viewport.transform);
// lineStore.addRectangle(rect);
// } else if (op === pdfjs.OPS.moveTo) {
// current_x = args[1].shift();
// current_y = args[1].shift();
// } else if (op === pdfjs.OPS.lineTo) {
// const x = args[1].shift();
// const y = args[1].shift();
// //default trasform
// const from = new Point(current_x, current_y);
// const to = new Point(x, y);
// const line = new Line(from, to);
// line.transform(combinedMatrix);
// //line.transform(viewport.transform);
// // // viewport transform
// // const _from = viewport.convertToViewportPoint(line.from.x, line.from.y)
// // const _to = viewport.convertToViewportPoint(line.to.x, line.to.y)
// //
// // const transformedLine = new Line(new Point(_from[0], _from[1]), new Point(_to[0], _to[1]))
// lineStore.add(line);
// current_x = x;
// current_y = y;
// }
// }
// } else if (fn === pdfjs.OPS.save) {
// transformStack.push(transformMatrix);
// } else if (fn === pdfjs.OPS.restore) {
// const restoredMatrix = transformStack.pop();
// if (restoredMatrix) {
// transformMatrix = restoredMatrix;
// }
// } else if (fn === pdfjs.OPS.transform) {
// //transformMatrix = this.transform_fn(transformMatrix, args);
// transformMatrix = pdfjs.Util.transform(transformMatrix, args);
// }
// }
// return lineStore;
// }
async fillPageTables(page, pageTables) {
//const resultTable: Array<Table> = []
const viewport = page.getViewport({ scale: 1 });
// for (let i = 0; i < pageTables.length; i++) {
// const currentTable = pageTables[i]
// }
//pageTables = pageTables.filter((table) => table.cellCount > 3)
const textContent = await page.getTextContent({
includeMarkedContent: false,
disableNormalization: false,
});
for (const textItem of textContent.items) {
if (!('str' in textItem))
continue;
const tx = pdfjs.Util.transform(pdfjs.Util.transform(viewport.transform, textItem.transform), [1, 0, 0, -1, 0, 0]);
//const resXY = viewport.convertToViewportPoint(tx[4], tx[5]);
// textItem.transform = pdfjs.Util.transform(viewport.transform, textItem.transform)
// textItem.transform[5] = viewport.height - textItem.transform[5] - textItem.height
for (const pageTable of pageTables) {
const cell = pageTable.findCell(tx[4], tx[5]);
if (cell) {
cell.text.push(textItem.str);
if (textItem.hasEOL) {
cell.text.push('\n');
}
break;
}
}
//Table.tryAddText(pageTables, textItem)
}
}
}
//PDFParse.setWorker();
//# sourceMappingURL=PDFParse.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,127 @@
/**
* @public
* ParseParameters
* Options to control page selection, parsing behavior and output formatting.
*/
export interface ParseParameters {
/**
* Array of page numbers to parse (page numbers start at 1).
* When provided, only these pages are parsed.
* getImage(), getScreenshot() and getTable() visit pages in ascending
* page-number order, independent of the order of this array.
* Example: [1, 3, 5]. Parse only one page: [7].
* Default: `undefined`.
*/
partial?: Array<number>;
/**
* Parse the first N pages (pages 1..N).
* Ignored when `partial` is provided. If both `first` and `last` are set, they define
* an explicit inclusive page range (first..last) and this "first N" semantics is ignored.
* Default: `undefined`.
*/
first?: number;
/**
* Parse the last N pages (pages total-N+1..total).
* Ignored when `partial` is provided. If both `first` and `last` are set, they define
* an explicit inclusive page range (first..last) and this "last N" semantics is ignored.
* Default: `undefined`.
*/
last?: number;
/**
* Collect per-page metadata such as embedded links, title, pageLabel, and dimensions;
* ISBN, DOI, abstract, and references are work in progress when getInfo() is used.
* Default: `false`.
*/
parsePageInfo?: boolean;
/**
* Attempt to detect and include hyperlink annotations (e.g. URLs) associated with text.
* Detected links are formatted as Markdown inline links (for example: [text](https://example.com)).
* Default: `false`.
*/
parseHyperlinks?: boolean;
/**
* Enforce logical line breaks by inserting a newline when the vertical distance
* between text items exceeds `lineThreshold`.
* Useful to preserve paragraph/line structure when text items are emitted as separate segments.
* Default: `true`.
*/
lineEnforce?: boolean;
/**
* Vertical distance above which two consecutive text items are treated as being on different lines.
* Larger values make the parser less likely to start a new line between items
* (and more likely to treat them as cells of the same line).
* Default: `4.6`.
*/
lineThreshold?: number;
/**
* String inserted between text items on the same line when a sufficiently large horizontal gap is detected.
* Typically used to emulate a cell/column separator (for example, "\\t" for tabs).
* An empty string disables the separator.
* Default: `'\t'`.
*/
cellSeparator?: string;
/**
* Horizontal distance threshold to decide when two text items on the same baseline should be treated as separate cells.
* Larger value produces fewer (wider) cells; smaller value creates more cell breaks.
* Default: `7`.
*/
cellThreshold?: number;
/**
* Optional string appended at the end of each page's extracted text to mark page boundaries.
* Supports placeholders `page_number` and `total_number` which are substituted accordingly.
* If set to an empty string, no page boundary marker is added.
* Default: `'\n-- page_number of total_number --'`.
*/
pageJoiner?: string;
/**
* Optional string used to join text items when returning a page's text.
* If provided, this value is used instead of the default empty-string joining behavior.
* Default: `undefined`.
*/
itemJoiner?: string;
/**
* Minimum image dimension (in pixels) for width or height.
* When set, images whose width OR height is less than or equal to this value are ignored by `getImage()`.
* Useful for excluding tiny decorative or tracking images.
* Default: `80`.
* Disable: `0`.
*/
imageThreshold?: number;
/**
* Screenshot scale factor: use 1 for the original size, 1.5 for a 50% larger image, etc.
* Default: `1`.
*/
scale?: number;
/**
* Desired screenshot width in pixels.
* When set, the scale option is ignored and the scale is derived from this width.
* Default: `undefined`.
*/
desiredWidth?: number;
/**
* Applies to both getImage() and getScreenshot(): include the image as a base64 data URL string.
* Default: `true`.
*/
imageDataUrl?: boolean;
/**
* Applies to both getImage() and getScreenshot(): include the image as a binary buffer.
* Default: `true`.
*/
imageBuffer?: boolean;
/**
* Include marked content items in the items array of TextContent to capture PDF "marked content".
* Enables tags (MCID, role/props) and structural/accessibility information useful for mapping text ↔ structure.
* For plain text extraction it's usually false (trade-off: larger output).
* Default: `false`.
*/
includeMarkedContent?: boolean;
/**
* When true, text normalization is NOT performed in the worker thread.
* For plain text extraction, normalizing in the worker (false) is usually recommended.
* Default: `false`.
*/
disableNormalization?: boolean;
}
/**
* @public
* SafeParseParameters
* ParseParameters in which `lineThreshold`, `cellThreshold` and `scale` are guaranteed to be present.
*/
export type SafeParseParameters = Required<Pick<ParseParameters, 'lineThreshold' | 'cellThreshold' | 'scale'>> & ParseParameters;
/**
* Fill in the default value of every option that is missing.
* The defaults are written into `params` itself and the same object is returned.
*/
export declare function setDefaultParseParameters(params: ParseParameters): SafeParseParameters;
//# sourceMappingURL=ParseParameters.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"ParseParameters.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/ParseParameters.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,WAAW,eAAe;IAC/B;;;;;OAKG;IACH,OAAO,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAExB;;;;;OAKG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;;;OAKG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IAEd;;;;OAIG;IACH,aAAa,CAAC,EAAE,OAAO,CAAC;IAExB;;;;OAIG;IACH,eAAe,CAAC,EAAE,OAAO,CAAC;IAE1B;;;;;OAKG;IACH,WAAW,CAAC,EAAE,OAAO,CAAC;IAEtB;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB;;;;;OAKG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IAEpB;;;;OAIG;IACH,UAAU,CAAC,EAAE,MAAM,CAAC;IAEpB;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC;IAExB;;;OAGG;IACH,KAAK,CAAC,EAAE,MAAM,CAAC;IAEf;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB;;;OAGG;IACH,YAAY,CAAC,EAAE,OAAO,CAAC;IAEvB;;;OAGG;IACH,WAAW,CAAC,EAAE,OAAO,CAAC;IAEtB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,OAAO,CAAC;IAE/B;;;;OAIG;IACH,oBAAoB,CAAC,EAAE,OAAO,CAAC;CAC/B;AAED;;;GAGG;AACH,MAAM,MAAM,mBAAmB,GAAG,QAAQ,CAAC,IAAI,CAAC,eAAe,EAAE,eAAe,GAAG,eAAe,GAAG,OAAO,CAAC,CAAC,GAC7G,eAAe,CAAC;AAEjB,wBAAgB,yBAAyB,CAAC,MAAM,EAAE,eAAe,GAAG,mBAAmB,CAatF"}

View File

@@ -0,0 +1,13 @@
export function setDefaultParseParameters(params) {
params.lineThreshold = params?.lineThreshold ?? 4.6;
params.cellThreshold = params?.cellThreshold ?? 7;
params.cellSeparator = params?.cellSeparator ?? '\t';
params.lineEnforce = params?.lineEnforce ?? true;
params.pageJoiner = params?.pageJoiner ?? '\n-- page_number of total_number --';
params.imageThreshold = params?.imageThreshold ?? 80;
params.imageDataUrl = params?.imageDataUrl ?? true;
params.imageBuffer = params?.imageBuffer ?? true;
params.scale = params?.scale ?? 1;
return params;
}
//# sourceMappingURL=ParseParameters.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"ParseParameters.js","sourceRoot":"","sources":["../../../src/pdf-parse/ParseParameters.ts"],"names":[],"mappings":"AAiJA,MAAM,UAAU,yBAAyB,CAAC,MAAuB;IAChE,MAAM,CAAC,aAAa,GAAG,MAAM,EAAE,aAAa,IAAI,GAAG,CAAC;IACpD,MAAM,CAAC,aAAa,GAAG,MAAM,EAAE,aAAa,IAAI,CAAC,CAAC;IAClD,MAAM,CAAC,aAAa,GAAG,MAAM,EAAE,aAAa,IAAI,IAAI,CAAC;IACrD,MAAM,CAAC,WAAW,GAAG,MAAM,EAAE,WAAW,IAAI,IAAI,CAAC;IACjD,MAAM,CAAC,UAAU,GAAG,MAAM,EAAE,UAAU,IAAI,qCAAqC,CAAC;IAChF,MAAM,CAAC,cAAc,GAAG,MAAM,EAAE,cAAc,IAAI,EAAE,CAAC;IAErD,MAAM,CAAC,YAAY,GAAG,MAAM,EAAE,YAAY,IAAI,IAAI,CAAC;IACnD,MAAM,CAAC,WAAW,GAAG,MAAM,EAAE,WAAW,IAAI,IAAI,CAAC;IACjD,MAAM,CAAC,KAAK,GAAG,MAAM,EAAE,KAAK,IAAI,CAAC,CAAC;IAElC,OAAO,MAA6B,CAAC;AACtC,CAAC"}

View File

@@ -0,0 +1,15 @@
/**
* Four-number tuple.
* NOTE(review): presumably a bounding box `[minX, minY, maxX, maxY]` — that is how
* PDFParse.getPathGeometry reads a 4-element box; confirm against the callers.
*/
export type MinMax = [number, number, number, number];
/** Shape classification of a path: not classifiable, horizontal line, vertical line or rectangle. */
export declare enum PathGeometry {
undefined = 0,
hline = 1,
vline = 2,
rectangle = 3
}
/** Numeric codes for path construction operations. */
export declare enum DrawOPS {
moveTo = 0,
lineTo = 1,
curveTo = 2,
closePath = 3,
rectangle = 4
}
//# sourceMappingURL=PathGeometry.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"PathGeometry.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/PathGeometry.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,MAAM,GAAG,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;AAEtD,oBAAY,YAAY;IACvB,SAAS,IAAI;IACb,KAAK,IAAI;IACT,KAAK,IAAI;IACT,SAAS,IAAI;CACb;AAED,oBAAY,OAAO;IAClB,MAAM,IAAI;IACV,MAAM,IAAI;IACV,OAAO,IAAI;IACX,SAAS,IAAI;IACb,SAAS,IAAI;CACb"}

View File

@@ -0,0 +1,16 @@
export var PathGeometry;
(function (PathGeometry) {
PathGeometry[PathGeometry["undefined"] = 0] = "undefined";
PathGeometry[PathGeometry["hline"] = 1] = "hline";
PathGeometry[PathGeometry["vline"] = 2] = "vline";
PathGeometry[PathGeometry["rectangle"] = 3] = "rectangle";
})(PathGeometry || (PathGeometry = {}));
export var DrawOPS;
(function (DrawOPS) {
DrawOPS[DrawOPS["moveTo"] = 0] = "moveTo";
DrawOPS[DrawOPS["lineTo"] = 1] = "lineTo";
DrawOPS[DrawOPS["curveTo"] = 2] = "curveTo";
DrawOPS[DrawOPS["closePath"] = 3] = "closePath";
DrawOPS[DrawOPS["rectangle"] = 4] = "rectangle";
})(DrawOPS || (DrawOPS = {}));
//# sourceMappingURL=PathGeometry.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"PathGeometry.js","sourceRoot":"","sources":["../../../src/pdf-parse/PathGeometry.ts"],"names":[],"mappings":"AAEA,MAAM,CAAN,IAAY,YAKX;AALD,WAAY,YAAY;IACvB,yDAAa,CAAA;IACb,iDAAS,CAAA;IACT,iDAAS,CAAA;IACT,yDAAa,CAAA;AACd,CAAC,EALW,YAAY,KAAZ,YAAY,QAKvB;AAED,MAAM,CAAN,IAAY,OAMX;AAND,WAAY,OAAO;IAClB,yCAAU,CAAA;IACV,yCAAU,CAAA;IACV,2CAAW,CAAA;IACX,+CAAa,CAAA;IACb,+CAAa,CAAA;AACd,CAAC,EANW,OAAO,KAAP,OAAO,QAMlB"}

View File

@@ -0,0 +1,22 @@
/**
* @public
* Screenshot
* One rendered page as produced by getScreenshot().
*/
export interface Screenshot {
/** Image bytes; an empty array when no binary output was produced. */
data: Uint8Array;
/** Image as a `data:image/png` URL; an empty string when not requested. */
dataUrl: string;
/** Number of the rendered page (pages are counted from 1). */
pageNumber: number;
/** Width of the viewport the page was rendered with. */
width: number;
/** Height of the viewport the page was rendered with. */
height: number;
/** Scale of the viewport the page was rendered with. */
scale: number;
}
/**
* @public
* ScreenshotResult
* Result of getScreenshot(): the rendered pages plus the document's page count.
*/
export declare class ScreenshotResult {
/** Rendered pages, in the order they were rendered. */
pages: Array<Screenshot>;
/** Total number of pages in the document (not the number of rendered pages). */
total: number;
/** @param total - total number of pages in the document. */
constructor(total: number);
}
//# sourceMappingURL=ScreenshotResult.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"ScreenshotResult.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/ScreenshotResult.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,MAAM,WAAW,UAAU;IAE1B,IAAI,EAAE,UAAU,CAAC;IAGjB,OAAO,EAAE,MAAM,CAAC;IAEhB,UAAU,EAAE,MAAM,CAAC;IAEnB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACd;AAED;;;GAGG;AACH,qBAAa,gBAAgB;IAC5B,KAAK,EAAE,KAAK,CAAC,UAAU,CAAC,CAAM;IAC9B,KAAK,EAAE,MAAM,CAAK;gBAEN,KAAK,EAAE,MAAM;CAGzB"}

View File

@@ -0,0 +1,12 @@
/**
* @public
* ScreenshotResult
* Container for rendered pages. getScreenshot() creates it with the document's
* page count and appends one entry per rendered page.
*/
export class ScreenshotResult {
// Rendered pages, appended in rendering order.
pages = [];
// Total page count; replaced by the constructor argument.
total = 0;
constructor(total) {
this.total = total;
}
}
//# sourceMappingURL=ScreenshotResult.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"ScreenshotResult.js","sourceRoot":"","sources":["../../../src/pdf-parse/ScreenshotResult.ts"],"names":[],"mappings":"AAkBA;;;GAGG;AACH,MAAM,OAAO,gBAAgB;IAC5B,KAAK,GAAsB,EAAE,CAAC;IAC9B,KAAK,GAAW,CAAC,CAAC;IAElB,YAAY,KAAa;QACxB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACpB,CAAC;CACD"}

View File

@@ -0,0 +1,20 @@
/** A table as a two-dimensional array of strings. */
export type TableArray = Array<Array<string>>;
/**
* @public
* PageTableResult
* Tables found on one page.
*/
export interface PageTableResult {
/** Page number (pages are counted from 1). */
num: number;
/** Tables detected on the page. */
tables: TableArray[];
}
/**
* @public
* TableResult
* Result of getTable(): per-page tables plus the document's page count.
*/
export declare class TableResult {
/** One entry per analysed page, in the order the pages were processed. */
pages: Array<PageTableResult>;
/** NOTE(review): not filled by the getTable() code in this build — confirm intended use. */
mergedTables: TableArray[];
/** Total number of pages in the document. */
total: number;
/** @param total - total number of pages in the document. */
constructor(total: number);
}
//# sourceMappingURL=TableResult.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TableResult.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/TableResult.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC;AAE9C;;;GAGG;AACH,MAAM,WAAW,eAAe;IAC/B,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,UAAU,EAAE,CAAC;CACrB;AAED;;;GAGG;AACH,qBAAa,WAAW;IACvB,KAAK,EAAE,KAAK,CAAC,eAAe,CAAC,CAAM;IACnC,YAAY,EAAE,UAAU,EAAE,CAAM;IAChC,KAAK,EAAE,MAAM,CAAK;gBAEN,KAAK,EAAE,MAAM;CAGzB"}

View File

@@ -0,0 +1,13 @@
/**
* @public
* TableResult
* Container for detected tables. getTable() creates it with the document's
* page count and appends one entry per analysed page.
*/
export class TableResult {
// Per-page results, appended in processing order.
pages = [];
// Starts empty; the class itself never adds to it.
mergedTables = [];
// Total page count; replaced by the constructor argument.
total = 0;
constructor(total) {
this.total = total;
}
}
//# sourceMappingURL=TableResult.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TableResult.js","sourceRoot":"","sources":["../../../src/pdf-parse/TableResult.ts"],"names":[],"mappings":"AAWA;;;GAGG;AACH,MAAM,OAAO,WAAW;IACvB,KAAK,GAA2B,EAAE,CAAC;IACnC,YAAY,GAAiB,EAAE,CAAC;IAChC,KAAK,GAAW,CAAC,CAAC;IAElB,YAAY,KAAa;QACxB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACpB,CAAC;CACD"}

View File

@@ -0,0 +1,35 @@
/**
* @public
* HyperlinkPosition
* Location and target of one link annotation on a page.
*/
export type HyperlinkPosition = {
/** Bounding box of the link (getHyperlinks() stores it in viewport coordinates, widened by 0.5 per side). */
rect: {
left: number;
top: number;
right: number;
bottom: number;
};
/** Link target. */
url: string;
/** Text the annotation overlays. */
text: string;
/** Created as `false` by getHyperlinks(). */
used: boolean;
};
/**
* @public
* PageTextResult
* Extracted text of one page.
*/
export interface PageTextResult {
/** Page number. */
num: number;
/** Text of the page. */
text: string;
}
/**
* @public
* TextResult
* Extracted text of a document, per page and as a whole.
*/
export declare class TextResult {
/** Per-page text entries. */
pages: Array<PageTextResult>;
/** Text of the whole result; starts as an empty string. */
text: string;
/** Total number of pages in the document. */
total: number;
/**
* Look up the text of a page.
* @param num - page number to look for.
* @returns text of the first entry whose `num` matches, or `''` when there is none.
*/
getPageText(num: number): string;
/** @param total - total number of pages in the document. */
constructor(total: number);
}
//# sourceMappingURL=TextResult.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TextResult.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/TextResult.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,MAAM,MAAM,iBAAiB,GAAG;IAC/B,IAAI,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC;IACnE,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,OAAO,CAAC;CACd,CAAC;AAEF;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;CACb;AAED;;;GAGG;AACH,qBAAa,UAAU;IACtB,KAAK,EAAE,KAAK,CAAC,cAAc,CAAC,CAAM;IAClC,IAAI,EAAE,MAAM,CAAM;IAClB,KAAK,EAAE,MAAM,CAAK;IAEX,WAAW,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM;gBAO3B,KAAK,EAAE,MAAM;CAGzB"}

View File

@@ -0,0 +1,20 @@
/**
 * @public
 * TextResult
 *
 * Holder for text results: per-entry texts in `pages`, a combined `text`
 * string and a `total` supplied at construction time.
 */
export class TextResult {
    /** Entries of the form `{ num, text }`; empty until a caller fills it. */
    pages = [];
    /** Combined text; starts empty. */
    text = '';
    /** Overwritten by the constructor argument. */
    total = 0;

    /**
     * Looks up the text stored for `num`.
     *
     * @param num key compared with `===` against each entry's `num`
     * @returns the `text` of the first matching entry, or '' when there is none
     */
    getPageText(num) {
        const match = this.pages.find((entry) => entry.num === num);
        return match === undefined ? '' : match.text;
    }

    /**
     * @param total value stored on the instance as `total`
     */
    constructor(total) {
        this.total = total;
    }
}
//# sourceMappingURL=TextResult.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TextResult.js","sourceRoot":"","sources":["../../../src/pdf-parse/TextResult.ts"],"names":[],"mappings":"AAoBA;;;GAGG;AACH,MAAM,OAAO,UAAU;IACtB,KAAK,GAA0B,EAAE,CAAC;IAClC,IAAI,GAAW,EAAE,CAAC;IAClB,KAAK,GAAW,CAAC,CAAC;IAEX,WAAW,CAAC,GAAW;QAC7B,KAAK,MAAM,QAAQ,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACnC,IAAI,QAAQ,CAAC,GAAG,KAAK,GAAG;gBAAE,OAAO,QAAQ,CAAC,IAAI,CAAC;QAChD,CAAC;QACD,OAAO,EAAE,CAAC;IACX,CAAC;IAED,YAAY,KAAa;QACxB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;IACpB,CAAC;CACD"}

View File

@@ -0,0 +1,26 @@
import { Point } from './Point.js';
import { Shape } from './Shape.js';
/**
* Orientation of a Line. `None` is the default for a new Line and stays in
* place when the segment is neither horizontal nor vertical within
* `Shape.tolerance`.
*/
export declare enum LineDirection {
None = 0,
Horizontal = 1,
Vertical = 2
}
/**
* Line segment between two points that is classified as horizontal or
* vertical when its end points agree on one axis within `Shape.tolerance`.
*/
export declare class Line extends Shape {
/** Start point; for a classified line this is the end with the smaller x (horizontal) or y (vertical). */
from: Point;
/** End point; for a classified line it is snapped onto the same y (horizontal) or x (vertical) as `from`. */
to: Point;
/** Orientation determined by the private `init` step. */
direction: LineDirection;
/** Extent along the line's axis; stays 0 for an unclassified line. */
length: number;
/** Distinct intersection points recorded via `addIntersectionPoint`. */
intersections: Array<Point>;
/** Gap segments recorded via `addGap`. */
gaps: Array<Line>;
/** Stores both points and classifies the segment. */
constructor(from: Point, to: Point);
/** Classifies the segment, snaps `to` onto the axis, orders `from`/`to` ascending and sets `length`. */
private init;
/** Memoised result of the `valid` getter. */
private _valid;
/** True when the line has a direction other than None and is longer than `Shape.tolerance`; computed once and cached. */
get valid(): boolean;
/** A new Line extended by `Shape.tolerance` at both ends along its axis; returns `this` for an unclassified line. */
get normalized(): Line;
/** Appends a gap segment to `gaps`. */
addGap(line: Line): void;
/** Whether `p` lies on this line: exact match on the fixed coordinate, inclusive range on the other; false for unclassified lines. */
containsPoint(p: Point): boolean;
/** Records `point` in `intersections` unless an equal point is already present. */
addIntersectionPoint(point: Point): void;
/** Intersection of two valid, perpendicular lines (tested on their normalized forms); records the point on both lines and returns it, otherwise returns undefined. */
intersection(line: Line): Point | undefined;
/** Transforms both end points with `matrix`, replaces them by the min/max corners of the result and re-classifies; returns this. */
transform(matrix: Array<number>): this;
}
//# sourceMappingURL=Line.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Line.d.ts","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/Line.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAEnC,oBAAY,aAAa;IACxB,IAAI,IAAI;IACR,UAAU,IAAI;IACd,QAAQ,IAAA;CACR;AAED,qBAAa,IAAK,SAAQ,KAAK;IACvB,IAAI,EAAE,KAAK,CAAC;IACZ,EAAE,EAAE,KAAK,CAAC;IACV,SAAS,EAAE,aAAa,CAAsB;IAC9C,MAAM,EAAE,MAAM,CAAK;IACnB,aAAa,EAAE,KAAK,CAAC,KAAK,CAAC,CAAM;IACjC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAM;gBAElB,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK;IAOlC,OAAO,CAAC,IAAI;IA4BZ,OAAO,CAAC,MAAM,CAAkC;IAEhD,IAAI,KAAK,IAAI,OAAO,CAKnB;IAED,IAAI,UAAU,IAAI,IAAI,CAarB;IAEM,MAAM,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI;IAIxB,aAAa,CAAC,CAAC,EAAE,KAAK,GAAG,OAAO;IAoBhC,oBAAoB,CAAC,KAAK,EAAE,KAAK,GAAG,IAAI;IAOxC,YAAY,CAAC,IAAI,EAAE,IAAI,GAAG,KAAK,GAAG,SAAS;IAiD3C,SAAS,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,GAAG,IAAI;CAe7C"}

View File

@@ -0,0 +1,146 @@
import { Point } from './Point.js';
import { Shape } from './Shape.js';
// Runtime form of the LineDirection enum: each name maps to its number and
// each number maps back to its name (None = 0, Horizontal = 1, Vertical = 2).
export var LineDirection;
(function (directions) {
    // The array index is the enum value; registering name -> value first and
    // value -> name second keeps the same key insertion order as before.
    ["None", "Horizontal", "Vertical"].forEach((label, value) => {
        directions[label] = value;
        directions[value] = label;
    });
})(LineDirection || (LineDirection = {}));
/**
 * Line segment between two points. A segment whose end points agree on one
 * axis within `Shape.tolerance` is classified as horizontal or vertical,
 * snapped onto that axis and ordered so that `from` holds the smaller
 * coordinate. Anything else keeps `LineDirection.None` and is not `valid`.
 */
export class Line extends Shape {
    from;
    to;
    direction = LineDirection.None;
    length = 0;
    intersections = [];
    gaps = [];

    /**
     * @param from one end point (stored, not copied)
     * @param to the other end point (stored, not copied; `init` may snap one of
     *           its coordinates onto `from`)
     */
    constructor(from, to) {
        super();
        this.from = from;
        this.to = to;
        this.init();
    }

    /**
     * (Re)derives `direction`, `length` and the `from`/`to` ordering from the
     * current end points. Called by the constructor and again by `transform`.
     */
    init() {
        // Start from a clean slate: when init() runs again after transform(),
        // a segment that is no longer axis-aligned must not keep the previous
        // direction/length, and the memoised `valid` answer must be recomputed.
        this.direction = LineDirection.None;
        this.length = 0;
        this._valid = undefined;
        let from = this.from;
        let to = this.to;
        if (Math.abs(from.y - to.y) < Shape.tolerance) {
            this.direction = LineDirection.Horizontal;
            // Snap the end point onto the start point's y.
            to.y = from.y;
            if (from.x > to.x) {
                const temp = from;
                from = to;
                to = temp;
            }
            this.length = to.x - from.x;
        }
        else if (Math.abs(from.x - to.x) < Shape.tolerance) {
            this.direction = LineDirection.Vertical;
            // Snap the end point onto the start point's x.
            to.x = from.x;
            if (from.y > to.y) {
                const temp = from;
                from = to;
                to = temp;
            }
            this.length = to.y - from.y;
        }
        this.from = from;
        this.to = to;
    }

    // Memoised result of the `valid` getter; cleared by init().
    _valid = undefined;

    /**
     * True when the line is horizontal or vertical and longer than
     * `Shape.tolerance`. Computed lazily and cached until the next `init`.
     */
    get valid() {
        if (this._valid === undefined) {
            this._valid = this.direction !== LineDirection.None && this.length > Shape.tolerance;
        }
        return this._valid;
    }

    /**
     * A new Line extended by `Shape.tolerance` at both ends along its axis.
     * Returns `this` unchanged for an unclassified line.
     */
    get normalized() {
        if (this.direction === LineDirection.Horizontal) {
            return new Line(new Point(this.from.x - Shape.tolerance, this.from.y), new Point(this.to.x + Shape.tolerance, this.from.y));
        }
        else if (this.direction === LineDirection.Vertical) {
            return new Line(new Point(this.from.x, this.from.y - Shape.tolerance), new Point(this.from.x, this.to.y + Shape.tolerance));
        }
        return this;
    }

    /** Appends a gap segment to `gaps`. */
    addGap(line) {
        this.gaps.push(line);
    }

    /**
     * Whether `p` lies on this line: the fixed coordinate must match exactly
     * and the other coordinate must fall within [from, to] inclusive.
     * Always false for an unclassified line.
     */
    containsPoint(p) {
        if (this.direction === LineDirection.Vertical) {
            return this.from.x === p.x && p.y >= this.from.y && p.y <= this.to.y;
        }
        else if (this.direction === LineDirection.Horizontal) {
            return this.from.y === p.y && p.x >= this.from.x && p.x <= this.to.x;
        }
        return false;
    }

    // TODO: containsLine(l) is not implemented yet.

    /** Records `point` in `intersections` unless an equal point is already there. */
    addIntersectionPoint(point) {
        for (const intPoint of this.intersections) {
            if (intPoint.equal(point))
                return;
        }
        this.intersections.push(point);
    }

    /**
     * Computes the crossing point of this line and `line`.
     *
     * Both lines must be valid and perpendicular (one horizontal, one
     * vertical). The test runs on the tolerance-extended (`normalized`) forms
     * with strict inequalities, so lines that merely come within the tolerance
     * of touching still intersect. On success the point is recorded on both
     * lines.
     *
     * @param line the other line
     * @returns the intersection point, or undefined when there is none
     */
    intersection(line) {
        let result;
        if (!this.valid || !line.valid) {
            return result;
        }
        const thisNormalized = this.normalized;
        const lineNormalized = line.normalized;
        if (this.direction === LineDirection.Horizontal && line.direction === LineDirection.Vertical) {
            const x = lineNormalized.from.x;
            const y = thisNormalized.from.y;
            const isOk = x > thisNormalized.from.x && x < thisNormalized.to.x && y > lineNormalized.from.y && y < lineNormalized.to.y;
            if (isOk) {
                const intPoint = new Point(x, y);
                this.addIntersectionPoint(intPoint);
                line.addIntersectionPoint(intPoint);
                result = intPoint;
            }
        }
        else if (this.direction === LineDirection.Vertical && line.direction === LineDirection.Horizontal) {
            const x = thisNormalized.from.x;
            const y = lineNormalized.from.y;
            const isOk = x > lineNormalized.from.x && x < lineNormalized.to.x && y > thisNormalized.from.y && y < thisNormalized.to.y;
            if (isOk) {
                const intPoint = new Point(x, y);
                this.addIntersectionPoint(intPoint);
                line.addIntersectionPoint(intPoint);
                result = intPoint;
            }
        }
        // Gaps are intentionally not consulted here: an intersection that falls
        // inside a recorded gap is still reported.
        return result;
    }

    /**
     * Applies `matrix` to both end points, replaces them with the min/max
     * corners of the transformed pair and re-classifies the line.
     *
     * @param matrix transform forwarded to `Point.transform`
     * @returns this
     */
    transform(matrix) {
        const p1 = this.from.transform(matrix);
        const p2 = this.to.transform(matrix);
        const x = Math.min(p1.x, p2.x);
        const y = Math.min(p1.y, p2.y);
        const width = Math.abs(p1.x - p2.x);
        const height = Math.abs(p1.y - p2.y);
        this.from = new Point(x, y);
        this.to = new Point(x + width, y + height);
        this.init();
        return this;
    }
}
//# sourceMappingURL=Line.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,20 @@
import { Line } from './Line.js';
import type { Rectangle } from './Rectangle.js';
import { Table } from './Table.js';
import type { TableData } from './TableData.js';
/**
* Collects horizontal and vertical lines and groups intersecting ones into
* Table objects.
*/
export declare class LineStore {
/** Stored horizontal lines. Drained by `getTables()`. */
hLines: Array<Line>;
/** Stored vertical lines. Drained by `getTables()`. */
vLines: Array<Line>;
/** Stores a valid line by its direction; invalid lines are ignored. */
add(line: Line): void;
/** Adds every line returned by `rect.getLines()`. */
addRectangle(rect: Rectangle): void;
/** Runs `getTables()` and converts each table with `toData()`, keeping truthy results. */
getTableData(): Array<TableData>;
/** Groups the stored lines into tables, keeps those whose `isValid` is true and normalizes them. Empties `hLines` and `vLines`. */
getTables(): Array<Table>;
/** Runs `normalizeHorizontal()` then `normalizeVertical()`. */
normalize(): void;
/** Merges horizontal lines whose y values lie within `Shape.tolerance` of the first line of their group. */
normalizeHorizontal(): void;
/** Merges vertical lines whose x values lie within `Shape.tolerance` of the first line of their group. */
normalizeVertical(): void;
/** Offers every stored line to a table once and keeps the ones it did not accept. */
private fillTable;
/** Tries to attach a line to the first existing table that accepts it. */
private tryFill;
/** Merges horizontal lines of one group into non-touching segments ("marge" is the original spelling). */
private margeHorizontalLines;
/** Merges vertical lines of one group into non-touching segments ("marge" is the original spelling). */
private margeVerticalLines;
}
//# sourceMappingURL=LineStore.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"LineStore.d.ts","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/LineStore.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAiB,MAAM,WAAW,CAAC;AAEhD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAEhD,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAEhD,qBAAa,SAAS;IACd,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,CAAM;IACzB,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,CAAM;IAEzB,GAAG,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI;IAUrB,YAAY,CAAC,IAAI,EAAE,SAAS,GAAG,IAAI;IAMnC,YAAY,IAAI,KAAK,CAAC,SAAS,CAAC;IAehC,SAAS,IAAI,KAAK,CAAC,KAAK,CAAC;IAoCzB,SAAS,IAAI,IAAI;IAKjB,mBAAmB;IA0BnB,iBAAiB;IA0BxB,OAAO,CAAC,SAAS;IAoBjB,OAAO,CAAC,OAAO;IAUf,OAAO,CAAC,oBAAoB;IAwC5B,OAAO,CAAC,kBAAkB;CAuC1B"}

View File

@@ -0,0 +1,212 @@
import { Line, LineDirection } from './Line.js';
import { Point } from './Point.js';
import { Shape } from './Shape.js';
import { Table } from './Table.js';
/**
 * Collects horizontal and vertical lines and groups intersecting ones into
 * Table objects.
 */
export class LineStore {
hLines = [];
vLines = [];
/** Stores a valid line in hLines or vLines according to its direction; anything else is ignored. */
add(line) {
if (line.valid) {
if (line.direction === LineDirection.Horizontal) {
this.hLines.push(line);
}
else if (line.direction === LineDirection.Vertical) {
this.vLines.push(line);
}
}
}
/** Adds every line returned by rect.getLines(). */
addRectangle(rect) {
for (const line of rect.getLines()) {
this.add(line);
}
}
/** Builds the tables (see getTables) and returns the truthy toData() result of each. */
getTableData() {
const result = [];
const tables = this.getTables();
for (const table of tables) {
const data = table.toData();
if (data) {
result.push(data);
}
}
return result;
}
/**
 * Groups the stored lines into tables. Each line is first offered to the
 * tables found so far; if none accepts it, it seeds a new table, which then
 * pulls in the remaining lines it intersects. Only tables whose isValid is
 * true are normalized and returned.
 * Side effect: hLines and vLines are emptied.
 */
getTables() {
const result = [];
// fillTable() reassigns this.hLines / this.vLines, so the loop condition is re-read on every pass.
while (this.hLines.length !== 0) {
const hLine = this.hLines.shift();
if (!hLine)
continue;
const filled = this.tryFill(result, hLine);
if (filled)
continue;
const table = new Table(hLine);
this.fillTable(table);
result.push(table);
}
// Vertical lines that no table picked up during the horizontal pass.
while (this.vLines.length !== 0) {
const vLine = this.vLines.shift();
if (!vLine)
continue;
const filled = this.tryFill(result, vLine);
if (filled)
continue;
const table = new Table(vLine);
this.fillTable(table);
result.push(table);
}
const validTables = result.filter((t) => t.isValid);
for (const table of validTables) {
table.normalize();
}
return validTables;
}
/** Merges near-duplicate and touching lines, horizontal first, then vertical. */
normalize() {
this.normalizeHorizontal();
this.normalizeVertical();
}
/**
 * Sorts hLines by y, groups consecutive lines whose y is within
 * Shape.tolerance of the first line of the group, and replaces each group
 * by its merged segments.
 */
normalizeHorizontal() {
this.hLines.sort((l1, l2) => l1.from.y - l2.from.y);
const newLines = [];
let sameY = [];
for (const line of this.hLines) {
if (sameY.length === 0) {
sameY.push(line);
}
else if (Math.abs(sameY[0]?.from.y - line.from.y) < Shape.tolerance) {
sameY.push(line);
}
else {
const merged = this.margeHorizontalLines(sameY);
newLines.push(...merged);
sameY = [line];
}
}
// Flush the last group.
if (sameY.length > 0) {
const merged = this.margeHorizontalLines(sameY);
newLines.push(...merged);
}
this.hLines = newLines;
}
/**
 * Sorts vLines by x, groups consecutive lines whose x is within
 * Shape.tolerance of the first line of the group, and replaces each group
 * by its merged segments.
 */
normalizeVertical() {
this.vLines.sort((l1, l2) => l1.from.x - l2.from.x);
const newLines = [];
let sameX = [];
for (const line of this.vLines) {
if (sameX.length === 0) {
sameX.push(line);
}
else if (Math.abs(sameX[0]?.from.x - line.from.x) < Shape.tolerance) {
sameX.push(line);
}
else {
const merged = this.margeVerticalLines(sameX);
newLines.push(...merged);
sameX = [line];
}
}
// Flush the last group.
if (sameX.length > 0) {
const merged = this.margeVerticalLines(sameX);
newLines.push(...merged);
}
this.vLines = newLines;
}
/**
 * Offers every stored vertical line, then every stored horizontal line, to
 * the table once. Lines the table accepts are removed from the store; the
 * rest are kept. This is a single pass: a line rejected early is not
 * retried here even if a later line would have connected it.
 */
fillTable(table) {
const newVLines = [];
const newHLines = [];
for (const vLine of this.vLines) {
if (!table.add(vLine)) {
newVLines.push(vLine);
}
}
for (const hLine of this.hLines) {
if (!table.add(hLine)) {
newHLines.push(hLine);
}
}
this.hLines = newHLines;
this.vLines = newVLines;
}
/**
 * Adds the line to the first table that accepts it and lets that table pull
 * in further stored lines. Returns whether a table accepted the line.
 */
tryFill(tables, line) {
for (const table of tables) {
if (table.add(line)) {
this.fillTable(table);
return true;
}
}
return false;
}
/**
 * Merges the horizontal lines of one group into segments. Lines are swept
 * in order of their start x; a line whose start is less than
 * Shape.tolerance beyond the current segment end extends that segment,
 * otherwise it starts a new one. All resulting segments use the y of the
 * first line after sorting. Sorts the input array in place.
 */
margeHorizontalLines(sameYLines) {
const result = [];
sameYLines.sort((l1, l2) => l1.from.x - l2.from.x);
const sameY = sameYLines[0]?.from.y;
if (sameY === undefined)
return result;
// Sentinels: with maxX at MIN_SAFE_INTEGER the first line takes the else branch below, which seeds minX/maxX.
let minX = Number.MAX_SAFE_INTEGER;
let maxX = Number.MIN_SAFE_INTEGER;
for (const line of sameYLines) {
if (line.from.x - maxX < Shape.tolerance) {
if (line.from.x < minX) {
minX = line.from.x;
}
if (line.to.x > maxX) {
maxX = line.to.x;
}
}
else {
// Emit the finished segment (skipped while only the sentinels are set) and start a new one.
if (maxX > minX) {
result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
}
minX = line.from.x;
maxX = line.to.x;
}
}
// Flush the segment still being built; it is skipped only if it shares its start or its end with the last emitted one.
const last = result[result.length - 1];
if (last) {
if (last.from.x !== minX && last.to.x !== maxX) {
result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
}
}
else {
result.push(new Line(new Point(minX, sameY), new Point(maxX, sameY)));
}
return result;
}
/**
 * Merges the vertical lines of one group into segments. Lines are swept in
 * order of their start y; a line whose start is less than Shape.tolerance
 * beyond the current segment end extends that segment, otherwise it starts
 * a new one. All resulting segments use the x of the first line after
 * sorting. Sorts the input array in place.
 */
margeVerticalLines(sameXLines) {
const result = [];
sameXLines.sort((l1, l2) => l1.from.y - l2.from.y);
const sameX = sameXLines[0]?.from.x;
if (sameX === undefined)
return result;
// Sentinels: with maxY at MIN_SAFE_INTEGER the first line takes the else branch below, which seeds minY/maxY.
let minY = Number.MAX_SAFE_INTEGER;
let maxY = Number.MIN_SAFE_INTEGER;
for (const line of sameXLines) {
if (line.from.y - maxY < Shape.tolerance) {
if (line.from.y < minY) {
minY = line.from.y;
}
if (line.to.y > maxY) {
maxY = line.to.y;
}
}
else {
// Emit the finished segment (skipped while only the sentinels are set) and start a new one.
if (maxY > minY) {
result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
}
minY = line.from.y;
maxY = line.to.y;
}
}
// Flush the segment still being built; it is skipped only if it shares its start or its end with the last emitted one.
const last = result[result.length - 1];
if (last) {
if (last.from.y !== minY && last.to.y !== maxY) {
result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
}
}
else {
result.push(new Line(new Point(sameX, minY), new Point(sameX, maxY)));
}
return result;
}
}
//# sourceMappingURL=LineStore.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,9 @@
import { Shape } from './Shape.js';
/** A mutable 2D point. */
export declare class Point extends Shape {
/** Horizontal coordinate. */
x: number;
/** Vertical coordinate. */
y: number;
/** Stores both coordinates. */
constructor(x: number, y: number);
/** True when both coordinates are strictly equal (`===`) to those of `point`. */
equal(point: Point): boolean;
/** Replaces the coordinates in place with `Shape.applyTransform([x, y], matrix)` and returns this. */
transform(matrix: Array<number>): this;
}
//# sourceMappingURL=Point.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Point.d.ts","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/Point.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAEnC,qBAAa,KAAM,SAAQ,KAAK;IACxB,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;gBAEL,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM;IAMzB,KAAK,CAAC,KAAK,EAAE,KAAK,GAAG,OAAO;IAI5B,SAAS,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,GAAG,IAAI;CAM7C"}

View File

@@ -0,0 +1,20 @@
import { Shape } from './Shape.js';
/** A mutable 2D point. */
export class Point extends Shape {
    x;
    y;

    /**
     * @param x horizontal coordinate
     * @param y vertical coordinate
     */
    constructor(x, y) {
        super();
        this.x = x;
        this.y = y;
    }

    /**
     * Exact coordinate comparison (no tolerance).
     *
     * @param point point to compare with
     * @returns true when both coordinates are strictly equal
     */
    equal(point) {
        const sameX = point.x === this.x;
        return sameX && point.y === this.y;
    }

    /**
     * Moves this point in place by running its coordinates through
     * `Shape.applyTransform`.
     *
     * @param matrix transform passed on to `Shape.applyTransform`
     * @returns this
     */
    transform(matrix) {
        const [movedX, movedY] = Shape.applyTransform([this.x, this.y], matrix);
        this.x = movedX;
        this.y = movedY;
        return this;
    }
}
//# sourceMappingURL=Point.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Point.js","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/Point.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAEnC,MAAM,OAAO,KAAM,SAAQ,KAAK;IACxB,CAAC,CAAS;IACV,CAAC,CAAS;IAEjB,YAAY,CAAS,EAAE,CAAS;QAC/B,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;QACX,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;IACZ,CAAC;IAEM,KAAK,CAAC,KAAY;QACxB,OAAO,KAAK,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,KAAK,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC;IACjD,CAAC;IAEM,SAAS,CAAC,MAAqB;QACrC,MAAM,CAAC,GAAG,KAAK,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;QACzD,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACd,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACd,OAAO,IAAI,CAAC;IACb,CAAC;CACD"}

View File

@@ -0,0 +1,13 @@
import { Line } from './Line.js';
import { Point } from './Point.js';
import { Shape } from './Shape.js';
/** Axis-aligned rectangle given by one corner and its extent. */
export declare class Rectangle extends Shape {
/** Corner the extent is measured from. */
from: Point;
/** Extent along x. */
width: number;
/** Extent along y. */
height: number;
/** Stores the corner and the extent. */
constructor(from: Point, width: number, height: number);
/** Opposite corner: a new Point at (from.x + width, from.y + height) on every access. */
get to(): Point;
/** The four edges as Line objects, keeping only those whose `valid` is true. */
getLines(): Line[];
/** Transforms the two corners with `matrix` and replaces this rectangle by their bounding box (min corner, absolute width/height); returns this. */
transform(matrix: Array<number>): this;
}
//# sourceMappingURL=Rectangle.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Rectangle.d.ts","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/Rectangle.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAEnC,qBAAa,SAAU,SAAQ,KAAK;IAC5B,IAAI,EAAE,KAAK,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;gBAEV,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;IAOtD,IAAW,EAAE,IAAI,KAAK,CAErB;IAEM,QAAQ,IAAI,IAAI,EAAE;IAYlB,SAAS,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,GAAG,IAAI;CAe7C"}

View File

@@ -0,0 +1,40 @@
import { Line } from './Line.js';
import { Point } from './Point.js';
import { Shape } from './Shape.js';
/** Axis-aligned rectangle given by one corner (`from`) and its extent. */
export class Rectangle extends Shape {
    from;
    width;
    height;

    /**
     * @param from corner the extent is measured from (stored, not copied)
     * @param width extent along x
     * @param height extent along y
     */
    constructor(from, width, height) {
        super();
        this.from = from;
        this.width = width;
        this.height = height;
    }

    /** Opposite corner; a fresh Point is created on every access. */
    get to() {
        const farX = this.from.x + this.width;
        const farY = this.from.y + this.height;
        return new Point(farX, farY);
    }

    /**
     * Builds the four edges of the rectangle and returns the ones that are
     * valid lines.
     *
     * The edges are created one after another, each with its own intermediate
     * corner point, because constructing a Line may snap a coordinate of the
     * end point it is given.
     */
    getLines() {
        const far = this.to;
        const edges = [];
        edges.push(new Line(this.from, new Point(far.x, this.from.y)));
        edges.push(new Line(this.from, new Point(this.from.x, far.y)));
        edges.push(new Line(new Point(far.x, this.from.y), far));
        edges.push(new Line(new Point(this.from.x, far.y), far));
        return edges.filter((edge) => edge.valid);
    }

    /**
     * Transforms both corners and replaces this rectangle with their bounding
     * box, so width and height are never negative afterwards.
     *
     * @param matrix transform passed on to `Shape.applyTransform`
     * @returns this
     */
    transform(matrix) {
        const cornerA = Shape.applyTransform([this.from.x, this.from.y], matrix);
        const cornerB = Shape.applyTransform([this.from.x + this.width, this.from.y + this.height], matrix);
        const [ax, ay] = [cornerA[0], cornerA[1]];
        const [bx, by] = [cornerB[0], cornerB[1]];
        this.from = new Point(Math.min(ax, bx), Math.min(ay, by));
        this.width = Math.abs(ax - bx);
        this.height = Math.abs(ay - by);
        return this;
    }
}
//# sourceMappingURL=Rectangle.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Rectangle.js","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/Rectangle.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAEnC,MAAM,OAAO,SAAU,SAAQ,KAAK;IAC5B,IAAI,CAAQ;IACZ,KAAK,CAAS;IACd,MAAM,CAAS;IAEtB,YAAY,IAAW,EAAE,KAAa,EAAE,MAAc;QACrD,KAAK,EAAE,CAAC;QACR,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC;QACjB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACtB,CAAC;IAED,IAAW,EAAE;QACZ,OAAO,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC;IACvE,CAAC;IAEM,QAAQ;QACd,MAAM,EAAE,GAAG,IAAI,CAAC,EAAE,CAAC;QAEnB,MAAM,KAAK,GAAgB;YAC1B,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,KAAK,CAAC,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACjD,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;YACjD,IAAI,IAAI,CAAC,IAAI,KAAK,CAAC,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;YAC1C,IAAI,IAAI,CAAC,IAAI,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;SAC1C,CAAC;QACF,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;IACrC,CAAC;IAEM,SAAS,CAAC,MAAqB;QACrC,MAAM,EAAE,GAAG,KAAK,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;QACpE,MAAM,EAAE,GAAG,KAAK,CAAC,cAAc,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC,CAAC;QAE/F,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QACjC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAEjC,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QACtC,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAEvC,IAAI,CAAC,IAAI,GAAG,IAAI,KAAK,CAA
C,CAAC,EAAE,CAAC,CAAC,CAAC;QAC5B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;QACrB,OAAO,IAAI,CAAC;IACb,CAAC;CACD"}

View File

@@ -0,0 +1,6 @@
/** Base class of the geometry types; provides the shared tolerance and the point transform helper. */
export declare abstract class Shape {
/** Distance below which coordinates are treated as equal by the geometry classes (initialised to 2). */
static tolerance: number;
/** Applies `matrix` to the shape in place and returns the shape. */
abstract transform(matrix: Array<number>): this;
/** Maps point `p = [x, y]` through `m = [m0..m5]`: returns `[x*m0 + y*m2 + m4, x*m1 + y*m3 + m5]`. */
static applyTransform(p: Array<number>, m: Array<number>): Array<number>;
}
//# sourceMappingURL=Shape.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Shape.d.ts","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/Shape.ts"],"names":[],"mappings":"AAAA,8BAAsB,KAAK;IAC1B,MAAM,CAAC,SAAS,SAAK;aACL,SAAS,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,CAAC,GAAG,IAAI;IAEtD,MAAM,CAAC,cAAc,CAAC,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC,MAAM,CAAC;CAKxE"}

View File

@@ -0,0 +1,9 @@
/** Base class of the geometry types. */
export class Shape {
    /** Distance below which the geometry classes treat coordinates as equal. */
    static tolerance = 2;

    /**
     * Maps a point through a six-element transform.
     *
     * @param p point as `[x, y]`
     * @param m transform as `[m0, m1, m2, m3, m4, m5]`
     * @returns `[x*m0 + y*m2 + m4, x*m1 + y*m3 + m5]`
     */
    static applyTransform(p, m) {
        const x = p[0];
        const y = p[1];
        return [
            x * m[0] + y * m[2] + m[4],
            x * m[1] + y * m[3] + m[5],
        ];
    }
}
//# sourceMappingURL=Shape.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Shape.js","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/Shape.ts"],"names":[],"mappings":"AAAA,MAAM,OAAgB,KAAK;IAC1B,MAAM,CAAC,SAAS,GAAG,CAAC,CAAC;IAGrB,MAAM,CAAC,cAAc,CAAC,CAAgB,EAAE,CAAgB;QACvD,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5C,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC5C,OAAO,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;IACjB,CAAC"}

View File

@@ -0,0 +1,24 @@
import { Line } from './Line.js';
import { TableData } from './TableData.js';
/** A set of mutually intersecting horizontal and vertical lines that can be converted into a cell grid. */
export declare class Table {
/** Horizontal lines of the table. */
hLines: Array<Line>;
/** Vertical lines of the table. */
vLines: Array<Line>;
/** Seeds the table with one horizontal or vertical line. */
constructor(line: Line);
/** True when the table holds more than four lines in total. */
get isValid(): boolean;
/** Distinct y values of the horizontal lines, ascending. */
get rowPivots(): Array<number>;
/** Distinct x values of the vertical lines, ascending. */
get colPivots(): Array<number>;
/** Adds the line if it intersects a perpendicular line already in the table; returns whether it was added. */
add(line: Line): boolean;
/** Intersects the line with every perpendicular table line and reports whether any intersection exists. */
private intersection;
/** Removes and returns all horizontal lines with exactly the same y as the given line (the given line included). */
private getSameHorizontal;
/** Removes and returns all vertical lines with exactly the same x as the given line (the given line included). */
private getSameVertical;
/** Collapses horizontal lines of one row into a single line and records the spaces between them as gaps. */
private mergeHorizontalLines;
/** Collapses vertical lines of one column into a single line and records the spaces between them as gaps. */
private mergeVerticalLines;
/** Drops lines with fewer than two intersections, sorts the rest and merges lines sharing a row or column. */
normalize(): void;
/** Whether the vertical line spans [y1, y2] without a gap covering that range. Throws if the line is not vertical or y1 >= y2. */
verticalExists(line: Line, y1: number, y2: number): boolean;
/** Whether the horizontal line spans [x1, x2] without a gap covering that range. Throws if the line is not horizontal or x1 >= x2. */
horizontalExists(line: Line, x1: number, x2: number): boolean;
/** Index of the first horizontal line, from a start index on, that covers a given x; -1 if none. */
private findBottomLineIndex;
/** Indexes of the vertical lines that cover a given y and intersect a given horizontal line. */
private findVerticalLineIndexs;
/** Builds the cells of one row. */
private getRow;
/** Builds a TableData with one row per pair of consecutive horizontal lines. */
toData(): TableData;
}
//# sourceMappingURL=Table.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"Table.d.ts","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/Table.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAiB,MAAM,WAAW,CAAC;AAGhD,OAAO,EAAkB,SAAS,EAAiB,MAAM,gBAAgB,CAAC;AAE1E,qBAAa,KAAK;IACV,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,CAAM;IACzB,MAAM,EAAE,KAAK,CAAC,IAAI,CAAC,CAAM;gBAEpB,IAAI,EAAE,IAAI;IAQtB,IAAW,OAAO,IAAI,OAAO,CAE5B;IAED,IAAW,SAAS,IAAI,KAAK,CAAC,MAAM,CAAC,CAQpC;IAED,IAAW,SAAS,IAAI,KAAK,CAAC,MAAM,CAAC,CAQpC;IAEM,GAAG,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO;IAgB/B,OAAO,CAAC,YAAY;IAsBpB,OAAO,CAAC,iBAAiB;IAmBzB,OAAO,CAAC,eAAe;IAmBvB,OAAO,CAAC,oBAAoB;IAwB5B,OAAO,CAAC,kBAAkB;IAqBnB,SAAS,IAAI,IAAI;IAkCjB,cAAc,CAAC,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,GAAG,OAAO;IAqB3D,gBAAgB,CAAC,IAAI,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,GAAG,OAAO;IAqBpE,OAAO,CAAC,mBAAmB;IAU3B,OAAO,CAAC,sBAAsB;IAa9B,OAAO,CAAC,MAAM;IAqCP,MAAM,IAAI,SAAS;CAmB1B"}

View File

@@ -0,0 +1,260 @@
import { Line, LineDirection } from './Line.js';
import { Point } from './Point.js';
import { Shape } from './Shape.js';
import { TableData } from './TableData.js';
/**
 * A set of mutually intersecting horizontal and vertical lines that can be
 * converted into a grid of cells (`toData`).
 */
export class Table {
    hLines = [];
    vLines = [];

    /**
     * Seeds the table with its first line. A line that is neither horizontal
     * nor vertical is ignored.
     *
     * @param line first line of the table
     */
    constructor(line) {
        if (line.direction === LineDirection.Horizontal) {
            this.hLines.push(line);
        }
        else if (line.direction === LineDirection.Vertical) {
            this.vLines.push(line);
        }
    }

    /** True when the table holds more than four lines in total. */
    get isValid() {
        return this.hLines.length + this.vLines.length > 4;
    }

    /** Distinct y values of the horizontal lines, ascending. */
    get rowPivots() {
        const rowSet = new Set();
        for (const line of this.hLines) {
            rowSet.add(line.from.y);
        }
        return [...rowSet].sort((a, b) => a - b);
    }

    /** Distinct x values of the vertical lines, ascending. */
    get colPivots() {
        const colSet = new Set();
        for (const line of this.vLines) {
            colSet.add(line.from.x);
        }
        return [...colSet].sort((a, b) => a - b);
    }

    /**
     * Adds the line if it intersects at least one perpendicular line that is
     * already part of the table.
     *
     * @param line candidate line
     * @returns true when the line was added
     */
    add(line) {
        const hasIntersection = this.intersection(line);
        if (hasIntersection) {
            if (line.direction === LineDirection.Horizontal) {
                this.hLines.push(line);
                return true;
            }
            else if (line.direction === LineDirection.Vertical) {
                this.vLines.push(line);
                return true;
            }
        }
        return false;
    }

    /**
     * Intersects the line with every perpendicular line of the table.
     * The loops deliberately do not stop at the first hit: each call to
     * `Line.intersection` records the point on both lines, and `normalize`
     * later relies on those counts.
     *
     * @param line candidate line
     * @returns true when at least one intersection exists
     */
    intersection(line) {
        let flag = false;
        if (!line.valid)
            return flag;
        if (line.direction === LineDirection.Horizontal) {
            for (const vLine of this.vLines) {
                const p = line.intersection(vLine);
                if (p) {
                    flag = true;
                }
            }
        }
        else if (line.direction === LineDirection.Vertical) {
            for (const hLine of this.hLines) {
                const p = line.intersection(hLine);
                if (p) {
                    flag = true;
                }
            }
        }
        return flag;
    }

    /**
     * Removes from `hLines` every line with exactly the same y as `line` and
     * returns them together with `line` itself (which is first in the result).
     */
    getSameHorizontal(line) {
        const same = [line];
        const other = [];
        while (this.hLines.length > 0) {
            const hLine = this.hLines.shift();
            if (!hLine)
                continue;
            if (hLine.from.y === line.from.y) {
                same.push(hLine);
            }
            else {
                other.push(hLine);
            }
        }
        this.hLines = other;
        return same;
    }

    /**
     * Removes from `vLines` every line with exactly the same x as `line` and
     * returns them together with `line` itself (which is first in the result).
     */
    getSameVertical(line) {
        const same = [line];
        const other = [];
        while (this.vLines.length > 0) {
            const vLine = this.vLines.shift();
            if (!vLine)
                continue;
            if (vLine.from.x === line.from.x) {
                same.push(vLine);
            }
            else {
                other.push(vLine);
            }
        }
        this.vLines = other;
        return same;
    }

    /**
     * Collapses the horizontal lines of one row into a single line spanning
     * from the smallest start x to the largest end x, and records every
     * uncovered stretch wider than `Shape.tolerance` as a gap.
     *
     * The covered end is tracked as a running maximum: after sorting by start
     * x the last line is not necessarily the one reaching furthest right, and
     * a line starting inside an already covered stretch opens no gap.
     *
     * @param lines non-empty list of lines sharing one y; sorted in place
     * @returns the merged line
     */
    mergeHorizontalLines(lines) {
        lines.sort((l1, l2) => l1.from.x - l2.from.x);
        const y = lines[0].from.y;
        const minX = lines[0].from.x;
        let coveredTo = lines[0].to.x;
        const gapSpans = [];
        for (let i = 1; i < lines.length; i++) {
            const currLine = lines[i];
            if (currLine.from.x - coveredTo > Shape.tolerance) {
                gapSpans.push([coveredTo, currLine.from.x]);
            }
            if (currLine.to.x > coveredTo) {
                coveredTo = currLine.to.x;
            }
        }
        const resultLine = new Line(new Point(minX, y), new Point(coveredTo, y));
        for (const [gapFrom, gapTo] of gapSpans) {
            resultLine.addGap(new Line(new Point(gapFrom, y), new Point(gapTo, y)));
        }
        return resultLine;
    }

    /**
     * Collapses the vertical lines of one column into a single line spanning
     * from the smallest start y to the largest end y, and records every
     * uncovered stretch taller than `Shape.tolerance` as a gap.
     *
     * The covered end is tracked as a running maximum, for the same reason as
     * in `mergeHorizontalLines`.
     *
     * @param lines non-empty list of lines sharing one x; sorted in place
     * @returns the merged line
     */
    mergeVerticalLines(lines) {
        lines.sort((l1, l2) => l1.from.y - l2.from.y);
        const x = lines[0].from.x;
        const minY = lines[0].from.y;
        let coveredTo = lines[0].to.y;
        const gapSpans = [];
        for (let i = 1; i < lines.length; i++) {
            const currLine = lines[i];
            if (currLine.from.y - coveredTo > Shape.tolerance) {
                gapSpans.push([coveredTo, currLine.from.y]);
            }
            if (currLine.to.y > coveredTo) {
                coveredTo = currLine.to.y;
            }
        }
        const resultLine = new Line(new Point(x, minY), new Point(x, coveredTo));
        for (const [gapFrom, gapTo] of gapSpans) {
            resultLine.addGap(new Line(new Point(x, gapFrom), new Point(x, gapTo)));
        }
        return resultLine;
    }

    /**
     * Prepares the table for `toData`: drops lines with fewer than two
     * recorded intersections, sorts rows by y and columns by x, and merges all
     * lines sharing exactly the same y (or x) into one line with gaps.
     * The merged lines are new Line objects, so their `intersections` start
     * empty.
     */
    normalize() {
        this.hLines = this.hLines.filter((l) => l.intersections.length > 1);
        this.vLines = this.vLines.filter((l) => l.intersections.length > 1);
        this.hLines.sort((l1, l2) => l1.from.y - l2.from.y);
        this.vLines.sort((l1, l2) => l1.from.x - l2.from.x);
        const newHLines = [];
        // getSameHorizontal() removes the matching lines from this.hLines, so
        // each pass consumes one whole row.
        while (this.hLines.length > 0) {
            const line = this.hLines.shift();
            if (!line)
                continue;
            const lines = this.getSameHorizontal(line);
            const merged = this.mergeHorizontalLines(lines);
            newHLines.push(merged);
        }
        this.hLines = newHLines;
        const newVLines = [];
        // Same for columns.
        while (this.vLines.length > 0) {
            const line = this.vLines.shift();
            if (!line)
                continue;
            const lines = this.getSameVertical(line);
            const merged = this.mergeVerticalLines(lines);
            newVLines.push(merged);
        }
        this.vLines = newVLines;
    }

    /**
     * Whether a vertical line is actually drawn over the whole range [y1, y2].
     *
     * @param line vertical line to test
     * @param y1 lower bound of the range
     * @param y2 upper bound of the range; must be greater than y1
     * @returns true when the line spans the range and no gap covers it
     * @throws Error when the line is not vertical or y1 >= y2
     */
    verticalExists(line, y1, y2) {
        if (line.direction !== LineDirection.Vertical) {
            throw new Error('Line is not vertical');
        }
        if (y1 >= y2) {
            throw new Error('y1 must be less than y2');
        }
        if (line.from.y <= y1 && line.to.y >= y2) {
            for (const gap of line.gaps) {
                if (gap.from.y <= y1 && gap.to.y >= y2) {
                    return false;
                }
            }
            return true;
        }
        return false;
    }

    /**
     * Whether a horizontal line is actually drawn over the whole range
     * [x1, x2].
     *
     * @param line horizontal line to test
     * @param x1 lower bound of the range
     * @param x2 upper bound of the range; must be greater than x1
     * @returns true when the line spans the range and no gap covers it
     * @throws Error when the line is not horizontal or x1 >= x2
     */
    horizontalExists(line, x1, x2) {
        if (line.direction !== LineDirection.Horizontal) {
            throw new Error('Line is not horizontal');
        }
        if (x1 >= x2) {
            throw new Error('x1 must be less than x2');
        }
        if (line.from.x <= x1 && line.to.x >= x2) {
            for (const gap of line.gaps) {
                if (gap.from.x <= x1 && gap.to.x >= x2) {
                    return false;
                }
            }
            return true;
        }
        return false;
    }

    /**
     * Finds the first horizontal line, starting at index `h2Index`, whose x
     * range contains `xMiddle`.
     *
     * @returns the index into `hLines`, or -1 when there is none
     */
    findBottomLineIndex(h2Index, xMiddle) {
        for (let i = h2Index; i < this.hLines.length; i++) {
            const hLine = this.hLines[i];
            if (hLine.from.x <= xMiddle && hLine.to.x >= xMiddle) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Collects the indexes of the vertical lines that cover `yMiddle` and
     * intersect `topHLine`. Note that `Line.intersection` records the
     * intersection point on both lines as a side effect.
     */
    findVerticalLineIndexs(topHLine, yMiddle) {
        const result = [];
        for (let i = 0; i < this.vLines.length; i++) {
            const vLine = this.vLines[i];
            if (vLine.from.y <= yMiddle && vLine.to.y >= yMiddle && topHLine.intersection(vLine)) {
                result.push(i);
            }
        }
        return result;
    }

    /**
     * Builds the cells of the row that starts at horizontal line `h1Index`.
     * Each pair of neighbouring vertical lines crossing the row forms one
     * cell; its lower edge is the first horizontal line from `h2Index` on that
     * covers the cell's horizontal middle. `colspan` / `rowspan` are only set
     * when greater than 1.
     *
     * @param h1Index index of the row's upper horizontal line
     * @param h2Index index at which the search for the lower line starts
     * @param yMiddle y value between the two lines, used to pick the columns
     * @returns the cells of the row, left to right
     */
    getRow(h1Index, h2Index, yMiddle) {
        const tableRow = [];
        const topHLine = this.hLines[h1Index];
        const vLineIndexes = this.findVerticalLineIndexs(topHLine, yMiddle);
        for (let i = 1; i < vLineIndexes.length; i++) {
            const leftVLine = this.vLines[vLineIndexes[i - 1]];
            const rightVLine = this.vLines[vLineIndexes[i]];
            const xMiddle = (leftVLine.from.x + rightVLine.from.x) / 2;
            const bottomHLineIndex = this.findBottomLineIndex(h2Index, xMiddle);
            // NOTE: when no lower line is found the index is -1, bottomHLine is
            // undefined and the property access below throws a TypeError.
            const bottomHLine = this.hLines[bottomHLineIndex];
            const tableCell = {
                minXY: new Point(leftVLine.from.x, topHLine.from.y),
                maxXY: new Point(rightVLine.from.x, bottomHLine.from.y),
                width: rightVLine.from.x - leftVLine.from.x,
                height: bottomHLine.from.y - topHLine.from.y,
                text: [],
            };
            // Spans are measured in skipped line indexes.
            const colSpan = vLineIndexes[i] - vLineIndexes[i - 1];
            const rowSpan = bottomHLineIndex - h1Index;
            if (colSpan > 1) {
                tableCell.colspan = colSpan;
            }
            if (rowSpan > 1) {
                tableCell.rowspan = rowSpan;
            }
            tableRow.push(tableCell);
        }
        return tableRow;
    }

    /**
     * Converts the table into a TableData: the bounding box is taken from the
     * first and last pivots, and one row is built for every pair of
     * consecutive horizontal lines.
     *
     * @returns the populated TableData
     */
    toData() {
        const rowPivots = this.rowPivots;
        const colPivots = this.colPivots;
        const minXY = new Point(colPivots[0], rowPivots[0]);
        const maxXY = new Point(colPivots[colPivots.length - 1], rowPivots[rowPivots.length - 1]);
        const result = new TableData(minXY, maxXY, rowPivots, colPivots);
        for (let h1 = 1; h1 < this.hLines.length; h1++) {
            const prevHLine = this.hLines[h1 - 1];
            const currHLine = this.hLines[h1];
            const YMiddle = (prevHLine.from.y + currHLine.from.y) / 2;
            const rowData = this.getRow(h1 - 1, h1, YMiddle);
            result.rows.push(rowData);
        }
        return result;
    }
}
//# sourceMappingURL=Table.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,25 @@
import type { Point } from './Point.js';
/** One cell of a table: its bounding box, size, optional spans and collected text fragments. */
export type TableCell = {
/** Corner with the smaller coordinates. */
minXY: Point;
/** Corner with the larger coordinates. */
maxXY: Point;
/** Extent along x. */
width: number;
/** Extent along y. */
height: number;
/** Number of columns covered; a missing value is counted as 1 by `TableData.check()`. */
colspan?: number;
/** Number of rows covered; a missing value is counted as 1 by `TableData.check()`. */
rowspan?: number;
/** Text fragments of the cell; `TableData.toArray()` joins them without a separator. */
text: Array<string>;
};
/** The cells of one table row, in the order they were added. */
export type TableRow = Array<TableCell>;
/** Cell grid of one table together with its bounding box and the row/column pivots it was built from. */
export declare class TableData {
/** Bounding-box corner with the smaller coordinates. */
minXY: Point;
/** Bounding-box corner with the larger coordinates. */
maxXY: Point;
/** Rows of cells; empty on construction and filled by the caller. */
rows: Array<TableRow>;
/** Row pivot values; only their count is used (by `check()`). */
private rowPivots;
/** Column pivot values; only their count is used (by `check()`). */
private colPivots;
/** Stores the bounding box and the pivots; `rows` starts empty. */
constructor(minXY: Point, maxXY: Point, rowPivots: Array<number>, colPivots: Array<number>);
/** First cell (row by row) whose bounding box contains (x, y), edges included; undefined when the point is outside the table or no cell matches. */
findCell(x: number, y: number): TableCell | undefined;
/** Total number of cells over all rows. */
get cellCount(): number;
/** Number of rows. */
get rowCount(): number;
/** True when the cells, weighted by colspan × rowspan, add up to (column pivots − 1) × (row pivots − 1). */
check(): boolean;
/** The table as rows of cell texts; each cell's fragments are joined and stripped of leading/trailing whitespace. */
toArray(): string[][];
}
//# sourceMappingURL=TableData.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TableData.d.ts","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/TableData.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AAExC,MAAM,MAAM,SAAS,GAAG;IACvB,KAAK,EAAE,KAAK,CAAC;IACb,KAAK,EAAE,KAAK,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,IAAI,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;CACpB,CAAC;AAEF,MAAM,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,CAAC,CAAC;AAExC,qBAAa,SAAS;IACd,KAAK,EAAE,KAAK,CAAC;IACb,KAAK,EAAE,KAAK,CAAC;IACb,IAAI,EAAE,KAAK,CAAC,QAAQ,CAAC,CAAC;IAC7B,OAAO,CAAC,SAAS,CAAgB;IACjC,OAAO,CAAC,SAAS,CAAgB;gBAErB,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC,EAAE,SAAS,EAAE,KAAK,CAAC,MAAM,CAAC;IAQnF,QAAQ,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,SAAS,GAAG,SAAS;IAc5D,IAAW,SAAS,WAEnB;IAED,IAAW,QAAQ,WAElB;IAEM,KAAK,IAAI,OAAO;IAkChB,OAAO,IAAI,MAAM,EAAE,EAAE;CAc5B"}

View File

@@ -0,0 +1,76 @@
/**
 * Cell grid of one table together with its bounding box and the row/column
 * pivots it was built from.
 */
export class TableData {
    minXY;
    maxXY;
    rows;
    rowPivots;
    colPivots;

    /**
     * @param minXY bounding-box corner with the smaller coordinates
     * @param maxXY bounding-box corner with the larger coordinates
     * @param rowPivots row pivot values (only their count is used here)
     * @param colPivots column pivot values (only their count is used here)
     */
    constructor(minXY, maxXY, rowPivots, colPivots) {
        this.minXY = minXY;
        this.maxXY = maxXY;
        this.rows = [];
        this.rowPivots = rowPivots;
        this.colPivots = colPivots;
    }

    /**
     * Finds the cell containing a point. Edges count as inside, so a point on
     * a shared edge resolves to whichever cell comes first, row by row.
     *
     * @param x horizontal coordinate
     * @param y vertical coordinate
     * @returns the matching cell, or undefined when the point lies outside the
     *          table's bounding box or no cell contains it
     */
    findCell(x, y) {
        const insideTable = x >= this.minXY.x && y >= this.minXY.y && x <= this.maxXY.x && y <= this.maxXY.y;
        if (!insideTable) {
            return undefined;
        }
        const covers = (cell) => cell.minXY.x <= x && cell.minXY.y <= y && cell.maxXY.x >= x && cell.maxXY.y >= y;
        for (const row of this.rows) {
            const hit = row.find(covers);
            if (hit !== undefined) {
                return hit;
            }
        }
        return undefined;
    }

    /** Total number of cells over all rows. */
    get cellCount() {
        let total = 0;
        this.rows.forEach((row) => {
            total += row.length;
        });
        return total;
    }

    /** Number of rows. */
    get rowCount() {
        return this.rows.length;
    }

    /**
     * Consistency check: the grid spanned by the pivots has
     * (columns − 1) × (rows − 1) slots, and the cells — each weighted by
     * colspan × rowspan, with a missing span counting as 1 — must fill exactly
     * that many.
     *
     * @returns true when the counts match
     */
    check() {
        const expectedSlots = (this.colPivots.length - 1) * (this.rowPivots.length - 1);
        let coveredSlots = 0;
        for (const row of this.rows) {
            for (const cell of row) {
                coveredSlots += (cell.colspan || 1) * (cell.rowspan || 1);
            }
        }
        return expectedSlots === coveredSlots;
    }

    /**
     * Converts the table to plain strings.
     *
     * @returns one array per row holding each cell's text fragments joined
     *          without a separator and stripped of surrounding whitespace
     */
    toArray() {
        const cellText = (cell) => cell.text.join('').replace(/^[\s]+|[\s]+$/g, '').trim();
        return Array.from(this.rows, (row) => Array.from(row, cellText));
    }
}
//# sourceMappingURL=TableData.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"TableData.js","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/TableData.ts"],"names":[],"mappings":"AAcA,MAAM,OAAO,SAAS;IACd,KAAK,CAAQ;IACb,KAAK,CAAQ;IACb,IAAI,CAAkB;IACrB,SAAS,CAAgB;IACzB,SAAS,CAAgB;IAEjC,YAAY,KAAY,EAAE,KAAY,EAAE,SAAwB,EAAE,SAAwB;QACzF,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,IAAI,GAAG,EAAE,CAAC;QACf,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;IAC5B,CAAC;IAEM,QAAQ,CAAC,CAAS,EAAE,CAAS;QACnC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC;YACtF,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;gBAC7B,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;oBACxB,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;wBACtF,OAAO,IAAI,CAAC;oBACb,CAAC;gBACF,CAAC;YACF,CAAC;QACF,CAAC;QAED,OAAO,SAAS,CAAC;IAClB,CAAC;IAED,IAAW,SAAS;QACnB,OAAO,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,GAAG,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IAC5D,CAAC;IAED,IAAW,QAAQ;QAClB,OAAO,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC;IACzB,CAAC;IAEM,KAAK;QACX,sCAAsC;QACtC,EAAE;QACF,iCAAiC;QACjC,sBAAsB;QACtB,gCAAgC;QAChC,uCAAuC;QACvC,QAAQ;QACR,+BAA+B;QAC/B,IAAI;QACJ,EAAE;QACF,gDAAgD;QAChD,iDAAiD;QACjD,uBAAuB;QACvB,QAAQ;QACR,IAAI;QAEJ,MAAM,gBAAgB,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACnF,IAAI,YAAY,GAAG,CAAC,CAAC;QAErB,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YAC7B,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;gBACxB,MAAM,KAAK,GAAG,CAAC,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC;gBACxD,YAAY,IAAI,KAAK,CAAC;YACvB,CAAC;QACF,CAAC;QAED,IAAI,gBAAgB,KAAK,YAAY,EAAE,CAAC;YACvC,OAAO,KAAK,CAAC;QACd,CAAC;QAED,OAAO,IAAI,CAAC;IACb,CAAC;IAEM,OAAO;QACb,MAAM,QAAQ,GAAe,EAAE,CAAC;QAChC,KAAK,MAAM,GAAG,
IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YAC7B,MAAM,MAAM,GAAa,EAAE,CAAC;YAC5B,KAAK,MAAM,IAAI,IAAI,GAAG,EAAE,CAAC;gBACxB,IAAI,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;gBAC9B,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAC;gBAC1C,IAAI,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC;gBACnB,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACnB,CAAC;YACD,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvB,CAAC;QACD,OAAO,QAAQ,CAAC;IACjB,CAAC;CACD"}

View File

@@ -0,0 +1,7 @@
export { Line, LineDirection } from './Line.js';
export { LineStore } from './LineStore.js';
export { Point } from './Point.js';
export { Rectangle } from './Rectangle.js';
export { Shape } from './Shape.js';
export { Table } from './Table.js';
//# sourceMappingURL=index.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAChD,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC"}

View File

@@ -0,0 +1,7 @@
export { Line, LineDirection } from './Line.js';
export { LineStore } from './LineStore.js';
export { Point } from './Point.js';
export { Rectangle } from './Rectangle.js';
export { Shape } from './Shape.js';
export { Table } from './Table.js';
//# sourceMappingURL=index.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/pdf-parse/geometry/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,MAAM,WAAW,CAAC;AAChD,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAC3C,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC"}

13
node_modules/pdf-parse/dist/pdf-parse/esm/index.d.ts generated vendored Normal file
View File

@@ -0,0 +1,13 @@
import { PDFParse } from './PDFParse.js';
export { VerbosityLevel } from 'pdfjs-dist/legacy/build/pdf.mjs';
export * from './Exception.js';
export * from './geometry/index.js';
export type { EmbeddedImage, ImageKindKey, ImageKindValue, ImageResult, PageImages } from './ImageResult.js';
export type { DateNode, InfoResult, Metadata, OutlineNode, PageLinkResult } from './InfoResult.js';
export type * from './LoadParameters.js';
export type * from './ParseParameters.js';
export type { Screenshot, ScreenshotResult } from './ScreenshotResult.js';
export type { PageTableResult, TableArray, TableResult } from './TableResult.js';
export type { PageTextResult, TextResult } from './TextResult.js';
export { PDFParse };
//# sourceMappingURL=index.d.ts.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/pdf-parse/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAEzC,OAAO,EAAE,cAAc,EAAE,MAAM,iCAAiC,CAAC;AAEjE,cAAc,gBAAgB,CAAC;AAC/B,cAAc,qBAAqB,CAAC;AACpC,YAAY,EAAE,aAAa,EAAE,YAAY,EAAE,cAAc,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC;AAC7G,YAAY,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACnG,mBAAmB,qBAAqB,CAAC;AACzC,mBAAmB,sBAAsB,CAAC;AAC1C,YAAY,EAAE,UAAU,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAC1E,YAAY,EAAE,eAAe,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACjF,YAAY,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,iBAAiB,CAAC;AAElE,OAAO,EAAE,QAAQ,EAAE,CAAC"}

6
node_modules/pdf-parse/dist/pdf-parse/esm/index.js generated vendored Normal file
View File

@@ -0,0 +1,6 @@
import { PDFParse } from './PDFParse.js';
export { VerbosityLevel } from 'pdfjs-dist/legacy/build/pdf.mjs';
export * from './Exception.js';
export * from './geometry/index.js';
export { PDFParse };
//# sourceMappingURL=index.js.map

View File

@@ -0,0 +1 @@
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/pdf-parse/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AAEzC,OAAO,EAAE,cAAc,EAAE,MAAM,iCAAiC,CAAC;AAEjE,cAAc,gBAAgB,CAAC;AAC/B,cAAc,qBAAqB,CAAC;AASpC,OAAO,EAAE,QAAQ,EAAE,CAAC"}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,898 @@
import type { DocumentInitParameters } from 'pdfjs-dist/types/src/display/api.js';
import type { ImageKind } from 'pdfjs-dist/legacy/build/pdf.mjs';
import { Metadata } from 'pdfjs-dist/types/src/display/metadata.js';
import type { PDFDataRangeTransport } from 'pdfjs-dist/types/src/display/api.js';
import type { PDFWorker } from 'pdfjs-dist/types/src/display/api.js';
import { VerbosityLevel } from 'pdfjs-dist/legacy/build/pdf.mjs';
/**
* Error used to indicate that an operation was aborted (for example by an AbortSignal).
*
* @public
*/
export declare class AbortException extends Error {
/**
* Create a new AbortException.
* @param message - Optional error message.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, cause?: unknown);
}
/**
* @public
* Consolidated date information gathered from different PDF sources.
* The PDF 'Info' dictionary contains CreationDate / ModDate and
* the XMP/XAP metadata can contain several timestamps as well. This
* structure collects those values (if present) as JavaScript Date objects
* or null when the property exists but cannot be parsed.
*/
export declare type DateNode = {
CreationDate?: Date | null;
ModDate?: Date | null;
XmpCreateDate?: Date | null;
XmpModifyDate?: Date | null;
XmpMetadataDate?: Date | null;
XapCreateDate?: Date | null;
XapModifyDate?: Date | null;
XapMetadataDate?: Date | null;
};
/**
* @public
* EmbeddedImage
* - Normalized representation of an embedded image extracted from the PDF.
* - `data`: Raw image bytes (e.g. PNG/JPEG) as Uint8Array. Use this for file writing or binary processing.
* - `dataUrl`: Optional data URL (e.g. "data:image/png;base64,...") for directly embedding in <img> src.
* Storing both lets consumers choose the most convenient form; consider omitting one to save memory.
* - `name`: Resource name for the image.
* - `width` / `height`: Dimensions in pixels.
* - `kind`: ImageKindValue indicating the pixel format (e.g. GRAYSCALE_1BPP / RGB_24BPP / RGBA_32BPP).
*/
export declare interface EmbeddedImage {
data: Uint8Array;
dataUrl: string;
name: string;
width: number;
height: number;
kind: ImageKindValue;
}
/**
* Error thrown when the PDF structure/contents are malformed and cannot be parsed.
*
* This is raised for low-level format problems detected while reading PDF objects.
* Errors caused during parsing PDF data.
*
* @public
*/
export declare class FormatError extends Error {
/**
* Create a new FormatError.
* @param message - Optional message describing the format problem.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, cause?: unknown);
}
/**
* Normalize arbitrary thrown values into an Error instance used by the library.
*
* Known Error instances with specific names are mapped to the library's
* typed exceptions in order to preserve type information and any additional
* fields (for example `details`, `status`, etc.). If the value is not an
* Error it is converted to a generic Error containing the stringified value.
*
* @public
* @param error - The thrown value to normalize.
* @returns An Error instance representing the provided value.
*/
export declare function getException(error: unknown): Error;
/**
* @public
* ImageKindKey
* - Represents the keys of the ImageKind enum (e.g. "GRAYSCALE_1BPP", "RGB_24BPP", "RGBA_32BPP").
*/
export declare type ImageKindKey = keyof typeof ImageKind;
/**
* @public
* ImageKindValue
* - Represents the numeric values of the ImageKind enum (e.g. 1, 2, 3).
*/
export declare type ImageKindValue = (typeof ImageKind)[ImageKindKey];
/**
* @public
* ImageResult
* Helper container for extracted images grouped per page.
*/
export declare class ImageResult {
/** Extracted images grouped by page; each entry carries a page number and that page's images. */
pages: Array<PageImages>;
/** NOTE(review): presumably the document's total page count, as passed to the constructor — confirm. */
total: number;
/**
 * Look up a single image by page number and image name.
 * NOTE(review): presumably matches `PageImages.pageNumber` and `EmbeddedImage.name`,
 * with `null` meaning "not found" — confirm against the implementation.
 */
getPageImage(num: number, name: string): EmbeddedImage | null;
constructor(total: number);
}
/**
* @public
* Aggregated information about a PDF document returned by getInfo().
* The object contains high-level metadata, outline/bookmark structure,
* per-page extracted hyperlinks and utility helpers for parsing dates.
*/
export declare class InfoResult {
total: number;
/**
* The PDF 'Info' dictionary. Typical fields include title, author, subject,
* Creator, Producer and Creation/Modification dates. The exact structure is
* determined by the PDF and as returned by PDF.js.
*/
info?: any;
metadata?: Metadata;
/**
* An array of document fingerprint strings provided by PDF.js. Useful
* for caching, de-duplication or identifying a document across runs.
*/
fingerprints?: Array<string | null>;
/**
* Permission flags for the document as returned by PDF.js (or null).
* These flags indicate capabilities such as printing, copying and
* other restrictions imposed by the PDF security settings.
*/
permission?: number[] | null;
/**
* Optional document outline (bookmarks). When present this is the
* hierarchical navigation structure which viewers use for quick access.
*/
outline?: Array<OutlineNode> | null;
pages: Array<PageLinkResult>;
/**
* Collects dates from different sources (Info dictionary and XMP/XAP metadata)
* and returns them as a DateNode where available. This helps callers compare
* and choose the most relevant timestamp (for example a creation date vs XMP date).
*/
getDateNode(): DateNode;
/**
* Try to parse an ISO-8601 date string from XMP/XAP metadata. If the
* value is falsy or cannot be parsed, undefined is returned to indicate
* absence or unparsable input.
*/
private parseISODateString;
constructor(total: number);
}
/**
* Error thrown when the parsed data is not a valid PDF document.
*
* Use this exception to signal that the input cannot be interpreted as a PDF
* (corrupt file, invalid header, etc.).
*
* @public
*/
export declare class InvalidPDFException extends Error {
/**
* Create a new InvalidPDFException.
* @param message - Optional error message.
* @param cause - Optional underlying cause (preserved on modern runtimes).
*/
constructor(message?: string, cause?: unknown);
}
/**
 * Line segment between two points; a `Shape` subtype.
 *
 * NOTE(review): only the type declaration is visible here, so the member notes
 * below are read off names and signatures — confirm against the implementation.
 */
export declare class Line extends Shape {
/** First endpoint. */
from: Point;
/** Second endpoint. */
to: Point;
/** Orientation classification (`None`, `Horizontal` or `Vertical`). */
direction: LineDirection;
/** Numeric length of the segment. */
length: number;
/** Points associated with this line; presumably filled via `addIntersectionPoint`. */
intersections: Array<Point>;
/** Sub-segments associated with this line; presumably filled via `addGap`. */
gaps: Array<Line>;
constructor(from: Point, to: Point);
private init;
private _valid;
/** Read-only validity flag. */
get valid(): boolean;
/** Read-only accessor returning a `Line`; presumably a canonical endpoint ordering of this line — confirm. */
get normalized(): Line;
addGap(line: Line): void;
containsPoint(p: Point): boolean;
addIntersectionPoint(point: Point): void;
/** Returns a `Point`, or `undefined`; presumably `undefined` means the two lines do not intersect. */
intersection(line: Line): Point | undefined;
/** Implements `Shape.transform`; the declared return type is `this`, so calls can be chained. */
transform(matrix: Array<number>): this;
}
/** Orientation classification stored in `Line.direction`. */
export declare enum LineDirection {
None = 0,
Horizontal = 1,
Vertical = 2
}
/**
 * Collector for horizontal and vertical lines from which tables are derived
 * (`getTables()` / `getTableData()`).
 *
 * NOTE(review): only the type declaration is visible here; member behavior
 * beyond the signatures is not shown — confirm against the implementation.
 */
export declare class LineStore {
/** Stored horizontal lines. */
hLines: Array<Line>;
/** Stored vertical lines. */
vLines: Array<Line>;
/** Add a single line; presumably routed to `hLines` or `vLines` by its direction. */
add(line: Line): void;
/** Add a rectangle; presumably by adding the lines from `Rectangle.getLines()`. */
addRectangle(rect: Rectangle): void;
/** Returns one `TableData` per detected table. */
getTableData(): Array<TableData>;
/** Returns the detected tables as line-based `Table` objects. */
getTables(): Array<Table>;
normalize(): void;
normalizeHorizontal(): void;
normalizeVertical(): void;
private fillTable;
private tryFill;
// NOTE(review): "marge" looks like a misspelling of "merge"; the name mirrors the implementation and must stay in sync with it.
private margeHorizontalLines;
private margeVerticalLines;
}
/**
* @public
* LoadParameters
* PDF loading parameters.
*/
export declare interface LoadParameters extends DocumentInitParameters {
/**
* The URL of the PDF.
* Default: `undefined`.
*/
url?: string | URL | undefined;
/**
* Binary PDF data.
* Use TypedArrays (e.g., `Uint8Array`) to improve memory usage. If PDF data is BASE64-encoded, use `atob()` to convert it to a binary string first.
* **NOTE**: If TypedArrays are used, they will generally be transferred to the worker thread, reducing main-thread memory usage but taking ownership of the array.
* Default: `undefined`.
*/
data?: string | number[] | ArrayBuffer | TypedArray | undefined;
/**
* Basic authentication headers.
* Default: `undefined`.
*/
httpHeaders?: Object | undefined;
/**
* Indicates whether cross-site Access-Control requests should be made using credentials (e.g., cookies or auth headers).
* Default: `false`.
*/
withCredentials?: boolean | undefined;
/**
* For decrypting password-protected PDFs.
* Default: `undefined`.
*/
password?: string | undefined;
/**
* The PDF file length. Used for progress reports and range requests.
* Default: `undefined`.
*/
length?: number | undefined;
/**
* Allows using a custom range transport implementation.
* Default: `undefined`.
*/
range?: PDFDataRangeTransport | undefined;
/**
* Maximum number of bytes fetched per range request.
* Default: `65536` (`2^16`).
*/
rangeChunkSize?: number | undefined;
/**
* The worker used for loading and parsing PDF data.
* Default: `undefined`.
*/
worker?: PDFWorker | undefined;
/**
* Controls logging level; use constants from `VerbosityLevel`.
* Default: `undefined`.
*/
verbosity?: number | undefined;
/**
* Base URL of the document, used to resolve relative URLs in annotations and outline items.
* Default: `undefined`.
*/
docBaseUrl?: string | undefined;
/**
* URL where predefined Adobe CMaps are located. Include trailing slash.
* Default: `undefined`.
*/
cMapUrl?: string | undefined;
/**
* Specifies if Adobe CMaps are binary-packed.
* Default: `true`.
*/
cMapPacked?: boolean | undefined;
/**
* Factory for reading built-in CMap files.
* Default: `{DOMCMapReaderFactory}`.
*/
CMapReaderFactory?: Object | undefined;
/**
* URL where predefined ICC profiles are located. Include trailing slash.
* Default: `undefined`.
*/
iccUrl?: string | undefined;
/**
* If `true`, non-embedded fonts fall back to system fonts.
* Default: `true` in browsers, `false` in Node.js (unless `disableFontFace === true`, then always `false`).
*/
useSystemFonts?: boolean | undefined;
/**
* URL for standard font files. Include trailing slash.
* Default: `undefined`.
*/
standardFontDataUrl?: string | undefined;
/**
* Factory for reading standard font files.
* Default: `{DOMStandardFontDataFactory}`.
*/
StandardFontDataFactory?: Object | undefined;
/**
* URL for WebAssembly files. Include trailing slash.
* Default: `undefined`.
*/
wasmUrl?: string | undefined;
/**
* Factory for reading WASM files.
* Default: `{DOMWasmFactory}`.
*/
WasmFactory?: Object | undefined;
/**
* Enable `fetch()` in worker thread for CMap/font/WASM files. If `true`, factory options are ignored.
* Default: `true` in browsers, `false` in Node.js.
*/
useWorkerFetch?: boolean | undefined;
/**
* Attempt to use WebAssembly for better performance (e.g., image decoding).
* Default: `true`.
*/
useWasm?: boolean | undefined;
/**
* Reject promises (e.g., `getTextContent`) on parse errors instead of recovering partially.
* Default: `false`.
*/
stopAtErrors?: boolean | undefined;
/**
* Max image size in total pixels (`width * height`). Use `-1` for no limit.
* Default: `-1`.
*/
maxImageSize?: number | undefined;
/**
* Whether evaluating strings as JS is allowed (for PDF function performance).
* Default: `true`.
*/
isEvalSupported?: boolean | undefined;
/**
* Whether `OffscreenCanvas` can be used in worker.
* Default: `true` in browsers, `false` in Node.js.
*/
isOffscreenCanvasSupported?: boolean | undefined;
/**
* Whether `ImageDecoder` can be used in worker.
* Default: `true` in browsers, `false` in Node.js.
* **NOTE**: Temporarily disabled in Chromium due to bugs:
* - Crashes with BMP decoder on huge images ([issue 374807001](https://issues.chromium.org/issues/374807001))
* - Broken JPEGs with custom color profiles ([issue 378869810](https://issues.chromium.org/issues/378869810))
*/
isImageDecoderSupported?: boolean | undefined;
/**
* Used to determine when to resize images (via `OffscreenCanvas`). Use `-1` to use a slower fallback algorithm.
* Default: `undefined`.
*/
canvasMaxAreaInBytes?: number | undefined;
/**
* Disable `@font-face`/Font Loading API; use built-in glyph renderer instead.
* Default: `false` in browsers, `true` in Node.js.
*/
disableFontFace?: boolean | undefined;
/**
* Include extra (non-rendering) font properties when exporting font data from worker. Increases memory usage.
* Default: `false`.
*/
fontExtraProperties?: boolean | undefined;
/**
* Render XFA forms if present.
* Default: `false`.
*/
enableXfa?: boolean | undefined;
/**
* Explicit document context for creating elements and loading resources. Defaults to current document.
* Default: `undefined`.
*/
ownerDocument?: HTMLDocument | undefined;
/**
* Disable range requests for PDF loading.
* Default: `false`.
*/
disableRange?: boolean | undefined;
/**
* Disable streaming PDF data.
* Default: `false`.
*/
disableStream?: boolean | undefined;
/**
* Disable pre-fetching of PDF data. Requires `disableStream: true` to work fully.
* Default: `false`.
*/
disableAutoFetch?: boolean | undefined;
/**
* Enable debugging hooks (see `web/debugger.js`).
* Default: `false`.
*/
pdfBug?: boolean | undefined;
/**
* Factory for creating canvases.
* Default: `{DOMCanvasFactory}`.
*/
CanvasFactory?: Object | undefined;
/**
* Factory for creating SVG filters during rendering.
* Default: `{DOMFilterFactory}`.
*/
FilterFactory?: Object | undefined;
/**
* Enable hardware acceleration for rendering.
* Default: `false`.
*/
enableHWA?: boolean | undefined;
}
export { Metadata }
/**
* @public
* Node representing a single item in the PDF outline (bookmarks).
* This mirrors the structure returned by PDF.js' getOutline() API.
*/
export declare interface OutlineNode {
title: string;
bold: boolean;
italic: boolean;
color: Uint8ClampedArray;
dest: string | Array<any> | null;
url: string | null;
unsafeUrl?: string;
newWindow?: boolean;
count?: number;
items: Array<any>;
}
/**
* @public
* PageImages
* - Represents all embedded images found on a single PDF page.
* - pageNumber: 1-based page index.
* - images: Array of EmbeddedImage objects for this page.
*/
export declare interface PageImages {
pageNumber: number;
images: EmbeddedImage[];
}
/**
* @public
* Per-page link extraction result.
* - pageNumber: the physical page index (1-based) within the PDF document.
* - pageLabel: optional printed page label shown by PDF viewers (e.g. "iii", "1", "A-1");
* this can differ from the physical page number and may be undefined
* when the document does not provide labels.
* - links: array of text-&gt;URL mappings that were found/overlaid on the page.
* - width/height: page dimensions in PDF units for the viewport used.
*/
export declare type PageLinkResult = {
pageNumber: number;
pageLabel?: string | null;
links: Array<{
text: string;
url: string;
}>;
width: number;
height: number;
};
/**
* @public
* PageTableResult
*/
export declare interface PageTableResult {
/** Page number. NOTE(review): presumably 1-based like `PageImages.pageNumber` — confirm. */
num: number;
/** Tables found on this page, each as rows of cell strings. */
tables: TableArray[];
}
/**
* @public
* PageTextResult
*/
export declare interface PageTextResult {
/** Page number. NOTE(review): presumably 1-based like `PageImages.pageNumber` — confirm. */
num: number;
/** Text extracted for this page. */
text: string;
}
/**
* @public
* ParseParameters
* Options to control parsing behavior and output formatting.
*/
export declare interface ParseParameters {
/**
* Array of page numbers to parse.
* When provided, only these pages will be parsed and returned in the same order.
* Example: [1, 3, 5]. Parse only one page: [7].
* Default: `undefined`.
*/
partial?: Array<number>;
/**
* Parse the first N pages (pages 1..N).
* Ignored when `partial` is provided. If both `first` and `last` are set, they define
* an explicit inclusive page range (first..last) and this "first N" semantics is ignored.
* Default: `undefined`.
*/
first?: number;
/**
* Parse the last N pages (pages total-N+1..total).
* Ignored when `partial` is provided. If both `first` and `last` are set, they define
* an explicit inclusive page range (first..last) and this "last N" semantics is ignored.
* Default: `undefined`.
*/
last?: number;
/**
* Collect per-page metadata such as embedded links, title, pageLabel, and dimensions;
* ISBN, DOI, abstract, and references are work in progress when getInfo() is used.
* Default: `false`.
*/
parsePageInfo?: boolean;
/**
* Attempt to detect and include hyperlink annotations (e.g. URLs) associated with text.
* Detected links are formatted as Markdown inline links (for example: [text](https://example.com)).
* Default: `false`.
*/
parseHyperlinks?: boolean;
/**
* Enforce logical line breaks by inserting a newline when the vertical distance
* between text items exceeds `lineThreshold`.
* Useful to preserve paragraph/line structure when text items are emitted as separate segments.
* Default: `true`.
*/
lineEnforce?: boolean;
/**
* Threshold to decide whether nearby text items belong to different lines.
* Larger values make the parser more likely to start a new line between items.
* Default: `4.6`.
*/
lineThreshold?: number;
/**
* String inserted between text items on the same line when a sufficiently large horizontal gap is detected.
* Typically used to emulate a cell/column separator (for example, "\\t" for tabs).
* Default: `'\t'`.
*/
cellSeparator?: string;
/**
* Horizontal distance threshold to decide when two text items on the same baseline should be treated as separate cells.
* Larger value produces fewer (wider) cells; smaller value creates more cell breaks.
* Default: `7`.
*/
cellThreshold?: number;
/**
* Optional string appended at the end of each page's extracted text to mark page boundaries.
* Supports placeholders `page_number` and `total_number` which are substituted accordingly.
* If omitted or empty, no page boundary marker is added.
* Default: `'\n-- page_number of total_number --'`.
*/
pageJoiner?: string;
/**
* Optional string used to join text items when returning a page's text.
* If provided, this value is used instead of the default empty-string joining behavior.
* Default: `undefined`.
*/
itemJoiner?: string;
/**
* Minimum image dimension (in pixels) for width or height.
* When set, images whose width OR height is less than or equal to this value are ignored by `getImage()`.
* Useful for excluding tiny decorative or tracking images.
* Default: `80`.
* Disable: `0`.
*/
imageThreshold?: number;
/**
* Screenshot scale factor: use 1 for the original size, 1.5 for a 50% larger image, etc.
* Default: `1`.
*/
scale?: number;
/**
* Desired screenshot width in pixels.
* When set, the scale option is ignored.
* Default: `undefined`.
*/
desiredWidth?: number;
/**
* Applies to both getImage() and getScreenshot(): include the image as a base64 data URL string.
* Default: `true`.
*/
imageDataUrl?: boolean;
/**
* Applies to both getImage() and getScreenshot(): include the image as a binary buffer.
* Default: `true`.
*/
imageBuffer?: boolean;
/**
* Include marked content items in the items array of TextContent to capture PDF "marked content".
* Enables tags (MCID, role/props) and structural/accessibility information useful for mapping text ↔ structure.
* For plain text extraction it's usually false (trade-off: larger output).
* Default: `false`.
*/
includeMarkedContent?: boolean;
/**
* When true, text normalization is NOT performed in the worker thread.
* For plain text extraction, normalizing in the worker (false) is usually recommended.
* Default: `false`.
*/
disableNormalization?: boolean;
}
/**
* Error indicating a PDF file requires a password or the provided password is incorrect.
*
* @public
*/
export declare class PasswordException extends Error {
/**
* Create a new PasswordException.
* @param message - Optional error message.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, cause?: unknown);
}
export { PDFDataRangeTransport }
/**
* @public
* Loads PDF documents and exposes helpers for text, image, table, metadata, and screenshot extraction.
*/
export declare class PDFParse {
private readonly options;
private doc;
progress: {
loaded: number;
total: number;
};
/**
* Create a new parser with `LoadParameters`.
* Converts Node.js `Buffer` data to `Uint8Array` automatically and ensures a default verbosity level.
* @param options - Initialization parameters.
*/
constructor(options: LoadParameters);
destroy(): Promise<void>;
static get isNodeJS(): boolean;
static setWorker(workerSrc?: string): string;
/**
* Load document-level metadata (info, outline, permissions, page labels) and optionally gather per-page link details.
* @param params - Parse options; set `parsePageInfo` to collect per-page metadata described in `ParseParameters`.
* @returns Aggregated document metadata in an `InfoResult`.
*/
getInfo(params?: ParseParameters): Promise<InfoResult>;
private getPageLinks;
/**
* Extract plain text for each requested page, optionally enriching hyperlinks and enforcing line or cell separators.
* @param params - Parse options controlling pagination, link handling, and line/cell thresholds.
* @returns A `TextResult` containing page-wise text and a concatenated document string.
*/
getText(params?: ParseParameters): Promise<TextResult>;
private load;
private shouldParse;
private getPageText;
private getHyperlinks;
/**
* Extract embedded images from requested pages.
*
* Behavior notes:
* - Pages are selected according to ParseParameters (partial, first, last).
* - Images smaller than `params.imageThreshold` (width OR height) are skipped.
* - Returned ImageResult contains per-page PageImages; each image entry includes:
* - data: Uint8Array (present when params.imageBuffer === true)
* - dataUrl: string (present when params.imageDataUrl === true)
* - width, height, kind, name
* - Works in both Node.js (canvas.toBuffer) and browser (canvas.toDataURL) environments.
*
* @param params - ParseParameters controlling page selection, thresholds and output format.
* @returns Promise<ImageResult> with extracted images grouped by page.
*/
getImage(params?: ParseParameters): Promise<ImageResult>;
private convertToRGBA;
private resolveEmbeddedImage;
/**
* Render pages to raster screenshots.
*
* Behavior notes:
* - Pages are selected according to ParseParameters (partial, first, last).
* - Use params.scale for zoom; if params.desiredWidth is specified it takes precedence.
* - Each ScreenshotResult page contains:
* - data: Uint8Array (when params.imageBuffer === true)
* - dataUrl: string (when params.imageDataUrl === true)
* - pageNumber, width, height, scale
* - Works in both Node.js (canvas.toBuffer) and browser (canvas.toDataURL) environments.
*
* @param parseParams - ParseParameters controlling page selection and render options.
* @returns Promise<ScreenshotResult> with rendered page images.
*/
getScreenshot(parseParams?: ParseParameters): Promise<ScreenshotResult>;
/**
* Detect and extract tables from pages by analysing vector drawing operators, then populate cells with text.
*
* Behavior notes:
* - Scans operator lists for rectangles/lines that form table grids (uses PathGeometry and LineStore).
* - Normalizes detected geometry and matches positioned text to table cells.
* - Honors ParseParameters for page selection.
*
* @param params - ParseParameters controlling which pages to analyse (partial/first/last).
* @returns Promise<TableResult> containing discovered tables per page.
*/
getTable(params?: ParseParameters): Promise<TableResult>;
private getPathGeometry;
private getPageTables;
private fillPageTables;
}
export { PDFWorker }
/** 2D point with numeric `x` / `y` coordinates; a `Shape` subtype. */
export declare class Point extends Shape {
x: number;
y: number;
constructor(x: number, y: number);
/** Compare with another point. NOTE(review): presumably tolerant via `Shape.tolerance` — confirm. */
equal(point: Point): boolean;
/** Implements `Shape.transform`; the declared return type is `this`, so calls can be chained. */
transform(matrix: Array<number>): this;
}
/** Rectangle described by an origin point plus width and height; a `Shape` subtype. */
export declare class Rectangle extends Shape {
/** Origin corner. */
from: Point;
width: number;
height: number;
constructor(from: Point, width: number, height: number);
/** Read-only point. NOTE(review): presumably the corner opposite `from` (`from` offset by width/height) — confirm. */
get to(): Point;
/** Returns lines for this rectangle. NOTE(review): presumably its four edges — confirm. */
getLines(): Line[];
/** Implements `Shape.transform`; the declared return type is `this`, so calls can be chained. */
transform(matrix: Array<number>): this;
}
/**
* Represents an HTTP/network response error encountered while fetching PDF data.
*
* The `status` and `missing` properties mirror values that may be provided
* by the underlying PDF library's network layer.
*
* @public
*/
export declare class ResponseException extends Error {
/**
* Create a new ResponseException.
* @param message - Optional error message.
* @param status - Optional numeric HTTP/status code.
* @param missing - Optional field describing missing resources.
* @param cause - Optional underlying cause.
*/
constructor(message?: string, status?: number, missing?: unknown, cause?: unknown);
}
/**
* @public
* SafeParseParameters
* `ParseParameters` in which `lineThreshold`, `cellThreshold` and `scale` are
* required rather than optional; all other options keep their original optionality.
*/
export declare type SafeParseParameters = Required<Pick<ParseParameters, 'lineThreshold' | 'cellThreshold' | 'scale'>> & ParseParameters;
/**
* @public
* Screenshot
*/
export declare interface Screenshot {
/** Rendered image bytes (documented on `getScreenshot()` as present when `imageBuffer === true`). */
data: Uint8Array;
/** Rendered image as a data URL (documented on `getScreenshot()` as present when `imageDataUrl === true`). */
dataUrl: string;
/** Page this screenshot was rendered from. */
pageNumber: number;
/** Image width. NOTE(review): presumably pixels — confirm. */
width: number;
/** Image height. NOTE(review): presumably pixels — confirm. */
height: number;
/** Scale factor associated with this render. */
scale: number;
}
/**
* @public
* ScreenshotResult
*/
export declare class ScreenshotResult {
/** One entry per rendered page. */
pages: Array<Screenshot>;
/** NOTE(review): presumably the document's total page count, as passed to the constructor — confirm. */
total: number;
constructor(total: number);
}
/**
 * Convert caller-supplied `ParseParameters` into `SafeParseParameters`, i.e. a
 * value in which `lineThreshold`, `cellThreshold` and `scale` are guaranteed present.
 * NOTE(review): presumably fills in the documented defaults for missing options — confirm.
 */
export declare function setDefaultParseParameters(params: ParseParameters): SafeParseParameters;
/** Abstract base of the geometry classes (`Point`, `Line`, `Rectangle`). */
export declare abstract class Shape {
/** Shared numeric tolerance. NOTE(review): presumably used for approximate coordinate comparisons — confirm. */
static tolerance: number;
/** Apply a transform given as a number array; implementations return `this`. */
abstract transform(matrix: Array<number>): this;
/** NOTE(review): presumably applies matrix `m` to the coordinate array `p` and returns the result — confirm. */
static applyTransform(p: Array<number>, m: Array<number>): Array<number>;
}
/**
 * Line-based representation of one table: the horizontal and vertical lines
 * that make up its grid, convertible to cell data via `toData()`.
 *
 * NOTE(review): only the type declaration is visible here; member behavior
 * beyond the signatures is not shown — confirm against the implementation.
 */
export declare class Table {
/** Horizontal lines belonging to this table. */
hLines: Array<Line>;
/** Vertical lines belonging to this table. */
vLines: Array<Line>;
constructor(line: Line);
get isValid(): boolean;
/** Numeric row boundaries; the same name is used for the `TableData` constructor argument. */
get rowPivots(): Array<number>;
/** Numeric column boundaries; the same name is used for the `TableData` constructor argument. */
get colPivots(): Array<number>;
/** Try to add a line. NOTE(review): presumably returns whether the line was accepted into this table — confirm. */
add(line: Line): boolean;
private intersection;
private getSameHorizontal;
private getSameVertical;
private mergeHorizontalLines;
private mergeVerticalLines;
normalize(): void;
verticalExists(line: Line, y1: number, y2: number): boolean;
horizontalExists(line: Line, x1: number, x2: number): boolean;
private findBottomLineIndex;
private findVerticalLineIndexs;
private getRow;
/** Build the cell-level `TableData` for this table. */
toData(): TableData;
}
/** Plain-string form of a table: an array of rows, each an array of cell strings. */
export declare type TableArray = Array<Array<string>>;
/** One cell of a `TableData` grid. */
declare type TableCell = {
/** Lower corner of the cell's bounds; `TableData.findCell` treats the bounds as inclusive. */
minXY: Point;
/** Upper corner of the cell's bounds; `TableData.findCell` treats the bounds as inclusive. */
maxXY: Point;
width: number;
height: number;
/** Number of grid columns covered; `TableData.check` counts a missing or zero value as 1. */
colspan?: number;
/** Number of grid rows covered; `TableData.check` counts a missing or zero value as 1. */
rowspan?: number;
/** Text fragments; `TableData.toArray` concatenates them without a separator and trims the result. */
text: Array<string>;
};
/** Cell-level data of one table: bounding box, rows of cells, and the row/column pivots used for validation. */
declare class TableData {
/** Lower corner of the table's bounding box. */
minXY: Point;
/** Upper corner of the table's bounding box. */
maxXY: Point;
/** Rows of cells; empty after construction and populated by the caller. */
rows: Array<TableRow>;
private rowPivots;
private colPivots;
constructor(minXY: Point, maxXY: Point, rowPivots: Array<number>, colPivots: Array<number>);
/**
 * Returns the first cell (row by row) whose inclusive bounds contain (x, y), or
 * `undefined` when the point is outside the table's bounding box or no cell matches.
 */
findCell(x: number, y: number): TableCell | undefined;
/** Sum of the lengths of all rows (spanning cells are counted once). */
get cellCount(): number;
/** Number of rows. */
get rowCount(): number;
/**
 * `true` when the cells cover exactly `(colPivots.length - 1) * (rowPivots.length - 1)`
 * grid slots, counting each cell as `colspan * rowspan` slots (missing spans count as 1).
 */
check(): boolean;
/** Rows of trimmed cell strings, each built by concatenating the cell's text fragments. */
toArray(): string[][];
}
/**
* Result container with per-page entries (`pages`), a list of
* {@link TableArray} values (`mergedTables`) and a numeric `total`.
*
* @public
*/
export declare class TableResult {
/** Per-page table results. */
pages: Array<PageTableResult>;
/** List of two-dimensional string tables; presumably the tables of all pages combined - confirm. */
mergedTables: TableArray[];
/** Numeric total; presumably the document's page count - confirm. */
total: number;
/**
* @param total - Presumably stored as the `total` field; the constructor
* body is not visible in this declaration.
*/
constructor(total: number);
}
/** One table row: an array of `TableCell` values. */
declare type TableRow = TableCell[];
/**
* Result container with per-page entries (`pages`), a `text` string and a
* numeric `total`.
*
* @public
*/
export declare class TextResult {
/** Per-page text results. */
pages: Array<PageTextResult>;
/** Text of the result as a whole; presumably the page texts combined - confirm. */
text: string;
/** Numeric total; presumably the document's page count - confirm. */
total: number;
/**
* Returns a string for the page selected by `num`.
* @param num - Page selector. Whether it is 0- or 1-based, and what is
* returned for a page that is not present, is not visible here.
*/
getPageText(num: number): string;
/**
* @param total - Presumably stored as the `total` field; the constructor
* body is not visible in this declaration.
*/
constructor(total: number);
}
/**
* Union of the nine typed-array classes listed below.
* `BigInt64Array` and `BigUint64Array` are not members of this union.
*/
export declare type TypedArray =
    | Int8Array
    | Uint8Array
    | Uint8ClampedArray
    | Int16Array
    | Uint16Array
    | Int32Array
    | Uint32Array
    | Float32Array
    | Float64Array;
/**
* Generic wrapper for errors where the library cannot classify the cause.
*
* The `details` property may contain additional information provided by the
* underlying PDF library.
*
* NOTE(review): no `details` member is declared on this class in this
* declaration file - only the constructor parameter of that name. Confirm
* whether the value is exposed as a property at runtime.
*
* @public
*/
export declare class UnknownErrorException extends Error {
/**
* Create a new UnknownErrorException. All three arguments are optional.
* @param message - Optional error message.
* @param details - Optional additional details from the PDF library (declared as `unknown`).
* @param cause - Optional underlying cause (declared as `unknown`).
*/
constructor(message?: string, details?: unknown, cause?: unknown);
}
// Exports the `VerbosityLevel` binding; its import or declaration lies outside this section of the file.
export { VerbosityLevel }
// An empty export list adds no names; it marks the file as a module.
export { }

31974
node_modules/pdf-parse/dist/pdf-parse/web/pdf-parse.es.js generated vendored Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

65152
node_modules/pdf-parse/dist/pdf-parse/web/pdf.worker.mjs generated vendored Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

105
node_modules/pdf-parse/dist/worker/cjs/index.cjs generated vendored Normal file

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More