Files
Steuerungstechnik/node_modules/pdf-parse/bin/cli.mjs
2025-12-15 19:52:56 +01:00

423 lines
11 KiB
JavaScript

#!/usr/bin/env node
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { stderr, stdout } from 'node:process';
import { PDFParse } from 'pdf-parse';
import { getHeader } from 'pdf-parse/node';
import minimist from './minimist.mjs';
// Parse the command line. Short flags are aliases for the long option names;
// value-carrying options are kept as strings and converted by the handlers.
const args = minimist(process.argv.slice(2), {
  alias: {
    h: 'help',
    v: 'version',
    o: 'output',
    p: 'pages',
    f: 'format',
    m: 'min',
    s: 'scale',
    w: 'width',
    l: 'large',
  },
  string: ['output', 'pages', 'format', 'min', 'scale', 'width'],
  boolean: ['help', 'version', 'magic', 'large'],
});

// --version: print "<name> <version>" taken from the package manifest next to bin/.
if (args.version) {
  const manifestUrl = new URL('../package.json', import.meta.url);
  const { name, version } = JSON.parse(await readFile(manifestUrl));
  stdout.write(`${name} ${version}\n`);
  process.exit(0);
}

// --help, or no positional arguments at all: print usage and exit successfully.
if (args.help || args._.length === 0) {
  showHelp();
  process.exit(0);
}

// Positional arguments: <command> <file-or-url>.
const [command, filePath] = args._;

if (!filePath) {
  stderr.write('Error: PDF file path or URL is required\n');
  stderr.write('Use --help for usage information\n');
  process.exit(1);
}

const commands = ['info', 'text', 'image', 'screenshot', 'ss', 'table', 'check'];
const isKnownCommand = commands.includes(command);
if (!isKnownCommand) {
  stderr.write(`Error: Unknown command '${command}'\n`);
  stderr.write(`Available commands: ${commands.join(', ')}\n`);
  process.exit(1);
}

// Run the command; any failure is reported on stderr with exit status 1.
try {
  await runCommand(command, filePath, args);
} catch (failure) {
  stderr.write(`Error: ${failure.message}\n`);
  process.exit(1);
}
/**
 * Print the CLI usage text (commands, options and examples) to stdout.
 *
 * The `--magic` line states the default as false: the flag is parsed as a
 * plain boolean without a default and forwarded as `!!options.magic`, so the
 * magic-byte check only runs when the flag is given explicitly.
 */
function showHelp() {
  const help = `Usage: pdf-parse <command> <file> [options]
Commands:
check Check PDF file headers and validate format (URL only)
info Extract PDF metadata and information
text Extract text content from PDF
image Extract embedded images from PDF
screenshot Generate screenshots of PDF pages (alias: ss)
table Extract tabular data from PDF
Options:
-o, --output <file> Output file path (for single file) or directory (for multiple files)
-p, --pages <range> Page range (e.g., 1,3-5,7)
-f, --format <format> Output format (json, text, dataurl)
-m, --min <px> Minimum image size threshold in pixels (default: 80)
-s, --scale <factor> Scale factor for screenshots (default: 1.0)
-w, --width <px> Desired width for screenshots in pixels
-l, --large Enable optimizations for large PDF files
--magic Validate PDF magic bytes (default: false)
-h, --help Show this help message
-v, --version Show version number
Examples:
pdf-parse info document.pdf
pdf-parse text document.pdf --pages 1-3
pdf-parse screenshot document.pdf --output screenshot.png
pdf-parse table document.pdf --format json
pdf-parse image document.pdf --output ./images/
pdf-parse screenshot document.pdf --output ./screenshots/ --scale 2.0
pdf-parse check https://bitcoin.org/bitcoin.pdf --magic
pdf-parse text https://example.com/large.pdf --large --pages 1-5
`;
  stdout.write(help);
}
/**
 * Execute one CLI command against a PDF given as a local path or http(s) URL.
 *
 * @param {string} command - One of the validated command names.
 * @param {string} filePath - Local file path, or an http:// / https:// URL.
 * @param {object} options - Parsed CLI options (minimist result).
 * @returns {Promise<void>}
 */
async function runCommand(command, filePath, options) {
  // `check` only inspects the remote resource via handleGetHeader and never
  // touches a parser, so dispatch it before reading the file or building one.
  // This also lets its "URLs only" error surface for local paths instead of
  // being masked by a file-read failure.
  if (command === 'check') {
    await handleGetHeader(filePath, options);
    return;
  }

  const isRemote = filePath.startsWith('http://') || filePath.startsWith('https://');
  const initParams = isRemote ? { url: filePath } : { data: await readFile(filePath) };

  // Apply large file optimizations if --large flag is used
  if (options.large) {
    initParams.disableAutoFetch = true;
    initParams.disableStream = true;
    initParams.rangeChunkSize = 65536;
  }

  const parser = new PDFParse(initParams);
  try {
    switch (command) {
      case 'info':
        await handleGetInfo(parser, options);
        break;
      case 'text':
        await handleGetText(parser, options);
        break;
      case 'image':
        await handleGetImage(parser, options);
        break;
      case 'screenshot':
      case 'ss':
        await handleGetScreenshot(parser, options);
        break;
      case 'table':
        await handleGetTable(parser, options);
        break;
    }
  } finally {
    // Always release the parser, whether the handler succeeded or threw.
    await parser.destroy();
  }
}
/**
 * Handle the `check` command: fetch header information for a remote PDF and
 * print it, or write it to `options.output` when given.
 *
 * @param {string} filePath - Must be an http:// or https:// URL.
 * @param {object} options - Parsed CLI options; reads `magic`, `format`, `output`.
 * @returns {Promise<void>}
 * @throws {Error} When `filePath` is not an http(s) URL.
 */
async function handleGetHeader(filePath, options) {
  const isUrl = filePath.startsWith('http://') || filePath.startsWith('https://');
  if (!isUrl) {
    // Throw instead of calling process.exit() here so the caller's cleanup
    // (`finally`) still runs. The entry point's catch prints
    // `Error: <message>` and exits with status 1, so the two-line stderr
    // output is the same as before.
    throw new Error(
      'check command only works with URLs, not local files\n' +
        'Use: pdf-parse check https://bitcoin.org/bitcoin.pdf',
    );
  }

  // Second parameter is for PDF magic bytes validation
  const result = await getHeader(filePath, !!options.magic);
  const output = options.format === 'json' ? JSON.stringify(result, null, 2) : formatHeader(result);
  if (options.output) {
    await writeFile(options.output, output);
  } else {
    stdout.write(output);
  }
}
/**
 * Handle the `info` command: obtain document information from the parser and
 * emit it as JSON (`--format json`) or as formatted text.
 *
 * @param {object} parser - Parser instance exposing `getInfo()`.
 * @param {object} options - Parsed CLI options; reads `format` and `output`.
 * @returns {Promise<void>}
 */
async function handleGetInfo(parser, options) {
  const info = await parser.getInfo();
  const wantsJson = options.format === 'json';
  const rendered = wantsJson ? JSON.stringify(info, null, 2) : formatInfo(info);

  // Without --output the result goes to stdout; otherwise to the given file.
  if (!options.output) {
    stdout.write(rendered);
    return;
  }
  await writeFile(options.output, rendered);
}
/**
 * Handle the `text` command: extract text (optionally limited by `--pages`)
 * and emit either the full result as JSON or just its `text` field.
 *
 * @param {object} parser - Parser instance exposing `getText(params)`.
 * @param {object} options - Parsed CLI options; reads `pages`, `format`, `output`.
 * @returns {Promise<void>}
 */
async function handleGetText(parser, options) {
  const extraction = await parser.getText(parsePageParams(options));

  let rendered;
  if (options.format === 'json') {
    rendered = JSON.stringify(extraction, null, 2);
  } else {
    rendered = extraction.text;
  }

  // Without --output the result goes to stdout; otherwise to the given file.
  if (!options.output) {
    stdout.write(rendered);
    return;
  }
  await writeFile(options.output, rendered);
}
/**
 * Handle the `image` command.
 *
 * With `--output`, every extracted image is written into that directory as
 * `page_<pageNumber>_image_<n>.png`, where `n` is a running counter across all
 * pages. Without it, the images are only listed, as JSON (binary data
 * omitted) or as text.
 *
 * @param {object} parser - Parser instance exposing `getImage(params)`.
 * @param {object} options - Parsed CLI options; reads `pages`, `format`, `min`, `output`.
 * @returns {Promise<void>}
 */
async function handleGetImage(parser, options) {
  const params = Object.assign(parsePageParams(options), {
    imageBuffer: true,
    imageDataUrl: options.format === 'dataurl',
  });
  if (options.min) {
    params.imageThreshold = Number.parseInt(options.min, 10);
  }

  const result = await parser.getImage(params);

  if (options.output) {
    // Extraction mode: make sure the target directory exists, then write
    // the images one after another.
    const targetDir = options.output;
    await createDirectoryIfNeeded(targetDir);

    let written = 0;
    for (const page of result.pages) {
      for (const image of page.images) {
        await writeFile(`${targetDir}/page_${page.pageNumber}_image_${written}.png`, image.data);
        written += 1;
      }
    }
    stdout.write(`Extracted ${written} images to ${targetDir}\n`);
    return;
  }

  if (options.format === 'json') {
    // Listing mode (JSON): report image properties only, without binary data.
    const summary = {
      total: result.total,
      pages: result.pages.map((page) => ({
        pageNumber: page.pageNumber,
        imageCount: page.images.length,
        images: page.images.map(({ name, width, height, kind }) => ({ name, width, height, kind })),
      })),
    };
    stdout.write(JSON.stringify(summary, null, 2));
    return;
  }

  // Listing mode (text): overall count, then one section per page that has images.
  const totalImages = result.pages.reduce((sum, page) => sum + page.images.length, 0);
  stdout.write(`Found ${totalImages} images across ${result.total} pages\n`);
  for (const page of result.pages) {
    if (page.images.length === 0) {
      continue;
    }
    stdout.write(`Page ${page.pageNumber}: ${page.images.length} images\n`);
    for (const [index, img] of page.images.entries()) {
      stdout.write(` Image ${index}: ${img.width}x${img.height} (${img.name})\n`);
    }
  }
}
/**
 * Handle the `screenshot` / `ss` command.
 *
 * With `--output`, each rendered page is written into that directory as
 * `page_<pageNumber>_screenshot.png`. Without it, only the page dimensions
 * and scale are reported, as JSON (binary data omitted) or as text.
 *
 * @param {object} parser - Parser instance exposing `getScreenshot(params)`.
 * @param {object} options - Parsed CLI options; reads `pages`, `format`, `scale`, `width`, `output`.
 * @returns {Promise<void>}
 */
async function handleGetScreenshot(parser, options) {
  const params = Object.assign(parsePageParams(options), {
    imageBuffer: true,
    imageDataUrl: options.format === 'dataurl',
  });
  if (options.scale) {
    params.scale = Number.parseFloat(options.scale);
  }
  if (options.width) {
    params.desiredWidth = Number.parseInt(options.width, 10);
  }

  const result = await parser.getScreenshot(params);

  if (options.output) {
    // Make sure the target directory exists, then write one PNG per page.
    const targetDir = options.output;
    await createDirectoryIfNeeded(targetDir);

    let written = 0;
    for (const page of result.pages) {
      await writeFile(`${targetDir}/page_${page.pageNumber}_screenshot.png`, page.data);
      written += 1;
    }
    stdout.write(`Generated ${written} screenshots to ${targetDir}\n`);
    return;
  }

  if (options.format === 'json') {
    // Report page geometry only, without the binary image data.
    const summary = {
      total: result.total,
      pages: result.pages.map(({ pageNumber, width, height, scale }) => ({
        pageNumber,
        width,
        height,
        scale,
      })),
    };
    stdout.write(JSON.stringify(summary, null, 2));
    return;
  }

  stdout.write(`Would generate ${result.pages.length} screenshots across ${result.total} pages\n`);
  for (const { pageNumber, width, height, scale } of result.pages) {
    stdout.write(`Page ${pageNumber}: ${width}x${height} (scale: ${scale})\n`);
  }
}
/**
 * Handle the `table` command: extract tables (optionally limited by
 * `--pages`) and emit them as JSON or as aligned plain text.
 *
 * Like the `info`, `text` and `check` handlers, the result is written to
 * `options.output` when given and to stdout otherwise.
 *
 * @param {object} parser - Parser instance exposing `getTable(params)`.
 * @param {object} options - Parsed CLI options; reads `pages`, `format`, `output`.
 * @returns {Promise<void>}
 */
async function handleGetTable(parser, options) {
  const params = parsePageParams(options);
  const result = await parser.getTable(params);
  const output = options.format === 'json' ? JSON.stringify(result, null, 2) : formatTables(result);
  if (options.output) {
    await writeFile(options.output, output);
  } else {
    stdout.write(output);
  }
}

/**
 * Render a table-extraction result as text with padded columns.
 *
 * @param {object} result - Object with `total` and `pages`; each page has
 *   `num` and `tables`, each table being an array of rows of cell strings.
 * @returns {string} The formatted text.
 */
function formatTables(result) {
  let output = `Found tables across ${result.total} pages:\n\n`;
  for (const page of result.pages) {
    if (page.tables.length === 0) {
      continue;
    }
    output += `Page ${page.num}:\n`;
    for (const [index, table] of page.tables.entries()) {
      output += `Table ${index + 1}:\n`;

      // Column widths are derived from the first row's column count; an empty
      // table has no first row and therefore yields no columns (no crash).
      const columnCount = table[0]?.length ?? 0;
      const colWidths = [];
      for (let col = 0; col < columnCount; col++) {
        let maxWidth = 0;
        for (const row of table) {
          if (row[col]) {
            maxWidth = Math.max(maxWidth, row[col].length);
          }
        }
        colWidths[col] = maxWidth;
      }

      for (const row of table) {
        for (let col = 0; col < row.length; col++) {
          const cell = row[col] || '';
          // Fall back to width 10 for all-empty columns and for columns
          // beyond the first row's length.
          const width = colWidths[col] || 10;
          output += cell.padEnd(width + 2);
        }
        output += '\n';
      }
      output += '\n';
    }
  }
  return output;
}
/**
 * Convert the `--pages` option into parser parameters.
 *
 * A specification such as "1,3-5,7" becomes `{ partial: [1, 3, 4, 5, 7] }`.
 * Whitespace around numbers and around the dash is ignored. If the option was
 * given several times (an array of values), the values are combined.
 *
 * @param {object} options - Parsed CLI options; only `pages` is read.
 * @returns {{partial?: number[]}} Empty object when no page range was given.
 * @throws {Error} When a token is not a positive page number or an ascending
 *   `start-end` range. Malformed input is rejected rather than silently
 *   producing NaN entries or empty ranges.
 */
function parsePageParams(options) {
  const params = {};
  if (!options.pages) {
    return params;
  }

  const spec = Array.isArray(options.pages) ? options.pages.join(',') : String(options.pages);
  const tokenPattern = /^(\d+)(?:\s*-\s*(\d+))?$/;
  const partial = [];

  for (const rawToken of spec.split(',')) {
    const token = rawToken.trim();
    const match = tokenPattern.exec(token);
    if (!match) {
      throw new Error(`Invalid page range '${token}' (expected e.g. 1,3-5,7)`);
    }
    const start = Number.parseInt(match[1], 10);
    const end = match[2] === undefined ? start : Number.parseInt(match[2], 10);
    if (start < 1 || end < start) {
      throw new Error(`Invalid page range '${token}' (pages start at 1 and ranges must be ascending)`);
    }
    for (let page = start; page <= end; page++) {
      partial.push(page);
    }
  }

  params.partial = partial;
  return params;
}
/**
 * Render a document-info result as human-readable text.
 *
 * Always prints the total page count, followed by a "Document Info" section
 * and a "Metadata" section when `result.info` / `result.metadata` are
 * present, one `key: value` line per own enumerable entry.
 *
 * @param {object} result - Object with `total` and optional `info` / `metadata`.
 * @returns {string} The formatted text.
 */
function formatInfo(result) {
  let output = `Total pages: ${result.total}\n`;
  const sections = [
    ['Document Info', result.info],
    ['Metadata', result.metadata],
  ];
  for (const [title, entries] of sections) {
    if (!entries) {
      continue;
    }
    output += `\n${title}:\n`;
    for (const [key, value] of Object.entries(entries)) {
      output += ` ${key}: ${formatInfoValue(value)}\n`;
    }
  }
  return output;
}

/**
 * Stringify a single info value. Nested objects and arrays are rendered as
 * JSON so they do not collapse to "[object Object]"; primitives and Dates
 * keep their ordinary string form.
 *
 * @param {*} value - The value to render.
 * @returns {string} The rendered value.
 */
function formatInfoValue(value) {
  if (value === null || typeof value !== 'object' || value instanceof Date) {
    return String(value);
  }
  try {
    // JSON.stringify returns undefined when a toJSON() yields undefined.
    return JSON.stringify(value) ?? String(value);
  } catch {
    // Cyclic structures or BigInt members cannot be serialized; fall back.
    return String(value);
  }
}
/**
 * Render a header-check result as human-readable text.
 *
 * Prints status, size and the magic-bytes outcome ("-" when `magic` is null,
 * otherwise its boolean value), followed by a "Headers" section when
 * `result.headers` is present.
 *
 * @param {object} result - Object with `status`, `size`, `magic` and optional `headers`.
 * @returns {string} The formatted text.
 */
function formatHeader(result) {
  const magicLabel = result.magic === null ? '-' : Boolean(result.magic);
  const parts = [
    `Status: ${result.status}\n`,
    `Size: ${result.size} bytes\n`,
    `Magic: ${magicLabel}\n`,
  ];
  if (result.headers) {
    parts.push(`\nHeaders:\n`);
    for (const [name, value] of Object.entries(result.headers)) {
      parts.push(` ${name}: ${value}\n`);
    }
  }
  return parts.join('');
}
/**
 * Ensure that `dirPath` exists as a directory, creating missing parents.
 *
 * Per the Node.js documentation, `mkdir` with `recursive: true` does not
 * reject when the directory already exists, so an EEXIST rejection here means
 * the path is occupied by something that is not a directory. That case is
 * reported with a clear error instead of being ignored, because ignoring it
 * only defers the failure to the first file written into the "directory".
 *
 * @param {string} dirPath - Directory to create.
 * @returns {Promise<void>}
 * @throws {Error} When the path exists but is not a directory, or when the
 *   directory cannot be created for any other reason.
 */
async function createDirectoryIfNeeded(dirPath) {
  try {
    await mkdir(dirPath, { recursive: true });
  } catch (error) {
    if (error.code === 'EEXIST') {
      throw new Error(`Output path '${dirPath}' exists and is not a directory`, { cause: error });
    }
    throw error;
  }
}