Steuerungstechnik/node_modules/pdf-parse/bin/cli.mjs

#!/usr/bin/env node

import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { stderr, stdout } from 'node:process';
import { PDFParse } from 'pdf-parse';
import { getHeader } from 'pdf-parse/node';

import minimist from './minimist.mjs';

const args = minimist(process.argv.slice(2), {
	alias: {
		h: 'help',
		v: 'version',
		o: 'output',
		p: 'pages',
		f: 'format',
		m: 'min',
		s: 'scale',
		w: 'width',
		l: 'large',
	},
	string: ['output', 'pages', 'format', 'min', 'scale', 'width'],
	boolean: ['help', 'version', 'magic', 'large'],
});

if (args.version) {
	const pkg = JSON.parse(await readFile(new URL('../package.json', import.meta.url)));
	stdout.write(`${pkg.name} ${pkg.version}\n`);
	process.exit(0);
}

if (args.help || args._.length === 0) {
	showHelp();
	process.exit(0);
}

const command = args._[0];
const filePath = args._[1];

if (!filePath) {
	stderr.write('Error: PDF file path or URL is required\n');
	stderr.write('Use --help for usage information\n');
	process.exit(1);
}

const commands = ['info', 'text', 'image', 'screenshot', 'ss', 'table', 'check'];

if (!commands.includes(command)) {
	stderr.write(`Error: Unknown command '${command}'\n`);
	stderr.write(`Available commands: ${commands.join(', ')}\n`);
	process.exit(1);
}

try {
	await runCommand(command, filePath, args);
} catch (error) {
	stderr.write(`Error: ${error.message}\n`);
	process.exit(1);
}

function showHelp() {
	const help = `Usage: pdf-parse <command> <file> [options]

Commands:
  check       Check PDF file headers and validate format (URL only)
  info        Extract PDF metadata and information
  text        Extract text content from PDF
  image       Extract embedded images from PDF
  screenshot  Generate screenshots of PDF pages (alias: ss)
  table       Extract tabular data from PDF

Options:
  -o, --output <file>          Output file path (for single file) or directory (for multiple files)
  -p, --pages <range>          Page range (e.g., 1,3-5,7)
  -f, --format <format>        Output format (json, text, dataurl)
  -m, --min <px>               Minimum image size threshold in pixels (default: 80)
  -s, --scale <factor>         Scale factor for screenshots (default: 1.0)
  -w, --width <px>             Desired width for screenshots in pixels
  -l, --large                  Enable optimizations for large PDF files
  --magic                      Validate PDF magic bytes (default: true)
  -h, --help                   Show this help message
  -v, --version                Show version number

Examples:
  pdf-parse info document.pdf
  pdf-parse text document.pdf --pages 1-3
  pdf-parse screenshot document.pdf --output screenshot.png
  pdf-parse table document.pdf --format json
  pdf-parse image document.pdf --output ./images/
  pdf-parse screenshot document.pdf --output ./screenshots/ --scale 2.0
  pdf-parse check https://bitcoin.org/bitcoin.pdf --magic
  pdf-parse text https://example.com/large.pdf --large --pages 1-5
`;
	stdout.write(help);
}

async function runCommand(command, filePath, options) {
	let initParams;

	if (filePath.startsWith('http://') || filePath.startsWith('https://')) {
		initParams = { url: filePath };
	} else {
		const data = await readFile(filePath);
		initParams = { data };
	}

	// Apply large file optimizations if --large flag is used
	if (options.large) {
		initParams.disableAutoFetch = true;
		initParams.disableStream = true;
		initParams.rangeChunkSize = 65536;
	}

	const parser = new PDFParse(initParams);

	try {
		switch (command) {
			case 'check':
				await handleGetHeader(filePath, options);
				break;
			case 'info':
				await handleGetInfo(parser, options);
				break;
			case 'text':
				await handleGetText(parser, options);
				break;
			case 'image':
				await handleGetImage(parser, options);
				break;
			case 'screenshot':
			case 'ss':
				await handleGetScreenshot(parser, options);
				break;
			case 'table':
				await handleGetTable(parser, options);
				break;
		}
	} finally {
		await parser.destroy();
	}
}

async function handleGetHeader(filePath, options) {
	// Check if it's a URL
	if (!filePath.startsWith('http://') && !filePath.startsWith('https://')) {
		stderr.write('Error: check command only works with URLs, not local files\n');
		stderr.write('Use: pdf-parse check https://bitcoin.org/bitcoin.pdf\n');
		process.exit(1);
	}

	// Second parameter is for PDF magic bytes validation
	const result = await getHeader(filePath, !!options.magic);
	const output = options.format === 'json' ? JSON.stringify(result, null, 2) : formatHeader(result);

	if (options.output) {
		await writeFile(options.output, output);
	} else {
		stdout.write(output);
	}
}

async function handleGetInfo(parser, options) {
	const result = await parser.getInfo();
	const output = options.format === 'json' ? JSON.stringify(result, null, 2) : formatInfo(result);

	if (options.output) {
		await writeFile(options.output, output);
	} else {
		stdout.write(output);
	}
}

async function handleGetText(parser, options) {
	const params = parsePageParams(options);
	const result = await parser.getText(params);
	const output = options.format === 'json' ? JSON.stringify(result, null, 2) : result.text;

	if (options.output) {
		await writeFile(options.output, output);
	} else {
		stdout.write(output);
	}
}

async function handleGetImage(parser, options) {
	const params = parsePageParams(options);
	params.imageBuffer = true;
	params.imageDataUrl = options.format === 'dataurl';

	if (options.min) {
		params.imageThreshold = parseInt(options.min, 10);
	}

	const result = await parser.getImage(params);

	if (options.output) {
		// Create output directory if it doesn't exist
		const outputDir = options.output;
		await createDirectoryIfNeeded(outputDir);

		let imageCount = 0;
		for (const page of result.pages) {
			for (const image of page.images) {
				const ext = 'png';
				const filename = `page_${page.pageNumber}_image_${imageCount}.${ext}`;
				const filepath = `${outputDir}/${filename}`;

				await writeFile(filepath, image.data);
				imageCount++;
			}
		}

		stdout.write(`Extracted ${imageCount} images to ${outputDir}\n`);
	} else {
		// List images without extracting
		let totalImages = 0;
		for (const page of result.pages) {
			totalImages += page.images.length;
		}

		if (options.format === 'json') {
			// Remove binary data for JSON output
			const cleanResult = {
				total: result.total,
				pages: result.pages.map((page) => ({
					pageNumber: page.pageNumber,
					imageCount: page.images.length,
					images: page.images.map((img) => ({
						name: img.name,
						width: img.width,
						height: img.height,
						kind: img.kind,
					})),
				})),
			};
			stdout.write(JSON.stringify(cleanResult, null, 2));
		} else {
			stdout.write(`Found ${totalImages} images across ${result.total} pages\n`);
			for (const page of result.pages) {
				if (page.images.length > 0) {
					stdout.write(`Page ${page.pageNumber}: ${page.images.length} images\n`);
					for (let i = 0; i < page.images.length; i++) {
						const img = page.images[i];
						stdout.write(`  Image ${i}: ${img.width}x${img.height} (${img.name})\n`);
					}
				}
			}
		}
	}
}

async function handleGetScreenshot(parser, options) {
	const params = parsePageParams(options);
	params.imageBuffer = true;
	params.imageDataUrl = options.format === 'dataurl';

	if (options.scale) {
		params.scale = parseFloat(options.scale);
	}

	if (options.width) {
		params.desiredWidth = parseInt(options.width, 10);
	}

	const result = await parser.getScreenshot(params);

	if (options.output) {
		// Create output directory if it doesn't exist
		const outputDir = options.output;
		await createDirectoryIfNeeded(outputDir);

		let screenshotCount = 0;
		for (const page of result.pages) {
			const ext = 'png';
			const filename = `page_${page.pageNumber}_screenshot.${ext}`;
			const filepath = `${outputDir}/${filename}`;

			await writeFile(filepath, page.data);
			screenshotCount++;
		}

		stdout.write(`Generated ${screenshotCount} screenshots to ${outputDir}\n`);
	} else {
		// List screenshots without generating
		if (options.format === 'json') {
			// Remove binary data for JSON output
			const cleanResult = {
				total: result.total,
				pages: result.pages.map((page) => ({
					pageNumber: page.pageNumber,
					width: page.width,
					height: page.height,
					scale: page.scale,
				})),
			};
			stdout.write(JSON.stringify(cleanResult, null, 2));
		} else {
			stdout.write(`Would generate ${result.pages.length} screenshots across ${result.total} pages\n`);
			for (const page of result.pages) {
				stdout.write(`Page ${page.pageNumber}: ${page.width}x${page.height} (scale: ${page.scale})\n`);
			}
		}
	}
}

async function handleGetTable(parser, options) {
	const params = parsePageParams(options);
	const result = await parser.getTable(params);

	if (options.format === 'json') {
		stdout.write(JSON.stringify(result, null, 2));
	} else {
		// Text format - pretty print tables
		let output = `Found tables across ${result.total} pages:\n\n`;

		for (const page of result.pages) {
			if (page.tables.length > 0) {
				output += `Page ${page.num}:\n`;
				for (let i = 0; i < page.tables.length; i++) {
					output += `Table ${i + 1}:\n`;
					const table = page.tables[i];

					// Calculate column widths
					const colWidths = [];
					for (let col = 0; col < table[0].length; col++) {
						let maxWidth = 0;
						for (const row of table) {
							if (row[col]) {
								maxWidth = Math.max(maxWidth, row[col].length);
							}
						}
						colWidths[col] = maxWidth;
					}

					// Print table
					for (const row of table) {
						for (let col = 0; col < row.length; col++) {
							const cell = row[col] || '';
							const width = colWidths[col] || 10;
							output += cell.padEnd(width + 2);
						}
						output += '\n';
					}
					output += '\n';
				}
			}
		}

		stdout.write(output);
	}
}

function parsePageParams(options) {
	const params = {};

	if (options.pages) {
		// Parse page range like "1,3-5,7" into partial array
		const partial = [];
		const ranges = options.pages.split(',');

		for (const range of ranges) {
			if (range.includes('-')) {
				const [start, end] = range.split('-').map((n) => parseInt(n.trim(), 10));
				for (let i = start; i <= end; i++) {
					partial.push(i);
				}
			} else {
				partial.push(parseInt(range.trim(), 10));
			}
		}

		params.partial = partial;
	}

	return params;
}

function formatInfo(result) {
	let output = `Total pages: ${result.total}\n`;

	if (result.info) {
		output += `\nDocument Info:\n`;
		for (const [key, value] of Object.entries(result.info)) {
			output += `  ${key}: ${value}\n`;
		}
	}

	if (result.metadata) {
		output += `\nMetadata:\n`;
		for (const [key, value] of Object.entries(result.metadata)) {
			output += `  ${key}: ${value}\n`;
		}
	}

	return output;
}

function formatHeader(result) {
	const magic = result.magic === null ? '-' : !!result.magic;
	let output = `Status: ${result.status}\n`;
	output += `Size: ${result.size} bytes\n`;
	output += `Magic: ${magic}\n`;

	if (result.headers) {
		output += `\nHeaders:\n`;
		for (const [key, value] of Object.entries(result.headers)) {
			output += `  ${key}: ${value}\n`;
		}
	}

	return output;
}

async function createDirectoryIfNeeded(dirPath) {
	try {
		await mkdir(dirPath, { recursive: true });
	} catch (error) {
		if (error.code !== 'EEXIST') {
			throw error;
		}
	}
}