#!/usr/bin/env node

import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { join } from 'node:path';
import { stderr, stdout } from 'node:process';
import { PDFParse } from 'pdf-parse';
import { getHeader } from 'pdf-parse/node';
import minimist from './minimist.mjs';

const args = minimist(process.argv.slice(2), {
  alias: {
    h: 'help',
    v: 'version',
    o: 'output',
    p: 'pages',
    f: 'format',
    m: 'min',
    s: 'scale',
    w: 'width',
    l: 'large',
  },
  string: ['output', 'pages', 'format', 'min', 'scale', 'width'],
  boolean: ['help', 'version', 'magic', 'large'],
});

if (args.version) {
  const pkgUrl = new URL('../package.json', import.meta.url);
  const pkg = JSON.parse(await readFile(pkgUrl, 'utf8'));
  stdout.write(`${pkg.name} ${pkg.version}\n`);
  process.exit(0);
}

if (args.help || args._.length === 0) {
  showHelp();
  process.exit(0);
}

const command = args._[0];
const filePath = args._[1];

if (!filePath) {
  stderr.write('Error: PDF file path or URL is required\n');
  stderr.write('Use --help for usage information\n');
  process.exit(1);
}

const commands = ['info', 'text', 'image', 'screenshot', 'ss', 'table', 'check'];

if (!commands.includes(command)) {
  stderr.write(`Error: Unknown command '${command}'\n`);
  stderr.write(`Available commands: ${commands.join(', ')}\n`);
  process.exit(1);
}

try {
  await runCommand(command, filePath, args);
} catch (error) {
  // Every failure raised by a handler is reported here in one uniform format.
  stderr.write(`Error: ${error.message}\n`);
  process.exit(1);
}

/**
 * Print the usage text to stdout.
 *
 * The usage line lists the two positional arguments the dispatcher requires,
 * and the `--magic` default reflects the actual parsing: it is declared as a
 * boolean flag without a default, so it is off unless passed.
 */
function showHelp() {
  const help = `Usage: pdf-parse <command> <file> [options]

Commands:
  check       Check PDF file headers and validate format (URL only)
  info        Extract PDF metadata and information
  text        Extract text content from PDF
  image       Extract embedded images from PDF
  screenshot  Generate screenshots of PDF pages (alias: ss)
  table       Extract tabular data from PDF

Options:
  -o, --output <path>    Output file path (for single file) or directory (for multiple files)
  -p, --pages <range>    Page range (e.g., 1,3-5,7)
  -f, --format <format>  Output format (json, text, dataurl)
  -m, --min <px>         Minimum image size threshold in pixels (default: 80)
  -s, --scale <factor>   Scale factor for screenshots (default: 1.0)
  -w, --width <px>       Desired width for screenshots in pixels
  -l, --large            Enable optimizations for large PDF files
      --magic            Validate PDF magic bytes (default: false)
  -h, --help             Show this help message
  -v, --version          Show version number

Examples:
  pdf-parse info document.pdf
  pdf-parse text document.pdf --pages 1-3
  pdf-parse screenshot document.pdf --output screenshot.png
  pdf-parse table document.pdf --format json
  pdf-parse image document.pdf --output ./images/
  pdf-parse screenshot document.pdf --output ./screenshots/ --scale 2.0
  pdf-parse check https://bitcoin.org/bitcoin.pdf --magic
  pdf-parse text https://example.com/large.pdf --large --pages 1-5
`;
  stdout.write(help);
}

/**
 * Tell whether the input argument is an http(s) URL rather than a local path.
 *
 * @param {string} filePath - Positional file argument from the command line.
 * @returns {boolean} True when the argument starts with `http://` or `https://`.
 */
function isRemotePath(filePath) {
  return filePath.startsWith('http://') || filePath.startsWith('https://');
}

/**
 * Run one CLI command against a PDF given as a local path or URL.
 *
 * @param {string} command - One of the names in `commands`.
 * @param {string} filePath - Local path or http(s) URL of the PDF.
 * @param {object} options - Parsed command-line options.
 * @returns {Promise<void>}
 * @throws {Error} When reading, parsing or writing fails, or the command is unknown.
 */
async function runCommand(command, filePath, options) {
  // `check` never touches the parser, so handle it before reading the file or
  // constructing one; this also lets its URL-only validation run first.
  if (command === 'check') {
    await handleGetHeader(filePath, options);
    return;
  }

  const initParams = isRemotePath(filePath)
    ? { url: filePath }
    : { data: await readFile(filePath) };

  // Apply large file optimizations if --large flag is used
  if (options.large) {
    initParams.disableAutoFetch = true;
    initParams.disableStream = true;
    initParams.rangeChunkSize = 65536;
  }

  const parser = new PDFParse(initParams);
  try {
    switch (command) {
      case 'info':
        await handleGetInfo(parser, options);
        break;
      case 'text':
        await handleGetText(parser, options);
        break;
      case 'image':
        await handleGetImage(parser, options);
        break;
      case 'screenshot':
      case 'ss':
        await handleGetScreenshot(parser, options);
        break;
      case 'table':
        await handleGetTable(parser, options);
        break;
      default:
        throw new Error(`Unknown command '${command}'`);
    }
  } finally {
    // Release the parser whether the handler succeeded or threw.
    await parser.destroy();
  }
}

/**
 * Write text to the `--output` file when one was given, otherwise to stdout.
 *
 * @param {string} output - Text to emit.
 * @param {object} options - Parsed command-line options; reads `output`.
 * @returns {Promise<void>}
 */
async function writeOutput(output, options) {
  if (options.output) {
    await writeFile(options.output, output);
  } else {
    stdout.write(output);
  }
}

/**
 * Parse a numeric option value and reject anything unusable.
 *
 * @param {string} name - Long option name, used in the error message.
 * @param {string} raw - Raw string value from the command line.
 * @param {(text: string) => number} parse - Conversion function.
 * @param {(value: number) => boolean} isValid - Range check for the parsed value.
 * @returns {number} The parsed value.
 * @throws {Error} When the value does not parse to a finite number in range.
 */
function parseNumericOption(name, raw, parse, isValid) {
  const value = parse(raw);
  if (!Number.isFinite(value) || !isValid(value)) {
    throw new Error(`Invalid value for --${name}: '${raw}'`);
  }
  return value;
}

/**
 * Handle `check`: fetch header information for a remote PDF and print it.
 *
 * @param {string} filePath - Must be an http(s) URL.
 * @param {object} options - Parsed options; reads `magic`, `format`, `output`.
 * @returns {Promise<void>}
 * @throws {Error} When `filePath` is not a URL.
 */
async function handleGetHeader(filePath, options) {
  if (!isRemotePath(filePath)) {
    // Thrown rather than exiting in place; the top-level handler prefixes
    // "Error: " and appends the newline, so stderr output is unchanged.
    throw new Error(
      'check command only works with URLs, not local files\n' +
        'Use: pdf-parse check https://bitcoin.org/bitcoin.pdf',
    );
  }

  // Second parameter is for PDF magic bytes validation
  const result = await getHeader(filePath, !!options.magic);
  const output = options.format === 'json' ? JSON.stringify(result, null, 2) : formatHeader(result);
  await writeOutput(output, options);
}

/**
 * Handle `info`: print document information as JSON or readable text.
 *
 * @param {PDFParse} parser - Parser for the input document.
 * @param {object} options - Parsed options; reads `format`, `output`.
 * @returns {Promise<void>}
 */
async function handleGetInfo(parser, options) {
  const result = await parser.getInfo();
  const output = options.format === 'json' ? JSON.stringify(result, null, 2) : formatInfo(result);
  await writeOutput(output, options);
}

/**
 * Handle `text`: print extracted text, or the full result object as JSON.
 *
 * @param {PDFParse} parser - Parser for the input document.
 * @param {object} options - Parsed options; reads `pages`, `format`, `output`.
 * @returns {Promise<void>}
 */
async function handleGetText(parser, options) {
  const params = parsePageParams(options);
  const result = await parser.getText(params);
  const output = options.format === 'json' ? JSON.stringify(result, null, 2) : result.text;
  await writeOutput(output, options);
}

/**
 * Handle `image`: write embedded images into the `--output` directory, or
 * list them on stdout when no output directory is given.
 *
 * @param {PDFParse} parser - Parser for the input document.
 * @param {object} options - Parsed options; reads `pages`, `format`, `min`, `output`.
 * @returns {Promise<void>}
 * @throws {Error} When `--min` is not a non-negative integer.
 */
async function handleGetImage(parser, options) {
  const params = parsePageParams(options);
  params.imageBuffer = true;
  params.imageDataUrl = options.format === 'dataurl';
  if (options.min) {
    params.imageThreshold = parseNumericOption(
      'min',
      options.min,
      (text) => Number.parseInt(text, 10),
      (value) => value >= 0,
    );
  }

  const result = await parser.getImage(params);

  if (options.output) {
    const outputDir = options.output;
    await createDirectoryIfNeeded(outputDir);

    // The counter runs across pages, so each file name carries a unique index.
    let imageCount = 0;
    for (const page of result.pages) {
      for (const image of page.images) {
        const filename = `page_${page.pageNumber}_image_${imageCount}.png`;
        await writeFile(join(outputDir, filename), image.data);
        imageCount++;
      }
    }
    stdout.write(`Extracted ${imageCount} images to ${outputDir}\n`);
    return;
  }

  if (options.format === 'json') {
    // Remove binary data for JSON output
    const cleanResult = {
      total: result.total,
      pages: result.pages.map((page) => ({
        pageNumber: page.pageNumber,
        imageCount: page.images.length,
        images: page.images.map(({ name, width, height, kind }) => ({ name, width, height, kind })),
      })),
    };
    stdout.write(JSON.stringify(cleanResult, null, 2));
    return;
  }

  const totalImages = result.pages.reduce((sum, page) => sum + page.images.length, 0);
  stdout.write(`Found ${totalImages} images across ${result.total} pages\n`);
  for (const page of result.pages) {
    if (page.images.length === 0) {
      continue;
    }
    stdout.write(`Page ${page.pageNumber}: ${page.images.length} images\n`);
    page.images.forEach((img, index) => {
      stdout.write(` Image ${index}: ${img.width}x${img.height} (${img.name})\n`);
    });
  }
}

/**
 * Handle `screenshot` / `ss`: write one PNG per page into the `--output`
 * directory, or describe the rendered pages on stdout when none is given.
 *
 * @param {PDFParse} parser - Parser for the input document.
 * @param {object} options - Parsed options; reads `pages`, `format`, `scale`, `width`, `output`.
 * @returns {Promise<void>}
 * @throws {Error} When `--scale` or `--width` is not a positive number.
 */
async function handleGetScreenshot(parser, options) {
  const params = parsePageParams(options);
  params.imageBuffer = true;
  params.imageDataUrl = options.format === 'dataurl';
  if (options.scale) {
    params.scale = parseNumericOption('scale', options.scale, Number.parseFloat, (value) => value > 0);
  }
  if (options.width) {
    params.desiredWidth = parseNumericOption(
      'width',
      options.width,
      (text) => Number.parseInt(text, 10),
      (value) => value > 0,
    );
  }

  const result = await parser.getScreenshot(params);

  if (options.output) {
    const outputDir = options.output;
    await createDirectoryIfNeeded(outputDir);

    let screenshotCount = 0;
    for (const page of result.pages) {
      const filename = `page_${page.pageNumber}_screenshot.png`;
      await writeFile(join(outputDir, filename), page.data);
      screenshotCount++;
    }
    stdout.write(`Generated ${screenshotCount} screenshots to ${outputDir}\n`);
    return;
  }

  if (options.format === 'json') {
    // Remove binary data for JSON output
    const cleanResult = {
      total: result.total,
      pages: result.pages.map(({ pageNumber, width, height, scale }) => ({
        pageNumber,
        width,
        height,
        scale,
      })),
    };
    stdout.write(JSON.stringify(cleanResult, null, 2));
    return;
  }

  stdout.write(`Would generate ${result.pages.length} screenshots across ${result.total} pages\n`);
  for (const page of result.pages) {
    stdout.write(`Page ${page.pageNumber}: ${page.width}x${page.height} (scale: ${page.scale})\n`);
  }
}

/**
 * Handle `table`: print extracted tables as JSON or as padded text columns.
 * Honors `--output` in the same way as the `info`, `text` and `check` commands.
 *
 * @param {PDFParse} parser - Parser for the input document.
 * @param {object} options - Parsed options; reads `pages`, `format`, `output`.
 * @returns {Promise<void>}
 */
async function handleGetTable(parser, options) {
  const params = parsePageParams(options);
  const result = await parser.getTable(params);
  const output = options.format === 'json' ? JSON.stringify(result, null, 2) : formatTables(result);
  await writeOutput(output, options);
}

/**
 * Render every table in a table result as text, grouped by page.
 *
 * @param {{ total: number, pages: Array<{ num: number, tables: string[][][] }> }} result
 *   Table result as consumed here; the shape is taken from how this file reads it.
 * @returns {string} Readable listing; pages without tables are omitted.
 */
function formatTables(result) {
  let output = `Found tables across ${result.total} pages:\n\n`;
  for (const page of result.pages) {
    if (page.tables.length === 0) {
      continue;
    }
    output += `Page ${page.num}:\n`;
    page.tables.forEach((table, index) => {
      output += `Table ${index + 1}:\n${formatTable(table)}\n`;
    });
  }
  return output;
}

/**
 * Render one table (an array of rows of cell strings) as padded columns.
 *
 * Column widths are measured over every row, so an empty table or a table
 * whose first row is shorter than later rows is handled without error.
 *
 * @param {string[][]} table - Rows of cells; missing cells are treated as empty.
 * @returns {string} One line per row, each terminated by a newline.
 */
function formatTable(table) {
  const columnCount = Math.max(0, ...table.map((row) => row.length));
  const colWidths = Array.from({ length: columnCount }, (_, col) =>
    Math.max(0, ...table.map((row) => (row[col] || '').length)),
  );

  let output = '';
  for (const row of table) {
    for (let col = 0; col < row.length; col++) {
      const cell = row[col] || '';
      // A column with no content falls back to a width of 10 (deliberate `||`).
      const width = colWidths[col] || 10;
      output += cell.padEnd(width + 2);
    }
    output += '\n';
  }
  return output;
}

/**
 * Translate `--pages` (for example "1,3-5,7") into parser parameters.
 *
 * @param {object} options - Parsed options; reads `pages`.
 * @returns {{ partial?: number[] }} Empty object when `--pages` was not given,
 *   otherwise `partial` holding every requested page number in the order written.
 * @throws {Error} When an entry is not a positive page number or an ascending
 *   `start-end` range.
 */
function parsePageParams(options) {
  const params = {};
  if (!options.pages) {
    return params;
  }

  const singlePage = /^\d+$/;
  const pageSpan = /^(\d+)\s*-\s*(\d+)$/;
  const partial = [];

  for (const rawToken of String(options.pages).split(',')) {
    const token = rawToken.trim();
    const spanMatch = pageSpan.exec(token);

    if (spanMatch) {
      const start = Number(spanMatch[1]);
      const end = Number(spanMatch[2]);
      if (start < 1 || end < start) {
        throw new Error(`Invalid page range '${token}' (expected e.g. 1,3-5,7)`);
      }
      for (let pageNumber = start; pageNumber <= end; pageNumber++) {
        partial.push(pageNumber);
      }
    } else if (singlePage.test(token) && Number(token) >= 1) {
      partial.push(Number(token));
    } else {
      throw new Error(`Invalid page range '${token}' (expected e.g. 1,3-5,7)`);
    }
  }

  params.partial = partial;
  return params;
}

/**
 * Render an info result as readable text.
 *
 * @param {{ total: number, info?: object, metadata?: object }} result - Info result.
 * @returns {string} Page count followed by the enumerable entries of `info`
 *   and `metadata` when those are present.
 */
function formatInfo(result) {
  const lines = [`Total pages: ${result.total}\n`];

  if (result.info) {
    lines.push(`\nDocument Info:\n`);
    for (const [key, value] of Object.entries(result.info)) {
      lines.push(` ${key}: ${value}\n`);
    }
  }

  if (result.metadata) {
    lines.push(`\nMetadata:\n`);
    for (const [key, value] of Object.entries(result.metadata)) {
      lines.push(` ${key}: ${value}\n`);
    }
  }

  return lines.join('');
}

/**
 * Render a header-check result as readable text.
 *
 * @param {{ status: *, size: *, magic: *, headers?: object }} result - Header result.
 * @returns {string} Status, size and magic lines, then each header entry.
 */
function formatHeader(result) {
  // `null` means the magic bytes were not evaluated; anything else is shown as a boolean.
  const magic = result.magic === null ? '-' : !!result.magic;
  const lines = [`Status: ${result.status}\n`, `Size: ${result.size} bytes\n`, `Magic: ${magic}\n`];

  if (result.headers) {
    lines.push(`\nHeaders:\n`);
    for (const [key, value] of Object.entries(result.headers)) {
      lines.push(` ${key}: ${value}\n`);
    }
  }

  return lines.join('');
}

/**
 * Ensure the output directory exists, creating missing parents as needed.
 *
 * With `recursive: true` an existing directory is not an error, so EEXIST can
 * only mean the path exists and is not a directory; that is reported directly
 * rather than being left to fail on the first file write.
 *
 * @param {string} dirPath - Directory to create.
 * @returns {Promise<void>}
 * @throws {Error} When the path exists but is not a directory, or creation fails.
 */
async function createDirectoryIfNeeded(dirPath) {
  try {
    await mkdir(dirPath, { recursive: true });
  } catch (error) {
    if (error.code === 'EEXIST') {
      throw new Error(`Output path '${dirPath}' exists and is not a directory`, { cause: error });
    }
    throw error;
  }
}