From 4eb7ddfd995a50e9c11fb4e08dc6c9712e31bca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=BE=E6=B5=A6=20=E7=9F=A5=E4=B9=9F=20Matsuura=20Tomoy?= =?UTF-8?q?a?= Date: Sat, 1 Nov 2025 18:04:28 -0400 Subject: [PATCH] initial commit --- .gitignore | 57 +++++++ README.md | 84 ++++++++++ convert.sh | 114 +++++++++++++ ocr_pdf.py | 424 +++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 24 +++ 5 files changed, 703 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100755 convert.sh create mode 100644 ocr_pdf.py create mode 100644 pyproject.toml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a77f8ba --- /dev/null +++ b/.gitignore @@ -0,0 +1,57 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual Environment +.venv/ +venv/ +ENV/ +env/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# UV +uv.lock + +# Project specific +out_pdfs/ + +# Logs +*.log + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..1571b8a --- /dev/null +++ b/README.md @@ -0,0 +1,84 @@ +# jpegdoc2pdf + +Convert smartphone JPGs of typewritten English documents into searchable **OCRed PDFs** using parallel batch processing. 
+ +## Prerequisites + +Install the following tools: +- **Tesseract OCR** (ensure it's in PATH) +- **img2pdf** - lossless image to PDF converter +- **ocrmypdf** - adds OCR layer to PDFs + +```bash +# macOS +brew install tesseract img2pdf ocrmypdf + +# Linux (Debian/Ubuntu) +apt-get install tesseract-ocr img2pdf ocrmypdf +``` + +## Usage + +### Basic Usage + +```bash +./convert.sh ROOT_DIR [OUT_DIR] [-P N] [--recursive] +``` + +### Examples + +**Process subdirectories in ROOT with default settings:** +```bash +./convert.sh ./ROOT +``` + +**Specify custom output directory:** +```bash +./convert.sh ./ROOT ./my_output +``` + +**Use 4 parallel processes:** +```bash +./convert.sh ./ROOT ./out_pdfs -P 4 +``` + +**Process nested subdirectories recursively:** +```bash +./convert.sh ./ROOT ./out_pdfs -P 4 --recursive +``` + +## Folder Structure + +Organize your images with one subdirectory per PDF: + +``` +ROOT/ + CaseA/ + 001.jpg + 002.jpg + CaseB/ + page1.jpg + page2.jpg +``` + +- Each subdirectory under `ROOT` becomes a single PDF +- Nested subfolders (with `--recursive`) are named like `Parent__Child.pdf` +- Output PDFs are saved to `out_pdfs/` (or your specified output directory) + +## Options + +- **ROOT_DIR** (required): Root directory containing subdirectories of images +- **OUT_DIR** (optional): Output directory (default: `out_pdfs`) +- **-P N** (optional): Number of parallel processes (default: CPU core count) +- **--recursive** or **-r**: Process nested subdirectories recursively + +## Supported Image Formats + +jpg, jpeg, png, tif, tiff (case-insensitive) + +## OCR Settings + +- Language: English (`eng`) +- Tesseract OEM: 1 (LSTM neural net mode) +- Page segmentation mode: 6 (uniform text block) +- Optimization level: 1 diff --git a/convert.sh b/convert.sh new file mode 100755 index 0000000..215884e --- /dev/null +++ b/convert.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env zsh +# 並列バッチOCR(zsh + xargs) +# 依存: img2pdf, ocrmypdf, Tesseract +# 使い方: +# ./batch_ocr_parallel.zsh 
ROOT_DIR [OUT_DIR] [-P N] [--recursive] +# 例: +# ./batch_ocr_parallel.zsh ./ROOT ./out_pdfs -P 4 --recursive + +emulate -L zsh +set -euo pipefail + +# ---- 引数処理 ---- +ROOT="${1:-}" +[[ -z "${ROOT}" ]] && { print -u2 -- "Usage: $0 ROOT_DIR [OUT_DIR] [-P N] [--recursive]"; exit 2; } +shift + +OUT="out_pdfs" +PROCS="" +RECURSIVE=0 + +if (( $# )); then + # 2番目がディレクトリ名っぽければ OUT に採用 + if [[ -n "${1:-}" && "${1:-}" != "-P" && "${1:-}" != "--recursive" && "${1:-}" != "-r" ]]; then + OUT="$1"; shift + fi + # 残りのオプション + while (( $# )); do + case "$1" in + -P) PROCS="${2:-}"; shift 2 ;; + --recursive|-r) RECURSIVE=1; shift ;; + *) print -u2 -- "Unknown arg: $1"; exit 2 ;; + esac + done +fi + +# 並列度のデフォルト(未指定ならCPUコア数) +if [[ -z "${PROCS}" ]]; then + if command -v nproc >/dev/null 2>&1; then + PROCS="$(nproc)" + elif command -v sysctl >/dev/null 2>&1; then + PROCS="$(sysctl -n hw.ncpu)" + else + PROCS=2 + fi +fi + +# 出力先 +mkdir -p -- "$OUT" + +# ---- ワーカー(サブディレクトリ1つを処理)を一時ファイルに作成 ---- +WORKER="$(mktemp -t ocrmypdf_worker.XXXXXX).zsh" +cat > "$WORKER" <<'ZSH' +#!/usr/bin/env zsh +emulate -L zsh +set -euo pipefail +setopt null_glob + +ROOT="$1" +OUT="$2" +DIR="$3" + +name="${${DIR#${ROOT%/}/}#/}" # ROOT/ 以降を相対名に(深い場合は "A/B") +base="${DIR:t}" # 末尾名(単純名) + +# 画像拡張子 +typeset -a exts; exts=( jpg jpeg png tif tiff JPG JPEG PNG TIF TIFF ) + +# 画像列挙 +typeset -a imgs; imgs=() +for ext in $exts; do + imgs+=("$DIR"/*.${ext}) +done + +if (( ${#imgs} == 0 )); then + print -u2 -- "[skip] ${name} (no images)" + exit 0 +fi + +# 一時PDF +tmp_pdf="$(mktemp -t "ocr_${base}.XXXXXX").pdf" + +# 1) 画像 → 無劣化PDF結合(-o を先に、-- の後に入力) +img2pdf --auto-orient -o "$tmp_pdf" -- "${imgs[@]}" + +# 出力ファイル名(ネストを __ でつなぐ) +out_pdf="$OUT/${name//\//__}.pdf" +mkdir -p -- "${out_pdf:h}" + +# 2) OCR(タイプ打ち英語向けチューニング) +ocrmypdf \ + -l eng \ + --tesseract-oem 1 \ + --tesseract-pagesegmode 6 \ + --optimize 1 \ + --output-type pdf \ + "$tmp_pdf" "$out_pdf" + +rm -f -- "$tmp_pdf" +print -r -- "✅ Wrote: $out_pdf" +ZSH +chmod +x 
"$WORKER" + +# ---- 対象ディレクトリ列挙 → xargs 並列実行 ---- +if (( RECURSIVE )); then + # ROOT 配下の全サブディレクトリ(ROOT 自身は除外) + find "$ROOT" -mindepth 1 -type d -print0 \ + | xargs -0 -I {} -n 1 -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {} +else + # 直下のサブディレクトリのみ + find "$ROOT" -mindepth 1 -maxdepth 1 -type d -print0 \ + | xargs -0 -I {} -n 1 -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {} +fi + +print -r -- "Batch complete. (parallel: $PROCS)" diff --git a/ocr_pdf.py b/ocr_pdf.py new file mode 100644 index 0000000..6e747f6 --- /dev/null +++ b/ocr_pdf.py @@ -0,0 +1,424 @@ + +#!/usr/bin/env python3 +""" +ocr_pdf.py — Convert JPGs of documents into OCRed PDFs. +Now supports **batch mode** where each subdirectory becomes one PDF. + +Features +- Automatic trimming via document contour detection + perspective warp +- Deskew fallback if no clear document contour is found +- Image cleanup tuned for 1970s typewritten English pages (contrast/binarization) +- Tesseract-based OCR to embed a searchable text layer in the PDF +- Batch multiple images into one output PDF +- Batch mode: process a root folder; each subdirectory becomes its own PDF + +USAGE +------ +# Single-PDF mode (glob patterns allowed) +uv run ocr-pdf -o out.pdf scans/*.jpg + +# Batch mode: each subdirectory under ROOT becomes a PDF +uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs + +# Batch mode with filters +uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs --patterns "*.jpg,*.png" --recursive + +DEPENDENCIES +------------ +- Tesseract must be installed on your system and in PATH. 
#!/usr/bin/env python3
"""
ocr_pdf.py — Convert JPGs of documents into OCRed PDFs.
Now supports **batch mode** where each subdirectory becomes one PDF.

Features
- Automatic trimming via document contour detection + perspective warp
- Deskew fallback if no clear document contour is found
- Image cleanup tuned for 1970s typewritten English pages (contrast/binarization)
- Tesseract-based OCR to embed a searchable text layer in the PDF
- Batch multiple images into one output PDF
- Batch mode: process a root folder; each subdirectory becomes its own PDF

USAGE
------
# Single-PDF mode (glob patterns allowed)
uv run ocr-pdf -o out.pdf scans/*.jpg

# Batch mode: each subdirectory under ROOT becomes a PDF
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs

# Batch mode with filters
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs --patterns "*.jpg,*.png" --recursive

DEPENDENCIES
------------
- Tesseract must be installed on your system and in PATH.
  macOS (brew): brew install tesseract
  Ubuntu/Debian: sudo apt-get install tesseract-ocr
  Windows: Install from https://github.com/UB-Mannheim/tesseract/wiki
"""

import argparse
import fnmatch
import glob
import os
import re
import sys
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple

import cv2
import numpy as np
import pytesseract
from pypdf import PdfReader, PdfWriter

# -----------------------------
# Geometry helpers
# -----------------------------

def _order_quad_points(pts: np.ndarray) -> np.ndarray:
    """Order 4 points as (top-left, top-right, bottom-right, bottom-left).

    Args:
        pts: Four (x, y) points; any array that reshapes to (4, 2).

    Returns:
        A float32 array of shape (4, 2) in the order listed above.

    Raises:
        ValueError: If ``pts`` does not contain exactly four points.
    """
    quad = np.asarray(pts, dtype="float32").reshape(-1, 2)
    if quad.shape[0] != 4:
        raise ValueError(f"expected 4 points, got {quad.shape[0]}")

    sums = quad.sum(axis=1)          # x + y: smallest at top-left, largest at bottom-right
    diffs = quad[:, 1] - quad[:, 0]  # y - x: smallest at top-right, largest at bottom-left
    picks = [
        int(np.argmin(sums)),
        int(np.argmin(diffs)),
        int(np.argmax(sums)),
        int(np.argmax(diffs)),
    ]
    if len(set(picks)) == 4:
        return quad[picks]

    # The sum/diff heuristic chose the same point for two corners (this
    # happens for diamond-like quads). Fall back to ordering the points by
    # angle around their centroid, which always uses each point exactly once,
    # and start the cycle at the point with the smallest x + y.
    center = quad.mean(axis=0)
    angles = np.arctan2(quad[:, 1] - center[1], quad[:, 0] - center[0])
    ring = quad[np.argsort(angles, kind="stable")]
    start = int(np.argmin(ring.sum(axis=1)))
    return np.roll(ring, -start, axis=0)


def four_point_transform(image: np.ndarray, pts: np.ndarray) -> np.ndarray:
    """Warp the quadrilateral ``pts`` of ``image`` onto an upright rectangle.

    Args:
        image: Source image.
        pts: Four (x, y) corner points in ``image`` coordinates, any order.

    Returns:
        The warped image; its size is the longer of each pair of opposite
        quad edges, truncated to whole pixels.

    Raises:
        ValueError: If the quad is narrower or shorter than two pixels, which
            would make the destination rectangle degenerate.
    """
    rect = _order_quad_points(pts)
    tl, tr, br, bl = rect

    # Output size: the longer of each pair of opposite edges.
    max_width = int(max(np.linalg.norm(br - bl), np.linalg.norm(tr - tl)))
    max_height = int(max(np.linalg.norm(tr - br), np.linalg.norm(tl - bl)))
    if max_width < 2 or max_height < 2:
        raise ValueError(f"degenerate quad: {max_width}x{max_height} px")

    dst = np.array(
        [
            [0, 0],
            [max_width - 1, 0],
            [max_width - 1, max_height - 1],
            [0, max_height - 1],
        ],
        dtype="float32",
    )

    matrix = cv2.getPerspectiveTransform(rect, dst)
    return cv2.warpPerspective(image, matrix, (max_width, max_height))


# -----------------------------
# Document detection and cleanup
# -----------------------------

def detect_document_contour(
    image_bgr: np.ndarray, min_area_ratio: float = 0.2
) -> Optional[np.ndarray]:
    """Find the largest 4-point contour that looks like a document page.

    Args:
        image_bgr: BGR image to search.
        min_area_ratio: Smallest fraction of the image area a candidate quad
            must cover. Without this guard a small rectangular mark is
            accepted as "the page" and the real page is cropped away.

    Returns:
        A float32 array of shape (4, 2) in ``image_bgr`` coordinates, or
        ``None`` when no acceptable quad is found.
    """
    height, width = image_bgr.shape[:2]
    if height == 0 or width == 0:
        return None

    ratio = 1000.0 / max(height, width)  # scale longest side to ~1000px for speed
    small_size = (max(1, int(width * ratio)), max(1, int(height * ratio)))
    small = cv2.resize(image_bgr, small_size, interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(gray, 60, 180)

    # Close gaps in the edge map so the page outline forms one contour.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel, iterations=1)

    contours, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)

    min_area = min_area_ratio * small.shape[0] * small.shape[1]
    for contour in contours[:10]:
        perimeter = cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(contour, 0.02 * perimeter, True)
        if len(approx) != 4:
            continue
        # Measure the simplified quad, not the raw contour: that is the
        # region that would actually be warped.
        if not cv2.isContourConvex(approx) or cv2.contourArea(approx) < min_area:
            continue
        # Scale contour back to original image coords.
        return (approx.reshape(4, 2) / ratio).astype(np.float32)
    return None


def deskew(image_gray: np.ndarray) -> np.ndarray:
    """Estimate skew angle with Hough transform on text lines; rotate to correct.

    Args:
        image_gray: Single-channel image.

    Returns:
        The input itself when the estimated skew is under 0.1 degrees,
        otherwise a rotated copy of the same size.
    """
    # Binary, inverted image for line detection.
    blurred = cv2.GaussianBlur(image_gray, (3, 3), 0)
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    inverted = cv2.bitwise_not(binary)

    edges = cv2.Canny(inverted, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 180.0, 150)

    angle_deg = 0.0
    if lines is not None and len(lines):
        # Only the first 200 detected lines are used for the estimate.
        thetas = np.asarray(lines[:200], dtype=np.float64)[:, 0, 1]
        # Convert to degrees relative to horizontal.
        degs = thetas * 180.0 / np.pi - 90.0
        # Normalize to [-45, 45] so near-vertical lines count as near-zero skew.
        degs = np.where(degs < -45, degs + 90, degs)
        degs = np.where(degs > 45, degs - 90, degs)
        angle_deg = float(np.median(degs))

    if abs(angle_deg) < 0.1:
        return image_gray  # no significant skew

    height, width = image_gray.shape[:2]
    matrix = cv2.getRotationMatrix2D((width // 2, height // 2), angle_deg, 1.0)
    return cv2.warpAffine(
        image_gray,
        matrix,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )


def cleanup_for_ocr(image_bgr: np.ndarray) -> np.ndarray:
    """Return a high-contrast, noise-reduced binarized image suitable for OCR.

    Steps, in order: grayscale, CLAHE contrast, bilateral denoise, deskew,
    adaptive threshold, morphological opening.
    """
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # CLAHE to recover typewriter ink contrast without blowing highlights.
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    gray = clahe.apply(gray)

    # Gentle denoise to preserve glyph edges.
    gray = cv2.bilateralFilter(gray, d=7, sigmaColor=50, sigmaSpace=50)

    # Deskew after contrast/denoise.
    gray = deskew(gray)

    # Adaptive threshold tends to work well on aged paper.
    binary = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 15
    )

    # Remove small speckles.
    kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=1)


def prepare_page(image_path: Path) -> np.ndarray:
    """Load, auto-trim (if possible), and clean up a single page image.

    Args:
        image_path: Image file to read.

    Returns:
        The cleaned single-channel uint8 page image.

    Raises:
        RuntimeError: If the file is empty or cannot be decoded as an image.
    """
    # np.fromfile + imdecode instead of cv2.imread, as in the original code.
    # NOTE(review): presumably chosen for non-ASCII paths on Windows — confirm.
    data = np.fromfile(str(image_path), dtype=np.uint8)
    # imdecode is skipped for an empty buffer so that a zero-byte file gets
    # the same RuntimeError as any other undecodable file.
    bgr = cv2.imdecode(data, cv2.IMREAD_COLOR) if data.size else None
    if bgr is None:
        raise RuntimeError(f"Failed to load image: {image_path}")

    page = bgr  # fall back to original framing
    quad = detect_document_contour(bgr)
    if quad is not None:
        try:
            page = four_point_transform(bgr, quad)
        except ValueError:
            # Degenerate quad: keep the original framing rather than lose the page.
            page = bgr

    return cleanup_for_ocr(page)


# -----------------------------
# OCR + PDF assembly
# -----------------------------

def image_to_ocr_pdf_bytes(
    image: np.ndarray, dpi: int, lang: str, oem: Optional[int], psm: Optional[int]
) -> bytes:
    """Use Tesseract to produce searchable PDF bytes for one image page.

    Args:
        image: Single-channel or BGR page image.
        dpi: DPI hint passed to Tesseract as ``--dpi``.
        lang: Tesseract language(s), passed as ``-l``.
        oem: OCR engine mode (``--oem``); omitted when ``None``.
        psm: Page segmentation mode (``--psm``); omitted when ``None``.

    Returns:
        The single-page PDF produced by pytesseract.
    """
    # Tesseract prefers RGB.
    if image.ndim == 2:
        rgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    else:
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Upsample if very small to approach ~300 dpi text size.
    height, width = rgb.shape[:2]
    longest = max(height, width)
    if longest < 1500:
        scale = 1500.0 / longest
        rgb = cv2.resize(
            rgb, (int(width * scale), int(height * scale)), interpolation=cv2.INTER_CUBIC
        )

    config_parts = [f"--dpi {dpi}", f"-l {lang}"]
    if oem is not None:
        config_parts.append(f"--oem {int(oem)}")
    if psm is not None:
        config_parts.append(f"--psm {int(psm)}")
    config = " ".join(config_parts)

    return pytesseract.image_to_pdf_or_hocr(rgb, extension="pdf", config=config)


def combine_pdfs(pdf_paths: List[Path], out_path: Path) -> None:
    """Concatenate the pages of ``pdf_paths``, in order, into ``out_path``."""
    writer = PdfWriter()
    for pdf_path in pdf_paths:
        reader = PdfReader(str(pdf_path))
        for page in reader.pages:
            writer.add_page(page)
    with open(out_path, "wb") as handle:
        writer.write(handle)


# -----------------------------
# Batch helpers
# -----------------------------

VALID_EXT = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}


def natural_key(s: str):
    """Sort key that groups numbers naturally (e.g., page2 < page10)."""
    return [int(t) if t.isdigit() else t.lower() for t in re.split(r"(\d+)", s)]


def find_image_files(dir_path: Path, patterns: Sequence[str]) -> List[Path]:
    """List the image files of ``dir_path`` in page order.

    Args:
        dir_path: Directory to search.
        patterns: Glob patterns. Plain file-name patterns are matched
            case-insensitively (``*.jpg`` also finds ``IMG_1.JPG``); patterns
            containing a path separator or ``**`` go through ``Path.glob``.

    Returns:
        Each matching regular file exactly once, sorted naturally by file
        name. When no pattern matches anything, every file whose extension is
        in ``VALID_EXT`` is returned instead.
    """
    entries = list(dir_path.iterdir())
    matched = set()  # a set, so overlapping patterns cannot duplicate a page
    for pattern in patterns:
        if "/" in pattern or os.sep in pattern or "**" in pattern:
            candidates: Iterable[Path] = dir_path.glob(pattern)
        else:
            lowered = pattern.lower()
            candidates = (
                entry for entry in entries
                if fnmatch.fnmatchcase(entry.name.lower(), lowered)
            )
        matched.update(path for path in candidates if path.is_file())

    # Fallback: if no patterns matched, include known image extensions.
    if not matched:
        matched.update(
            entry for entry in entries
            if entry.is_file() and entry.suffix.lower() in VALID_EXT
        )

    # One global sort: page order follows the file names, not the pattern
    # that happened to match. str(path) breaks ties deterministically.
    return sorted(matched, key=lambda path: (natural_key(path.name), str(path)))


def iter_target_dirs(root: Path, recursive: bool) -> List[Path]:
    """Return the directories under ``root`` that should each become one PDF.

    Args:
        root: Batch root directory.
        recursive: When False, return the non-hidden direct child
            directories. When True, return every non-hidden directory below
            ``root`` that directly contains a file with a ``VALID_EXT``
            extension.

    Returns:
        The target directories; ``root`` itself is never included.
    """
    if not recursive:
        # Only direct children that are directories (ignore hidden).
        return [
            child for child in sorted(root.iterdir())
            if child.is_dir() and not child.name.startswith(".")
        ]

    found: List[Path] = []
    for current, subdirs, files in os.walk(root):
        # Prune hidden directories in place so os.walk never descends into
        # them. Only names below root are inspected, so a root that itself
        # sits under a dot-directory (or is reached via "..") still works.
        subdirs[:] = [name for name in subdirs if not name.startswith(".")]
        current_path = Path(current)
        if current_path == root:
            continue
        if any(Path(name).suffix.lower() in VALID_EXT for name in files):
            found.append(current_path)
    return sorted(found, key=lambda path: natural_key(str(path.relative_to(root))))


# -----------------------------
# Main CLI
# -----------------------------

def _expand_inputs(raw_inputs: Sequence[str]) -> List[Path]:
    """Turn the command-line inputs into a list of existing files.

    An argument naming an existing file is taken literally, even when it
    contains glob characters. Otherwise, if it contains any of ``*?[]`` it is
    expanded as a glob pattern. A warning is printed to stderr for every
    argument that yields no file.
    """
    resolved: List[Path] = []
    for item in raw_inputs:
        literal = Path(item)
        if literal.is_file():
            hits = [literal]
        elif any(ch in item for ch in "*?[]"):
            # glob.glob, unlike Path().glob, also accepts absolute patterns.
            names = sorted(glob.glob(item, recursive=True), key=natural_key)
            hits = [Path(name) for name in names if Path(name).is_file()]
        else:
            hits = []
        if not hits:
            print(f"WARNING: no input files match: {item}", file=sys.stderr)
        resolved.extend(hits)
    return resolved


def _write_ocr_pdf(
    images: Sequence[Path],
    out_pdf: Path,
    args: argparse.Namespace,
    tmp_prefix: str,
    progress: Callable[[int, int, Path], str],
    done_message: str,
) -> bool:
    """OCR ``images`` page by page and combine the results into ``out_pdf``.

    Args:
        images: Page images, in page order.
        out_pdf: Destination PDF.
        args: Parsed CLI options; reads ``dpi``, ``lang``, ``oem``, ``psm``
            and ``keep_intermediate``.
        tmp_prefix: Prefix for the temporary directory name.
        progress: Builds the progress line from (index, total, image path).
        done_message: Printed after ``out_pdf`` has been written.

    Returns:
        True when a PDF was written, False when every page failed.
    """
    # The prefix becomes part of a directory *name*; anything that could be a
    # path separator must be replaced or the directory cannot be created.
    safe_prefix = re.sub(r"[^A-Za-z0-9._-]+", "_", tmp_prefix)

    with TemporaryDirectory(prefix=safe_prefix) as tmp:
        tmp_dir = Path(tmp)
        page_pdfs: List[Path] = []
        total = len(images)

        for idx, img_path in enumerate(images, start=1):
            print(progress(idx, total, img_path))
            try:
                page_img = prepare_page(img_path)
                pdf_bytes = image_to_ocr_pdf_bytes(
                    page_img, dpi=args.dpi, lang=args.lang, oem=args.oem, psm=args.psm
                )
                page_pdf = tmp_dir / f"page_{idx:04d}.pdf"
                page_pdf.write_bytes(pdf_bytes)
                page_pdfs.append(page_pdf)
            except Exception as exc:
                # Deliberate best effort: one unreadable page must not abort
                # the whole document; the failure is reported and skipped.
                print(f"ERROR processing {img_path}: {exc}", file=sys.stderr)

        if not page_pdfs:
            return False

        combine_pdfs(page_pdfs, out_pdf)
        print(done_message)

        if args.keep_intermediate:
            stem = out_pdf.with_suffix("")
            keep_dir = stem.parent / (stem.name + "_pages")
            keep_dir.mkdir(parents=True, exist_ok=True)
            for page_pdf in page_pdfs:
                (keep_dir / page_pdf.name).write_bytes(page_pdf.read_bytes())
            print(f"Kept per-page PDFs in: {keep_dir}")
    return True


def main():
    """Command-line entry point.

    Exit codes: 2 when Tesseract is unavailable; 1 for invalid arguments, no
    usable inputs, or (single-PDF mode) no successfully processed page.
    """
    ap = argparse.ArgumentParser(description="Convert images to OCRed PDF(s) with auto-trim/deskew. Supports batch by subdirectory.")
    # Single-PDF mode (default when inputs provided)
    ap.add_argument("inputs", nargs="*", help="Input image files (JPG/PNG/etc.). Glob patterns ok (quote them).")
    ap.add_argument("-o", "--output", help="Output PDF path, e.g., out.pdf (required in single mode).")

    # Batch mode
    ap.add_argument("--batch-root", type=str, help="Root directory containing subdirectories of images. Each subdirectory -> 1 PDF.")
    ap.add_argument("--out-dir", type=str, help="Directory to write PDFs in batch mode.")
    ap.add_argument("--patterns", type=str, default="*.jpg,*.jpeg,*.png,*.tif,*.tiff,*.bmp", help="Comma-separated glob patterns per subdir.")
    ap.add_argument("--recursive", action="store_true", help="Recurse into nested subdirectories in batch mode.")

    # Common OCR knobs
    ap.add_argument("--dpi", type=int, default=300, help="DPI hint for Tesseract (default: 300)")
    ap.add_argument("--lang", default="eng", help="Tesseract language(s), e.g., 'eng' (default)")
    ap.add_argument("--oem", type=int, choices=[0, 1, 2, 3], default=1, help="Tesseract OCR Engine Mode (default: 1)")
    ap.add_argument("--psm", type=int, choices=list(range(0, 14)), default=6, help="Tesseract Page Segmentation Mode (default: 6)")
    ap.add_argument("--keep-intermediate", action="store_true", help="Keep per-page PDFs in a temp folder for inspection.")
    args = ap.parse_args()

    # Validate tesseract availability early.
    try:
        pytesseract.get_tesseract_version()
    except Exception as exc:
        print("ERROR: Tesseract is not available. Please install it and ensure it's in PATH.", file=sys.stderr)
        print(str(exc), file=sys.stderr)
        sys.exit(2)

    # ---------------- Single-PDF mode ----------------
    if args.batch_root is None:
        if not args.inputs or not args.output:
            print("In single-PDF mode, provide inputs and --output. For batch mode, use --batch-root and --out-dir.", file=sys.stderr)
            sys.exit(1)

        input_paths = _expand_inputs(args.inputs)
        if not input_paths:
            print("No valid input files found.", file=sys.stderr)
            sys.exit(1)

        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)

        written = _write_ocr_pdf(
            input_paths,
            out_path,
            args,
            tmp_prefix="ocr_pdf_",
            progress=lambda idx, total, path: f"[{idx}/{total}] Processing {path} ...",
            done_message=f"✅ Wrote OCRed PDF: {out_path}",
        )
        if not written:
            print("No pages were successfully processed; aborting.", file=sys.stderr)
            sys.exit(1)
        return

    # ---------------- Batch mode ----------------
    root = Path(args.batch_root)
    if not root.is_dir():
        print(f"--batch-root is not a directory: {root}", file=sys.stderr)
        sys.exit(1)

    out_dir = Path(args.out_dir) if args.out_dir else root / "ocr_pdfs"
    out_dir.mkdir(parents=True, exist_ok=True)

    patterns = [p.strip() for p in args.patterns.split(",") if p.strip()]
    # The default output directory lives inside the root; it must not be
    # treated as an input directory.
    out_resolved = out_dir.resolve()
    targets = [
        d for d in iter_target_dirs(root, recursive=args.recursive)
        if d.resolve() != out_resolved
    ]

    if not targets:
        print("No subdirectories with images found under batch root.", file=sys.stderr)
        sys.exit(1)

    for d in targets:
        rel = d.relative_to(root)
        images = find_image_files(d, patterns)
        if not images:
            print(f"[skip] {rel} — no images matching {patterns}", file=sys.stderr)
            continue

        out_pdf = out_dir / (str(rel).replace(os.sep, "__") + ".pdf")
        out_pdf.parent.mkdir(parents=True, exist_ok=True)
        print(f"\n=== {rel} → {out_pdf.name} ({len(images)} pages) ===")

        written = _write_ocr_pdf(
            images,
            out_pdf,
            args,
            tmp_prefix=f"ocr_{rel}_",
            # rel is bound as a default so each lambda keeps its own directory.
            progress=lambda idx, total, path, rel=rel: f"[{rel}] [{idx}/{total}] {path.name}",
            done_message=f"✅ Wrote: {out_pdf}",
        )
        if not written:
            print(f"[skip] {rel} — no pages processed successfully.", file=sys.stderr)

    print("\nBatch complete.")
    return


if __name__ == "__main__":
    main()
+description = "Convert smartphone JPGs to auto-trimmed, deskewed OCRed PDF (English, typewritten)." +readme = "README.md" +requires-python = ">=3.9" +dependencies = [ + "opencv-python>=4.9", + "numpy>=1.26", + "pytesseract>=0.3.10", + "pypdf>=5.0", + "Pillow>=10.0", +] + +[project.scripts] +ocr-pdf = "ocr_pdf:main" + +[build-system] +requires = ["setuptools>=69"] +build-backend = "setuptools.build_meta" + +[tool.uv] +# You can pin python version or set custom indexes here if needed.