initial commit
.gitignore (vendored, new file, 57 lines)
@@ -0,0 +1,57 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environment
.venv/
venv/
ENV/
env/

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# UV
uv.lock

# Project specific
out_pdfs/

# Logs
*.log

# Testing
.pytest_cache/
.coverage
htmlcov/

# mypy
.mypy_cache/
.dmypy.json
dmypy.json
README.md (new file, 84 lines)
@@ -0,0 +1,84 @@
# jpegdoc2pdf

Convert smartphone JPGs of typewritten English documents into searchable **OCRed PDFs** using parallel batch processing.

## Prerequisites

Install the following tools:
- **Tesseract OCR** (ensure it's in PATH)
- **img2pdf** - lossless image to PDF converter
- **ocrmypdf** - adds OCR layer to PDFs

```bash
# macOS
brew install tesseract img2pdf ocrmypdf

# Linux (Debian/Ubuntu)
apt-get install tesseract-ocr img2pdf ocrmypdf
```
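To confirm the tools are actually on `PATH` before a long batch run, an optional quick check (these are standard flags of the installed CLIs, not part of this repo) is:

```bash
# Each command should print a version string if installation succeeded
tesseract --version | head -n 1
img2pdf --version
ocrmypdf --version
```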
## Usage

### Basic Usage

```bash
./convert.sh ROOT_DIR [OUT_DIR] [-P N] [--recursive]
```

### Examples

**Process subdirectories in ROOT with default settings:**
```bash
./convert.sh ./ROOT
```

**Specify custom output directory:**
```bash
./convert.sh ./ROOT ./my_output
```

**Use 4 parallel processes:**
```bash
./convert.sh ./ROOT ./out_pdfs -P 4
```

**Process nested subdirectories recursively:**
```bash
./convert.sh ./ROOT ./out_pdfs -P 4 --recursive
```

## Folder Structure

Organize your images with one subdirectory per PDF:

```
ROOT/
  CaseA/
    001.jpg
    002.jpg
  CaseB/
    page1.jpg
    page2.jpg
```

- Each subdirectory under `ROOT` becomes a single PDF
- Nested subfolders (with `--recursive`) are named like `Parent__Child.pdf` (see the example below)
- Output PDFs are saved to `out_pdfs/` (or your specified output directory)
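For example, running `./convert.sh ./ROOT` on the tree above yields `out_pdfs/CaseA.pdf` and `out_pdfs/CaseB.pdf`. With `--recursive`, a hypothetical nested folder such as `ROOT/CaseB/Inserts/` would additionally produce its own PDF:

```
out_pdfs/
  CaseA.pdf
  CaseB.pdf
  CaseB__Inserts.pdf   (only with --recursive)
```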
## Options

- **ROOT_DIR** (required): Root directory containing subdirectories of images
- **OUT_DIR** (optional): Output directory (default: `out_pdfs`)
- **-P N** (optional): Number of parallel processes (default: CPU core count)
- **--recursive** or **-r**: Process nested subdirectories recursively

## Supported Image Formats

jpg, jpeg, png, tif, tiff (case-insensitive)

## OCR Settings

- Language: English (`eng`)
- Tesseract OEM: 1 (LSTM neural net mode)
- Page segmentation mode: 6 (uniform text block)
- Optimization level: 1
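These settings mirror the flags that `convert.sh` passes to `ocrmypdf` for each merged PDF, roughly equivalent to running:

```bash
# input.pdf / output.pdf stand in for the temporary merged PDF and the final output path
ocrmypdf \
  -l eng \
  --tesseract-oem 1 \
  --tesseract-pagesegmode 6 \
  --optimize 1 \
  --output-type pdf \
  input.pdf output.pdf
```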
convert.sh (executable, new file, 114 lines)
@@ -0,0 +1,114 @@
#!/usr/bin/env zsh
# Parallel batch OCR (zsh + xargs)
# Dependencies: img2pdf, ocrmypdf, Tesseract
# Usage:
#   ./convert.sh ROOT_DIR [OUT_DIR] [-P N] [--recursive]
# Example:
#   ./convert.sh ./ROOT ./out_pdfs -P 4 --recursive

emulate -L zsh
set -euo pipefail

# ---- Argument handling ----
ROOT="${1:-}"
[[ -z "${ROOT}" ]] && { print -u2 -- "Usage: $0 ROOT_DIR [OUT_DIR] [-P N] [--recursive]"; exit 2; }
shift

OUT="out_pdfs"
PROCS=""
RECURSIVE=0

if (( $# )); then
  # If the second argument looks like a directory name, use it as OUT
  if [[ -n "${1:-}" && "${1:-}" != "-P" && "${1:-}" != "--recursive" && "${1:-}" != "-r" ]]; then
    OUT="$1"; shift
  fi
  # Remaining options
  while (( $# )); do
    case "$1" in
      -P) PROCS="${2:-}"; shift 2 ;;
      --recursive|-r) RECURSIVE=1; shift ;;
      *) print -u2 -- "Unknown arg: $1"; exit 2 ;;
    esac
  done
fi

# Default parallelism (CPU core count if not specified)
if [[ -z "${PROCS}" ]]; then
  if command -v nproc >/dev/null 2>&1; then
    PROCS="$(nproc)"
  elif command -v sysctl >/dev/null 2>&1; then
    PROCS="$(sysctl -n hw.ncpu)"
  else
    PROCS=2
  fi
fi

# Output directory
mkdir -p -- "$OUT"

# ---- Write the worker (processes one subdirectory) to a temporary file ----
WORKER="$(mktemp -t ocrmypdf_worker.XXXXXX).zsh"
cat > "$WORKER" <<'ZSH'
#!/usr/bin/env zsh
emulate -L zsh
set -euo pipefail
setopt null_glob

ROOT="$1"
OUT="$2"
DIR="$3"

name="${${DIR#${ROOT%/}/}#/}"   # path relative to ROOT/ ("A/B" when nested)
base="${DIR:t}"                 # last path component (simple name)

# Image extensions
typeset -a exts; exts=( jpg jpeg png tif tiff JPG JPEG PNG TIF TIFF )

# Enumerate images
typeset -a imgs; imgs=()
for ext in $exts; do
  imgs+=("$DIR"/*.${ext})
done

if (( ${#imgs} == 0 )); then
  print -u2 -- "[skip] ${name} (no images)"
  exit 0
fi

# Temporary PDF
tmp_pdf="$(mktemp -t "ocr_${base}.XXXXXX").pdf"

# 1) Images -> lossless PDF merge (-o first, inputs after --)
img2pdf --auto-orient -o "$tmp_pdf" -- "${imgs[@]}"

# Output file name (join nested path components with __)
out_pdf="$OUT/${name//\//__}.pdf"
mkdir -p -- "${out_pdf:h}"

# 2) OCR (tuned for typewritten English)
ocrmypdf \
  -l eng \
  --tesseract-oem 1 \
  --tesseract-pagesegmode 6 \
  --optimize 1 \
  --output-type pdf \
  "$tmp_pdf" "$out_pdf"

rm -f -- "$tmp_pdf"
print -r -- "✅ Wrote: $out_pdf"
ZSH
chmod +x "$WORKER"

# ---- Enumerate target directories -> run in parallel with xargs ----
if (( RECURSIVE )); then
  # All subdirectories under ROOT (excluding ROOT itself)
  find "$ROOT" -mindepth 1 -type d -print0 \
    | xargs -0 -I {} -n 1 -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {}
else
  # Direct subdirectories only
  find "$ROOT" -mindepth 1 -maxdepth 1 -type d -print0 \
    | xargs -0 -I {} -n 1 -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {}
fi

print -r -- "Batch complete. (parallel: $PROCS)"
ocr_pdf.py (new file, 424 lines)
@@ -0,0 +1,424 @@
#!/usr/bin/env python3
"""
ocr_pdf.py — Convert JPGs of documents into OCRed PDFs.
Now supports **batch mode** where each subdirectory becomes one PDF.

Features
- Automatic trimming via document contour detection + perspective warp
- Deskew fallback if no clear document contour is found
- Image cleanup tuned for 1970s typewritten English pages (contrast/binarization)
- Tesseract-based OCR to embed a searchable text layer in the PDF
- Batch multiple images into one output PDF
- Batch mode: process a root folder; each subdirectory becomes its own PDF

USAGE
------
# Single-PDF mode (glob patterns allowed)
uv run ocr-pdf -o out.pdf scans/*.jpg

# Batch mode: each subdirectory under ROOT becomes a PDF
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs

# Batch mode with filters
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs --patterns "*.jpg,*.png" --recursive

DEPENDENCIES
------------
- Tesseract must be installed on your system and in PATH.
  macOS (brew):   brew install tesseract
  Ubuntu/Debian:  sudo apt-get install tesseract-ocr
  Windows:        Install from https://github.com/UB-Mannheim/tesseract/wiki
"""

import argparse
import os
import sys
from pathlib import Path
from typing import Optional, Tuple, List, Iterable, Sequence, Dict

import cv2
import numpy as np
import pytesseract
from pypdf import PdfReader, PdfWriter
from tempfile import TemporaryDirectory
import re


# -----------------------------
# Geometry helpers
# -----------------------------

def _order_quad_points(pts: np.ndarray) -> np.ndarray:
    """Order 4 points as (top-left, top-right, bottom-right, bottom-left)."""
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect


def four_point_transform(image: np.ndarray, pts: np.ndarray) -> np.ndarray:
    rect = _order_quad_points(pts.astype("float32"))
    (tl, tr, br, bl) = rect

    # Compute the width of the new image
    widthA = np.linalg.norm(br - bl)
    widthB = np.linalg.norm(tr - tl)
    maxWidth = int(max(widthA, widthB))

    # Compute the height of the new image
    heightA = np.linalg.norm(tr - br)
    heightB = np.linalg.norm(tl - bl)
    maxHeight = int(max(heightA, heightB))

    dst = np.array([
        [0, 0],
        [maxWidth - 1, 0],
        [maxWidth - 1, maxHeight - 1],
        [0, maxHeight - 1]], dtype="float32")

    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
    return warped


# -----------------------------
# Document detection and cleanup
# -----------------------------

def detect_document_contour(image_bgr: np.ndarray) -> Optional[np.ndarray]:
    """Find the largest 4-point contour that looks like a document page."""
    image = image_bgr.copy()
    ratio = 1000.0 / max(image.shape[:2])  # scale longest side to ~1000px for speed
    small = cv2.resize(image, (int(image.shape[1]*ratio), int(image.shape[0]*ratio)),
                       interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(gray, 60, 180)

    # Close gaps
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel, iterations=1)

    contours, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)

    for c in contours[:10]:
        peri = cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, 0.02 * peri, True)
        if len(approx) == 4:
            # Scale contour back to original image coords
            approx = (approx.reshape(4, 2) / ratio).astype(np.float32)
            return approx
    return None


def deskew(image_gray: np.ndarray) -> np.ndarray:
    """Estimate skew angle with Hough transform on text lines; rotate to correct."""
    # Binary for line detection
    g = cv2.GaussianBlur(image_gray, (3, 3), 0)
    _, bw = cv2.threshold(g, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    inv = cv2.bitwise_not(bw)

    edges = cv2.Canny(inv, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 180.0, 150)

    angle_deg = 0.0
    if lines is not None:
        angles = []
        for rho_theta in lines[:200]:
            rho, theta = rho_theta[0]
            # Convert to degrees relative to horizontal
            deg = (theta * 180.0 / np.pi) - 90.0
            # Normalize to [-45, 45] to avoid vertical lines
            if deg < -45: deg += 90
            if deg > 45: deg -= 90
            angles.append(deg)
        if angles:
            angle_deg = float(np.median(angles))

    if abs(angle_deg) < 0.1:
        return image_gray  # no significant skew

    (h, w) = image_gray.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle_deg, 1.0)
    rotated = cv2.warpAffine(image_gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated


def cleanup_for_ocr(image_bgr: np.ndarray) -> np.ndarray:
    """Return a high-contrast, noise-reduced grayscale image suitable for OCR."""
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # CLAHE to recover typewriter ink contrast without blowing highlights
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    gray = clahe.apply(gray)

    # Gentle denoise to preserve glyph edges
    gray = cv2.bilateralFilter(gray, d=7, sigmaColor=50, sigmaSpace=50)

    # Deskew after contrast/denoise
    gray = deskew(gray)

    # Adaptive threshold tends to work well on aged paper; keep grayscale if needed
    th = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 15
    )

    # Remove small speckles
    kernel = np.ones((2, 2), np.uint8)
    th = cv2.morphologyEx(th, cv2.MORPH_OPEN, kernel, iterations=1)

    return th


def prepare_page(image_path: Path) -> np.ndarray:
    """Load, auto-trim (if possible), and clean up a single page image. Returns grayscale uint8 image."""
    bgr = cv2.imdecode(np.fromfile(str(image_path), dtype=np.uint8), cv2.IMREAD_COLOR)
    if bgr is None:
        raise RuntimeError(f"Failed to load image: {image_path}")

    cnt = detect_document_contour(bgr)
    if cnt is not None:
        warped = four_point_transform(bgr, cnt)
    else:
        warped = bgr  # fall back to original framing

    cleaned = cleanup_for_ocr(warped)
    return cleaned


# -----------------------------
# OCR + PDF assembly
# -----------------------------

def image_to_ocr_pdf_bytes(image: np.ndarray, dpi: int, lang: str, oem: Optional[int], psm: Optional[int]) -> bytes:
    """Use Tesseract to produce searchable PDF bytes for one image page."""
    # Tesseract prefers RGB
    if len(image.shape) == 2:
        rgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    else:
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Hint DPI via config; upsample if very small to approach ~300 dpi text size
    h, w = rgb.shape[:2]
    scale = 1.0
    if max(h, w) < 1500:
        scale = 1500.0 / max(h, w)
        rgb = cv2.resize(rgb, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_CUBIC)

    config_parts = [f'--dpi {dpi}', f'-l {lang}']
    if oem is not None:
        config_parts.append(f'--oem {int(oem)}')
    if psm is not None:
        config_parts.append(f'--psm {int(psm)}')
    config = " ".join(config_parts)

    pdf_bytes = pytesseract.image_to_pdf_or_hocr(rgb, extension='pdf', config=config)
    return pdf_bytes


def combine_pdfs(pdf_paths: List[Path], out_path: Path) -> None:
    writer = PdfWriter()
    for p in pdf_paths:
        reader = PdfReader(str(p))
        for page in reader.pages:
            writer.add_page(page)
    with open(out_path, "wb") as f:
        writer.write(f)


# -----------------------------
# Batch helpers
# -----------------------------

VALID_EXT = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}

def natural_key(s: str):
    """Sort key that groups numbers naturally (e.g., page2 < page10)."""
    return [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)]

def find_image_files(dir_path: Path, patterns: Sequence[str]) -> List[Path]:
    files: List[Path] = []
    for pat in patterns:
        files.extend(sorted(dir_path.glob(pat), key=lambda p: natural_key(p.name)))
    # fallback: if no patterns matched, include known image extensions
    if not files:
        for p in sorted(dir_path.iterdir(), key=lambda p: natural_key(p.name)):
            if p.suffix.lower() in VALID_EXT and p.is_file():
                files.append(p)
    return files

def iter_target_dirs(root: Path, recursive: bool) -> List[Path]:
    if not recursive:
        # only direct children that are directories (ignore hidden)
        return [p for p in sorted(root.iterdir()) if p.is_dir() and not p.name.startswith(".")]
    # walk recursively; include any directory that contains at least one image
    out = []
    for d, subdirs, files in os.walk(root):
        dpath = Path(d)
        if dpath == root:
            continue
        # skip hidden directories (checked relative to root so a hidden component in root itself is ignored)
        if any(part.startswith(".") for part in dpath.relative_to(root).parts):
            continue
        for f in files:
            if Path(f).suffix.lower() in VALID_EXT:
                out.append(dpath)
                break
    out = sorted(set(out), key=lambda p: natural_key(str(p.relative_to(root))))
    return out


# -----------------------------
# Main CLI
# -----------------------------

def main():
    ap = argparse.ArgumentParser(description="Convert images to OCRed PDF(s) with auto-trim/deskew. Supports batch by subdirectory.")
    mode = ap.add_mutually_exclusive_group(required=False)
    # Single-PDF mode (default when inputs provided)
    ap.add_argument("inputs", nargs="*", help="Input image files (JPG/PNG/etc.). Glob patterns ok (quote them).")
    ap.add_argument("-o", "--output", help="Output PDF path, e.g., out.pdf (required in single mode).")

    # Batch mode
    ap.add_argument("--batch-root", type=str, help="Root directory containing subdirectories of images. Each subdirectory -> 1 PDF.")
    ap.add_argument("--out-dir", type=str, help="Directory to write PDFs in batch mode.")
    ap.add_argument("--patterns", type=str, default="*.jpg,*.jpeg,*.png,*.tif,*.tiff,*.bmp", help="Comma-separated glob patterns per subdir.")
    ap.add_argument("--recursive", action="store_true", help="Recurse into nested subdirectories in batch mode.")

    # Common OCR knobs
    ap.add_argument("--dpi", type=int, default=300, help="DPI hint for Tesseract (default: 300)")
    ap.add_argument("--lang", default="eng", help="Tesseract language(s), e.g., 'eng' (default)")
    ap.add_argument("--oem", type=int, choices=[0, 1, 2, 3], default=1, help="Tesseract OCR Engine Mode (default: 1)")
    ap.add_argument("--psm", type=int, choices=list(range(0, 14)), default=6, help="Tesseract Page Segmentation Mode (default: 6)")
    ap.add_argument("--keep-intermediate", action="store_true", help="Keep per-page PDFs in a temp folder for inspection.")
    args = ap.parse_args()

    # Validate tesseract availability early
    try:
        _ = pytesseract.get_tesseract_version()
    except Exception as e:
        print("ERROR: Tesseract is not available. Please install it and ensure it's in PATH.", file=sys.stderr)
        print(str(e), file=sys.stderr)
        sys.exit(2)

    # ---------------- Single-PDF mode ----------------
    if args.batch_root is None:
        if not args.inputs or not args.output:
            print("In single-PDF mode, provide inputs and --output. For batch mode, use --batch-root and --out-dir.", file=sys.stderr)
            sys.exit(1)

        # Expand inputs
        input_paths: List[Path] = []
        for pattern in args.inputs:
            if any(ch in pattern for ch in "*?[]"):
                expanded = [Path(p) for p in sorted(map(str, Path().glob(pattern)))]
            else:
                expanded = [Path(pattern)]
            for p in expanded:
                if p.exists() and p.is_file():
                    input_paths.append(p)

        if not input_paths:
            print("No valid input files found.", file=sys.stderr)
            sys.exit(1)

        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)

        with TemporaryDirectory(prefix="ocr_pdf_") as tmpdir:
            tmpdir = Path(tmpdir)
            page_pdf_paths: List[Path] = []

            for idx, img_path in enumerate(input_paths, start=1):
                print(f"[{idx}/{len(input_paths)}] Processing {img_path} ...")
                try:
                    page_img = prepare_page(img_path)
                    pdf_bytes = image_to_ocr_pdf_bytes(
                        page_img, dpi=args.dpi, lang=args.lang, oem=args.oem, psm=args.psm
                    )
                    page_pdf = tmpdir / f"page_{idx:04d}.pdf"
                    with open(page_pdf, "wb") as f:
                        f.write(pdf_bytes)
                    page_pdf_paths.append(page_pdf)
                except Exception as e:
                    print(f"ERROR processing {img_path}: {e}", file=sys.stderr)

            if not page_pdf_paths:
                print("No pages were successfully processed; aborting.", file=sys.stderr)
                sys.exit(1)

            combine_pdfs(page_pdf_paths, out_path)
            print(f"✅ Wrote OCRed PDF: {out_path}")

            if args.keep_intermediate:
                keep_dir = out_path.with_suffix("")
                keep_dir = keep_dir.parent / (keep_dir.name + "_pages")
                keep_dir.mkdir(parents=True, exist_ok=True)
                for p in page_pdf_paths:
                    dest = keep_dir / p.name
                    dest.write_bytes(p.read_bytes())
                print(f"Kept per-page PDFs in: {keep_dir}")
        return

    # ---------------- Batch mode ----------------
    root = Path(args.batch_root)
    if not root.is_dir():
        print(f"--batch-root is not a directory: {root}", file=sys.stderr)
        sys.exit(1)

    out_dir = Path(args.out_dir) if args.out_dir else root / "ocr_pdfs"
    out_dir.mkdir(parents=True, exist_ok=True)

    patterns = [p.strip() for p in args.patterns.split(",") if p.strip()]
    targets = iter_target_dirs(root, recursive=args.recursive)

    if not targets:
        print("No subdirectories with images found under batch root.", file=sys.stderr)
        sys.exit(1)

    for d in targets:
        rel = d.relative_to(root)
        images = find_image_files(d, patterns)
        if not images:
            print(f"[skip] {rel} — no images matching {patterns}", file=sys.stderr)
            continue

        out_pdf = out_dir / (str(rel).replace(os.sep, "__") + ".pdf")
        out_pdf.parent.mkdir(parents=True, exist_ok=True)
        print(f"\n=== {rel} → {out_pdf.name} ({len(images)} pages) ===")

        # Replace path separators in the prefix; mkdtemp would fail on a prefix containing os.sep
        with TemporaryDirectory(prefix=f"ocr_{str(rel).replace(os.sep, '_')}_") as tmpdir:
            tmpdir = Path(tmpdir)
            page_pdf_paths: List[Path] = []

            for idx, img_path in enumerate(images, start=1):
                print(f"[{rel}] [{idx}/{len(images)}] {img_path.name}")
                try:
                    page_img = prepare_page(img_path)
                    pdf_bytes = image_to_ocr_pdf_bytes(
                        page_img, dpi=args.dpi, lang=args.lang, oem=args.oem, psm=args.psm
                    )
                    page_pdf = tmpdir / f"page_{idx:04d}.pdf"
                    with open(page_pdf, "wb") as f:
                        f.write(pdf_bytes)
                    page_pdf_paths.append(page_pdf)
                except Exception as e:
                    print(f"ERROR processing {img_path}: {e}", file=sys.stderr)

            if not page_pdf_paths:
                print(f"[skip] {rel} — no pages processed successfully.", file=sys.stderr)
                continue

            combine_pdfs(page_pdf_paths, out_pdf)
            print(f"✅ Wrote: {out_pdf}")

    print("\nBatch complete.")
    return


if __name__ == "__main__":
    main()
pyproject.toml (new file, 24 lines)
@@ -0,0 +1,24 @@
[project]
name = "ocr-pdf"
version = "0.1.0"
description = "Convert smartphone JPGs to auto-trimmed, deskewed OCRed PDF (English, typewritten)."
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
    "opencv-python>=4.9",
    "numpy>=1.26",
    "pytesseract>=0.3.10",
    "pypdf>=5.0",
    "Pillow>=10.0",
]

[project.scripts]
ocr-pdf = "ocr_pdf:main"

[build-system]
requires = ["setuptools>=69"]
build-backend = "setuptools.build_meta"

[tool.uv]
# You can pin python version or set custom indexes here if needed.
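With the `ocr-pdf` entry point declared above, a typical local workflow with uv (assuming `uv` itself is installed; the commands echo the usage examples in ocr_pdf.py's docstring) looks roughly like:

```bash
# Resolve and install the dependencies into a local virtual environment
uv sync

# Single-PDF mode
uv run ocr-pdf -o out.pdf scans/*.jpg

# Batch mode: one PDF per subdirectory under ROOT_DIR
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs
```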