initial commit

2025-11-01 18:04:28 -04:00
commit 4eb7ddfd99
5 changed files with 703 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,57 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # Virtual Environment
 .venv/
 venv/
 ENV/
 env/
 # IDEs
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 .DS_Store
 # UV
 uv.lock
 # Project specific
 out_pdfs/
 # Logs
 *.log
 # Testing
 .pytest_cache/
 .coverage
 htmlcov/
 # mypy
 .mypy_cache/
 .dmypy.json
 dmypy.json
--- a/README.md
+++ b/README.md
@@ -0,0 +1,84 @@
 # jpegdoc2pdf
 Convert smartphone JPGs of typewritten English documents into searchable **OCRed PDFs** using parallel batch processing.
 ## Prerequisites
 Install the following tools (`convert.sh` itself runs under `zsh`, which must also be installed):
 - **Tesseract OCR** (ensure it's in PATH)
 - **img2pdf** - lossless image to PDF converter
 - **ocrmypdf** - adds OCR layer to PDFs
 ```bash
 # macOS
 brew install tesseract img2pdf ocrmypdf
 # Linux (Debian/Ubuntu)
 apt-get install tesseract-ocr img2pdf ocrmypdf
 ```
 ## Usage
 ### Basic Usage
 ```bash
 ./convert.sh ROOT_DIR [OUT_DIR] [-P N] [--recursive]
 ```
 ### Examples
 **Process subdirectories in ROOT with default settings:**
 ```bash
 ./convert.sh ./ROOT
 ```
 **Specify custom output directory:**
 ```bash
 ./convert.sh ./ROOT ./my_output
 ```
 **Use 4 parallel processes:**
 ```bash
 ./convert.sh ./ROOT ./out_pdfs -P 4
 ```
 **Process nested subdirectories recursively:**
 ```bash
 ./convert.sh ./ROOT ./out_pdfs -P 4 --recursive
 ```
 ## Folder Structure
 Organize your images with one subdirectory per PDF:
 ```
 ROOT/
  CaseA/
    001.jpg
    002.jpg
  CaseB/
    page1.jpg
    page2.jpg
 ```
 - Each subdirectory under `ROOT` becomes a single PDF
 - Nested subfolders (with `--recursive`) are named like `Parent__Child.pdf`
 - Output PDFs are saved to `out_pdfs/` (or your specified output directory)
 ## Options
 - **ROOT_DIR** (required): Root directory containing subdirectories of images
 - **OUT_DIR** (optional): Output directory (default: `out_pdfs`)
 - **-P N** (optional): Number of parallel processes (default: CPU core count)
 - **--recursive** or **-r**: Process nested subdirectories recursively
 ## Supported Image Formats
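 jpg, jpeg, png, tif, tiff (extension written in all-lowercase or all-uppercase, e.g. `.jpg` or `.JPG`; mixed case such as `.Jpg` is not matched)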
 ## OCR Settings
 - Language: English (`eng`)
 - Tesseract OEM: 1 (LSTM neural net mode)
 - Page segmentation mode: 6 (uniform text block)
 - Optimization level: 1
--- a/convert.sh
+++ b/convert.sh
@@ -0,0 +1,114 @@
 #!/usr/bin/env zsh
 # 並列バッチOCR（zsh + xargs）
 # 依存: img2pdf, ocrmypdf, Tesseract
 # 使い方:
 #   ./batch_ocr_parallel.zsh ROOT_DIR [OUT_DIR] [-P N] [--recursive]
 # 例:
 #   ./batch_ocr_parallel.zsh ./ROOT ./out_pdfs -P 4 --recursive
 emulate -L zsh
 set -euo pipefail
 # ---- 引数処理 ----
 ROOT="${1:-}"
 [[ -z "${ROOT}" ]] && { print -u2 -- "Usage: $0 ROOT_DIR [OUT_DIR] [-P N] [--recursive]"; exit 2; }
 shift
 OUT="out_pdfs"
 PROCS=""
 RECURSIVE=0
 if (( $# )); then
  # 2番目がディレクトリ名っぽければ OUT に採用
  if [[ -n "${1:-}" && "${1:-}" != "-P" && "${1:-}" != "--recursive" && "${1:-}" != "-r" ]]; then
    OUT="$1"; shift
  fi
  # 残りのオプション
  while (( $# )); do
    case "$1" in
      -P) PROCS="${2:-}"; shift 2 ;;
      --recursive|-r) RECURSIVE=1; shift ;;
      *) print -u2 -- "Unknown arg: $1"; exit 2 ;;
    esac
  done
 fi
 # 並列度のデフォルト（未指定ならCPUコア数）
 if [[ -z "${PROCS}" ]]; then
  if command -v nproc >/dev/null 2>&1; then
    PROCS="$(nproc)"
  elif command -v sysctl >/dev/null 2>&1; then
    PROCS="$(sysctl -n hw.ncpu)"
  else
    PROCS=2
  fi
 fi
 # 出力先
 mkdir -p -- "$OUT"
 # ---- Write the worker (processes one subdirectory) into a private temp directory ----
 # mktemp -d reserves the directory itself, so the script path below is exactly what
 # was reserved; the EXIT trap removes it when the batch ends, successfully or not.
 WORKER_DIR="$(mktemp -d -t ocrmypdf_worker.XXXXXX)"
 trap 'rm -rf -- "$WORKER_DIR"' EXIT
 WORKER="$WORKER_DIR/worker.zsh"
 cat > "$WORKER" <<'ZSH'
 #!/usr/bin/env zsh
 emulate -L zsh
 set -euo pipefail
 setopt null_glob
 ROOT="$1"
 OUT="$2"
 DIR="$3"
 name="${${DIR#${ROOT%/}/}#/}"       # path relative to ROOT ("A/B" for nested directories)
 # Image extensions (all-lowercase and all-uppercase spellings).
 typeset -a exts; exts=( jpg jpeg png tif tiff JPG JPEG PNG TIF TIFF )
 # Collect the images; null_glob makes non-matching patterns expand to nothing.
 typeset -a imgs; imgs=()
 for ext in $exts; do
   imgs+=("$DIR"/*.${ext})
 done
 if (( ${#imgs} == 0 )); then
   print -u2 -- "[skip] ${name} (no images)"
   exit 0
 fi
 # Page order: sort the whole list numerically by name so pages are not grouped
 # by extension and page2 comes before page10.
 imgs=("${(@on)imgs}")
 # Scratch directory for the intermediate PDF; removed on every exit path,
 # including a failing img2pdf/ocrmypdf under `set -e`.
 tmp_dir="$(mktemp -d -t ocr_worker.XXXXXX)"
 trap 'rm -rf -- "$tmp_dir"' EXIT
 tmp_pdf="$tmp_dir/input.pdf"
 # 1) Images -> one lossless PDF (-o first, inputs after --).
 img2pdf --auto-orient -o "$tmp_pdf" -- "${imgs[@]}"
 # Output file name: nested path components are joined with "__".
 out_pdf="$OUT/${name//\//__}.pdf"
 mkdir -p -- "${out_pdf:h}"
 # 2) OCR (tuned for typewritten English).
 ocrmypdf \
   -l eng \
   --tesseract-oem 1 \
   --tesseract-pagesegmode 6 \
   --optimize 1 \
   --output-type pdf \
   "$tmp_pdf" "$out_pdf"
 print -r -- "✅ Wrote: $out_pdf"
 ZSH
 chmod +x "$WORKER"
 # ---- Enumerate target directories and run the worker on them in parallel via xargs ----
 typeset -a find_args
 find_args=( "$ROOT" -mindepth 1 )   # -mindepth 1 excludes ROOT itself
 # Without --recursive only the immediate children of ROOT are processed.
 if (( ! RECURSIVE )); then
   find_args+=( -maxdepth 1 )
 fi
 find_args+=( -type d -print0 )
 # -I {} already runs the worker once per input item; POSIX declares -I and -n
 # mutually exclusive, so no -n is passed.
 find "${find_args[@]}" \
   | xargs -0 -I {} -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {}
 print -r -- "Batch complete. (parallel: $PROCS)"
--- a/ocr_pdf.py
+++ b/ocr_pdf.py
@@ -0,0 +1,424 @@
 #!/usr/bin/env python3
 """
 ocr_pdf.py — Convert JPGs of documents into OCRed PDFs.
 Now supports **batch mode** where each subdirectory becomes one PDF.
 Features
 - Automatic trimming via document contour detection + perspective warp
 - Deskew fallback if no clear document contour is found
 - Image cleanup tuned for 1970s typewritten English pages (contrast/binarization)
 - Tesseract-based OCR to embed a searchable text layer in the PDF
 - Batch multiple images into one output PDF
 - Batch mode: process a root folder; each subdirectory becomes its own PDF
 USAGE
 ------
 # Single-PDF mode (glob patterns allowed)
 uv run ocr-pdf -o out.pdf scans/*.jpg
 # Batch mode: each subdirectory under ROOT becomes a PDF
 uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs
 # Batch mode with filters
 uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs --patterns "*.jpg,*.png" --recursive
 DEPENDENCIES
 ------------
 - Tesseract must be installed on your system and in PATH.
  macOS (brew):   brew install tesseract
  Ubuntu/Debian:  sudo apt-get install tesseract-ocr
  Windows:        Install from https://github.com/UB-Mannheim/tesseract/wiki
 """
 import argparse
 import os
 import sys
 from pathlib import Path
 from typing import Optional, Tuple, List, Iterable, Sequence, Dict
 import cv2
 import numpy as np
 import pytesseract
 from pypdf import PdfReader, PdfWriter
 from tempfile import TemporaryDirectory
 import re
 # -----------------------------
 # Geometry helpers
 # -----------------------------
 def _order_quad_points(pts: np.ndarray) -> np.ndarray:
    """Order 4 points as (top-left, top-right, bottom-right, bottom-left)."""
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect
 def four_point_transform(image: np.ndarray, pts: np.ndarray) -> np.ndarray:
     """Warp the quadrilateral ``pts`` of ``image`` onto an axis-aligned rectangle.
     Args:
         image: Source image handed to ``cv2.warpPerspective``.
         pts: Four corner points in any order; they are cast to float32 and
             ordered with ``_order_quad_points`` first.
     Returns:
         The warped image. Its width is the longer of the top/bottom edges and its
         height the longer of the left/right edges, both truncated to int.
     """
     corners = _order_quad_points(pts.astype("float32"))
     top_left, top_right, bottom_right, bottom_left = corners
     # Output width: the longer of the bottom and top edges.
     out_w = int(max(np.linalg.norm(bottom_right - bottom_left),
                     np.linalg.norm(top_right - top_left)))
     # Output height: the longer of the right and left edges.
     out_h = int(max(np.linalg.norm(top_right - bottom_right),
                     np.linalg.norm(top_left - bottom_left)))
     # Destination corners in the same (tl, tr, br, bl) order as ``corners``.
     target = np.array(
         [[0, 0], [out_w - 1, 0], [out_w - 1, out_h - 1], [0, out_h - 1]],
         dtype="float32",
     )
     homography = cv2.getPerspectiveTransform(corners, target)
     return cv2.warpPerspective(image, homography, (out_w, out_h))
 # -----------------------------
 # Document detection and cleanup
 # -----------------------------
 def detect_document_contour(image_bgr: np.ndarray, min_area_ratio: float = 0.2) -> Optional[np.ndarray]:
     """Find the largest 4-point contour that plausibly outlines the document page.
     Detection runs on a copy resized so that its longest side is about 1000 px;
     the corners returned are scaled back to the coordinates of ``image_bgr``.
     Args:
         image_bgr: Colour image; it is converted with ``COLOR_BGR2GRAY``, so BGR
             channel order is expected.
         min_area_ratio: Smallest contour area, as a fraction of the resized
             frame, that may be accepted as the page. Without this floor a small
             four-sided blob (a stamp, a text box) could be returned and the page
             would be cropped to that fragment. The default of 0.2 is a heuristic;
             pass 0 to accept any size.
     Returns:
         A ``(4, 2)`` float32 array of corner points (not ordered), or ``None``
         when the image is empty or no large enough quadrilateral is found.
     """
     height, width = image_bgr.shape[:2]
     if height == 0 or width == 0:
         return None
     ratio = 1000.0 / max(height, width)  # scale longest side to ~1000px for speed
     # Clamp to 1 px so extreme aspect ratios cannot produce a zero-sized resize target.
     small_w = max(1, int(width * ratio))
     small_h = max(1, int(height * ratio))
     # cv2.resize returns a new array, so no defensive copy of the input is needed.
     small = cv2.resize(image_bgr, (small_w, small_h), interpolation=cv2.INTER_AREA)
     gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
     gray = cv2.GaussianBlur(gray, (5, 5), 0)
     edges = cv2.Canny(gray, 60, 180)
     # Close small gaps in the edge map so the page outline forms one contour.
     kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
     edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel, iterations=1)
     contours, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
     contours = sorted(contours, key=cv2.contourArea, reverse=True)
     min_area = min_area_ratio * small_w * small_h
     for c in contours[:10]:
         if cv2.contourArea(c) < min_area:
             break  # sorted by area, descending: every remaining contour is smaller still
         peri = cv2.arcLength(c, True)
         approx = cv2.approxPolyDP(c, 0.02 * peri, True)
         if len(approx) == 4:
             # Scale contour back to original image coords
             return (approx.reshape(4, 2) / ratio).astype(np.float32)
     return None
 def deskew(image_gray: np.ndarray) -> np.ndarray:
     """Straighten a grayscale page using the median angle of its Hough lines.
     The image is blurred, Otsu-binarised, inverted and edge-detected; up to the
     first 200 lines returned by ``cv2.HoughLines`` each contribute one angle,
     folded into [-45, 45] degrees so near-vertical lines do not dominate.
     Args:
         image_gray: Single-channel image to analyse and rotate.
     Returns:
         ``image_gray`` itself when the median angle is below 0.1 degrees in
         magnitude (or no lines were found); otherwise a rotated copy of the same
         size with replicated borders.
     """
     blurred = cv2.GaussianBlur(image_gray, (3, 3), 0)
     _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
     edge_map = cv2.Canny(cv2.bitwise_not(binary), 50, 150, apertureSize=3)
     hough = cv2.HoughLines(edge_map, 1, np.pi / 180.0, 150)
     skew_deg = 0.0
     if hough is not None:
         tilts = []
         for entry in hough[:200]:
             _rho, theta = entry[0]
             # theta is converted to degrees relative to horizontal.
             tilt = (theta * 180.0 / np.pi) - 90.0
             # Fold into [-45, 45] so vertical lines map near zero.
             if tilt < -45:
                 tilt += 90
             if tilt > 45:
                 tilt -= 90
             tilts.append(tilt)
         if tilts:
             skew_deg = float(np.median(tilts))
     if abs(skew_deg) < 0.1:
         return image_gray  # no significant skew
     height, width = image_gray.shape[:2]
     rotation = cv2.getRotationMatrix2D((width // 2, height // 2), skew_deg, 1.0)
     return cv2.warpAffine(
         image_gray,
         rotation,
         (width, height),
         flags=cv2.INTER_CUBIC,
         borderMode=cv2.BORDER_REPLICATE,
     )
 def cleanup_for_ocr(image_bgr: np.ndarray) -> np.ndarray:
     """Turn a colour page into a binarised, despeckled image for OCR.
     Pipeline: grayscale -> CLAHE contrast equalisation -> bilateral denoise ->
     ``deskew`` -> Gaussian adaptive threshold -> morphological opening.
     Args:
         image_bgr: Colour image in BGR channel order.
     Returns:
         The thresholded single-channel image after speckle removal.
     """
     # CLAHE lifts faint ink locally instead of stretching the whole histogram.
     equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
     enhanced = equalizer.apply(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY))
     # Bilateral filtering smooths paper texture while keeping glyph edges.
     smoothed = cv2.bilateralFilter(enhanced, d=7, sigmaColor=50, sigmaSpace=50)
     # Straighten only after contrast and denoise have been applied.
     straightened = deskew(smoothed)
     binary = cv2.adaptiveThreshold(
         straightened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 15
     )
     # A 2x2 opening removes isolated specks left by the threshold.
     speckle_kernel = np.ones((2, 2), np.uint8)
     return cv2.morphologyEx(binary, cv2.MORPH_OPEN, speckle_kernel, iterations=1)
 def prepare_page(image_path: Path) -> np.ndarray:
     """Load one page image, crop it to the detected page outline, and clean it for OCR.
     Args:
         image_path: Path of the image file to read.
     Returns:
         The output of ``cleanup_for_ocr`` for the (possibly perspective-corrected) page.
     Raises:
         RuntimeError: If the file contents cannot be decoded as an image.
     """
     # Read the raw bytes and decode in memory rather than calling cv2.imread
     # (presumably to cope with paths imread cannot open -- confirm).
     raw = np.fromfile(str(image_path), dtype=np.uint8)
     page_bgr = cv2.imdecode(raw, cv2.IMREAD_COLOR)
     if page_bgr is None:
         raise RuntimeError(f"Failed to load image: {image_path}")
     corners = detect_document_contour(page_bgr)
     # With no page outline found, keep the original framing.
     framed = page_bgr if corners is None else four_point_transform(page_bgr, corners)
     return cleanup_for_ocr(framed)
 # -----------------------------
 # OCR + PDF assembly
 # -----------------------------
 def image_to_ocr_pdf_bytes(image: np.ndarray, dpi: int, lang: str, oem: Optional[int], psm: Optional[int]) -> bytes:
     """Run Tesseract on one page image and return the resulting searchable PDF.
     Args:
         image: Page image; 2-D arrays are treated as grayscale, anything else as BGR.
         dpi: Value passed to Tesseract as ``--dpi``.
         lang: Value passed to Tesseract as ``-l``.
         oem: OCR engine mode for ``--oem``; omitted from the command when ``None``.
         psm: Page segmentation mode for ``--psm``; omitted when ``None``.
     Returns:
         The PDF produced by ``pytesseract.image_to_pdf_or_hocr`` as bytes.
     """
     # Tesseract is handed RGB regardless of the input layout.
     color_code = cv2.COLOR_GRAY2RGB if image.ndim == 2 else cv2.COLOR_BGR2RGB
     rgb = cv2.cvtColor(image, color_code)
     # Upsample small pages so the longest side reaches 1500 px before OCR.
     height, width = rgb.shape[:2]
     longest = max(height, width)
     if longest < 1500:
         factor = 1500.0 / longest
         rgb = cv2.resize(
             rgb, (int(width * factor), int(height * factor)), interpolation=cv2.INTER_CUBIC
         )
     options = [f'--dpi {dpi}', f'-l {lang}']
     if oem is not None:
         options.append(f'--oem {int(oem)}')
     if psm is not None:
         options.append(f'--psm {int(psm)}')
     return pytesseract.image_to_pdf_or_hocr(rgb, extension='pdf', config=" ".join(options))
 def combine_pdfs(pdf_paths: List[Path], out_path: Path) -> None:
     """Concatenate the pages of ``pdf_paths``, in order, into one PDF at ``out_path``.
     Args:
         pdf_paths: PDF files to merge; every page of each file is appended.
         out_path: Destination file, opened for binary writing (overwritten if present).
     """
     merged = PdfWriter()
     for source in pdf_paths:
         for page in PdfReader(str(source)).pages:
             merged.add_page(page)
     with open(out_path, "wb") as sink:
         merged.write(sink)
 # -----------------------------
 # Batch helpers
 # -----------------------------
 # File suffixes (lower-case, with dot) that are treated as page images.
 VALID_EXT = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}
 def natural_key(s: str):
     """Return a sort key that orders embedded numbers by value (page2 < page10).
     The string is split on runs of decimal digits. Digit runs become ``int`` and
     the text between them is lower-cased, so the key alternates str, int, str, ...
     and two keys always hold the same type at the same position.
     Args:
         s: Text to build the key for, typically a file name.
     Returns:
         A list alternating lower-cased text pieces and integers, starting with text.
     """
     # re.split with a capturing group puts the captured digit runs at the odd
     # indexes. Deciding by position instead of str.isdigit() matters because
     # isdigit() is also true for characters such as "²" that \d does not capture
     # and int() cannot parse.
     pieces = re.split(r'(\d+)', s)
     return [int(piece) if index % 2 else piece.lower() for index, piece in enumerate(pieces)]
 def find_image_files(dir_path: Path, patterns: Sequence[str]) -> List[Path]:
     """Return the page images of ``dir_path`` in natural name order.
     Args:
         dir_path: Directory to search.
         patterns: Glob patterns evaluated relative to ``dir_path``.
     Returns:
         The regular files matching any pattern, each listed once, sorted with
         ``natural_key`` on the file name. When nothing matches, every regular
         file whose lower-cased suffix is in ``VALID_EXT`` is returned instead.
     """
     # A set removes duplicates when patterns overlap (e.g. "*.jpg" and "*.*");
     # is_file() drops directories that happen to match a pattern.
     matched = {p for pat in patterns for p in dir_path.glob(pat) if p.is_file()}
     # fallback: if no patterns matched, include known image extensions
     if not matched:
         matched = {
             p for p in dir_path.iterdir()
             if p.suffix.lower() in VALID_EXT and p.is_file()
         }
     # One sort over the whole result keeps pages in name order even when their
     # extensions differ; the full path breaks ties so the order is deterministic.
     return sorted(matched, key=lambda p: (natural_key(p.name), str(p)))
 def iter_target_dirs(root: Path, recursive: bool) -> List[Path]:
     """List the directories under ``root`` that should each become one PDF.
     Args:
         root: Batch root directory.
         recursive: When false, return the non-hidden direct child directories of
             ``root`` in sorted order. When true, walk the whole tree and return
             every non-hidden directory below ``root`` that directly contains at
             least one file with a ``VALID_EXT`` suffix, sorted naturally by its
             path relative to ``root``.
     Returns:
         The selected directories; ``root`` itself is never included.
     """
     if not recursive:
         # only direct children that are directories (ignore hidden)
         return [p for p in sorted(root.iterdir()) if p.is_dir() and not p.name.startswith(".")]
     out = []
     for d, subdirs, files in os.walk(root):
         # Prune hidden directories in place so os.walk never descends into them.
         # Only names below root are inspected: testing the components of root
         # itself would reject everything for roots such as "../scans" or a root
         # located under a dot-directory.
         subdirs[:] = [name for name in subdirs if not name.startswith(".")]
         dpath = Path(d)
         if dpath == root:
             continue
         if any(Path(f).suffix.lower() in VALID_EXT for f in files):
             out.append(dpath)
     return sorted(out, key=lambda p: natural_key(str(p.relative_to(root))))
 # -----------------------------
 # Main CLI
 # -----------------------------
 def _ocr_images_to_pdf(images, out_pdf, args, tmp_prefix, progress, done_message):
     """OCR ``images`` page by page and merge the pages that succeeded into ``out_pdf``.
     Args:
         images: Image paths in page order.
         out_pdf: Destination PDF path; its parent directory must already exist.
         args: Parsed CLI namespace; ``dpi``, ``lang``, ``oem``, ``psm`` and
             ``keep_intermediate`` are read.
         tmp_prefix: Prefix for the scratch directory name; must not contain a
             path separator.
         progress: Callable ``(index, total, image_path) -> str`` whose result is
             printed before each page is processed.
         done_message: Line printed once ``out_pdf`` has been written.
     Returns:
         ``True`` when ``out_pdf`` was written, ``False`` when every page failed.
     """
     with TemporaryDirectory(prefix=tmp_prefix) as tmp:
         tmpdir = Path(tmp)
         page_pdf_paths: List[Path] = []
         total = len(images)
         for idx, img_path in enumerate(images, start=1):
             print(progress(idx, total, img_path))
             try:
                 page_img = prepare_page(img_path)
                 pdf_bytes = image_to_ocr_pdf_bytes(
                     page_img, dpi=args.dpi, lang=args.lang, oem=args.oem, psm=args.psm
                 )
                 page_pdf = tmpdir / f"page_{idx:04d}.pdf"
                 page_pdf.write_bytes(pdf_bytes)
                 page_pdf_paths.append(page_pdf)
             except Exception as e:
                 # Deliberate best effort: one unreadable page must not abort the document.
                 print(f"ERROR processing {img_path}: {e}", file=sys.stderr)
         if not page_pdf_paths:
             return False
         combine_pdfs(page_pdf_paths, out_pdf)
         print(done_message)
         if args.keep_intermediate:
             # Copy the per-page PDFs next to the output before the scratch directory is deleted.
             keep_dir = out_pdf.parent / (out_pdf.with_suffix("").name + "_pages")
             keep_dir.mkdir(parents=True, exist_ok=True)
             for p in page_pdf_paths:
                 (keep_dir / p.name).write_bytes(p.read_bytes())
             print(f"Kept per-page PDFs in: {keep_dir}")
     return True
 def main():
     """Command-line entry point: single-PDF mode or batch mode (one PDF per subdirectory).
     Without ``--batch-root`` the positional inputs are merged into ``--output``.
     With ``--batch-root`` every target directory found by ``iter_target_dirs`` is
     written to ``--out-dir`` (default: ``<batch-root>/ocr_pdfs``).
     Exits with status 2 when Tesseract is unavailable and 1 on invalid arguments,
     missing inputs, or when no page could be processed in single-PDF mode.
     """
     import glob  # function-scope import: only the input expansion below needs it
     ap = argparse.ArgumentParser(description="Convert images to OCRed PDF(s) with auto-trim/deskew. Supports batch by subdirectory.")
     # Single-PDF mode (default when inputs provided)
     ap.add_argument("inputs", nargs="*", help="Input image files (JPG/PNG/etc.). Glob patterns ok (quote them).")
     ap.add_argument("-o", "--output", help="Output PDF path, e.g., out.pdf (required in single mode).")
     # Batch mode
     ap.add_argument("--batch-root", type=str, help="Root directory containing subdirectories of images. Each subdirectory -> 1 PDF.")
     ap.add_argument("--out-dir", type=str, help="Directory to write PDFs in batch mode.")
     ap.add_argument("--patterns", type=str, default="*.jpg,*.jpeg,*.png,*.tif,*.tiff,*.bmp", help="Comma-separated glob patterns per subdir.")
     ap.add_argument("--recursive", action="store_true", help="Recurse into nested subdirectories in batch mode.")
     # Common OCR knobs
     ap.add_argument("--dpi", type=int, default=300, help="DPI hint for Tesseract (default: 300)")
     ap.add_argument("--lang", default="eng", help="Tesseract language(s), e.g., 'eng' (default)")
     ap.add_argument("--oem", type=int, choices=[0, 1, 2, 3], default=1, help="Tesseract OCR Engine Mode (default: 1)")
     ap.add_argument("--psm", type=int, choices=list(range(0, 14)), default=6, help="Tesseract Page Segmentation Mode (default: 6)")
     ap.add_argument("--keep-intermediate", action="store_true", help="Keep per-page PDFs in a temp folder for inspection.")
     args = ap.parse_args()
     # Validate tesseract availability early
     try:
         pytesseract.get_tesseract_version()
     except Exception as e:
         print("ERROR: Tesseract is not available. Please install it and ensure it's in PATH.", file=sys.stderr)
         print(str(e), file=sys.stderr)
         sys.exit(2)
     # ---------------- Single-PDF mode ----------------
     if args.batch_root is None:
         if not args.inputs or not args.output:
             print("In single-PDF mode, provide inputs and --output. For batch mode, use --batch-root and --out-dir.", file=sys.stderr)
             sys.exit(1)
         # Expand inputs. glob.glob accepts absolute patterns, which Path().glob rejects.
         input_paths: List[Path] = []
         for pattern in args.inputs:
             if any(ch in pattern for ch in "*?[]"):
                 expanded = [Path(p) for p in sorted(glob.glob(pattern, recursive=True))]
             else:
                 expanded = [Path(pattern)]
             input_paths.extend(p for p in expanded if p.is_file())
         if not input_paths:
             print("No valid input files found.", file=sys.stderr)
             sys.exit(1)
         out_path = Path(args.output)
         out_path.parent.mkdir(parents=True, exist_ok=True)
         wrote = _ocr_images_to_pdf(
             input_paths,
             out_path,
             args,
             "ocr_pdf_",
             lambda idx, total, img_path: f"[{idx}/{total}] Processing {img_path} ...",
             f"✅ Wrote OCRed PDF: {out_path}",
         )
         if not wrote:
             print("No pages were successfully processed; aborting.", file=sys.stderr)
             sys.exit(1)
         return
     # ---------------- Batch mode ----------------
     root = Path(args.batch_root)
     if not root.is_dir():
         print(f"--batch-root is not a directory: {root}", file=sys.stderr)
         sys.exit(1)
     out_dir = Path(args.out_dir) if args.out_dir else root / "ocr_pdfs"
     out_dir.mkdir(parents=True, exist_ok=True)
     patterns = [p.strip() for p in args.patterns.split(",") if p.strip()]
     targets = iter_target_dirs(root, recursive=args.recursive)
     if not targets:
         print("No subdirectories with images found under batch root.", file=sys.stderr)
         sys.exit(1)
     for d in targets:
         rel = d.relative_to(root)
         images = find_image_files(d, patterns)
         if not images:
             print(f"[skip] {rel} — no images matching {patterns}", file=sys.stderr)
             continue
         out_pdf = out_dir / (str(rel).replace(os.sep, "__") + ".pdf")
         out_pdf.parent.mkdir(parents=True, exist_ok=True)
         print(f"\n=== {rel} → {out_pdf.name} ({len(images)} pages) ===")
         # The scratch prefix uses the flattened output name: a nested ``rel`` holds
         # path separators, which would make the temp-directory creation fail.
         wrote = _ocr_images_to_pdf(
             images,
             out_pdf,
             args,
             f"ocr_{out_pdf.stem}_",
             lambda idx, total, img_path, rel=rel: f"[{rel}] [{idx}/{total}] {img_path.name}",
             f"✅ Wrote: {out_pdf}",
         )
         if not wrote:
             print(f"[skip] {rel} — no pages processed successfully.", file=sys.stderr)
     print("\nBatch complete.")
     return
 if __name__ == "__main__":
     main()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,24 @@
 [project]
 name = "ocr-pdf"
 version = "0.1.0"
 description = "Convert smartphone JPGs to auto-trimmed, deskewed OCRed PDF (English, typewritten)."
 readme = "README.md"
 requires-python = ">=3.9"
 dependencies = [
  "opencv-python>=4.9",
  "numpy>=1.26",
  "pytesseract>=0.3.10",
  "pypdf>=5.0",
  "Pillow>=10.0",
 ]
 [project.scripts]
 ocr-pdf = "ocr_pdf:main"
 [build-system]
 requires = ["setuptools>=69"]
 build-backend = "setuptools.build_meta"
 [tool.uv]
 # You can pin python version or set custom indexes here if needed.