initial commit

This commit is contained in:
2025-11-01 18:04:28 -04:00
commit 4eb7ddfd99
5 changed files with 703 additions and 0 deletions

57
.gitignore vendored Normal file
View File

@@ -0,0 +1,57 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# Virtual Environment
.venv/
venv/
ENV/
env/
# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
# UV
uv.lock
# Project specific
out_pdfs/
# Logs
*.log
# Testing
.pytest_cache/
.coverage
htmlcov/
# mypy
.mypy_cache/
.dmypy.json
dmypy.json

84
README.md Normal file
View File

@@ -0,0 +1,84 @@
# jpegdoc2pdf
Convert smartphone JPGs of typewritten English documents into searchable **OCRed PDFs** using parallel batch processing.
## Prerequisites
Install the following tools (`convert.sh` itself is a `zsh` script, so `zsh` must be installed as well):
- **Tesseract OCR** (ensure it's in PATH)
- **img2pdf** - lossless image to PDF converter
- **ocrmypdf** - adds OCR layer to PDFs
```bash
# macOS
brew install tesseract img2pdf ocrmypdf
# Linux (Debian/Ubuntu)
apt-get install tesseract-ocr img2pdf ocrmypdf
```
## Usage
### Basic Usage
```bash
./convert.sh ROOT_DIR [OUT_DIR] [-P N] [--recursive]
```
### Examples
**Process subdirectories in ROOT with default settings:**
```bash
./convert.sh ./ROOT
```
**Specify custom output directory:**
```bash
./convert.sh ./ROOT ./my_output
```
**Use 4 parallel processes:**
```bash
./convert.sh ./ROOT ./out_pdfs -P 4
```
**Process nested subdirectories recursively:**
```bash
./convert.sh ./ROOT ./out_pdfs -P 4 --recursive
```
## Folder Structure
Organize your images with one subdirectory per PDF:
```
ROOT/
CaseA/
001.jpg
002.jpg
CaseB/
page1.jpg
page2.jpg
```
- Each subdirectory under `ROOT` becomes a single PDF
- Nested subfolders (with `--recursive`) are named like `Parent__Child.pdf`
- Output PDFs are saved to `out_pdfs/` (or your specified output directory)
## Options
- **ROOT_DIR** (required): Root directory containing subdirectories of images
- **OUT_DIR** (optional): Output directory (default: `out_pdfs`)
- **-P N** (optional): Number of parallel processes (default: CPU core count)
- **--recursive** or **-r**: Process nested subdirectories recursively
## Supported Image Formats
jpg, jpeg, png, tif, tiff (case-insensitive)
## OCR Settings
- Language: English (`eng`)
- Tesseract OEM: 1 (LSTM neural net mode)
- Page segmentation mode: 6 (uniform text block)
- Optimization level: 1

114
convert.sh Executable file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env zsh
# 並列バッチOCRzsh + xargs
# 依存: img2pdf, ocrmypdf, Tesseract
# 使い方:
# ./batch_ocr_parallel.zsh ROOT_DIR [OUT_DIR] [-P N] [--recursive]
# 例:
# ./batch_ocr_parallel.zsh ./ROOT ./out_pdfs -P 4 --recursive
emulate -L zsh
set -euo pipefail
# ---- 引数処理 ----
ROOT="${1:-}"
[[ -z "${ROOT}" ]] && { print -u2 -- "Usage: $0 ROOT_DIR [OUT_DIR] [-P N] [--recursive]"; exit 2; }
shift
OUT="out_pdfs"
PROCS=""
RECURSIVE=0
if (( $# )); then
# 2番目がディレクトリ名っぽければ OUT に採用
if [[ -n "${1:-}" && "${1:-}" != "-P" && "${1:-}" != "--recursive" && "${1:-}" != "-r" ]]; then
OUT="$1"; shift
fi
# 残りのオプション
while (( $# )); do
case "$1" in
-P) PROCS="${2:-}"; shift 2 ;;
--recursive|-r) RECURSIVE=1; shift ;;
*) print -u2 -- "Unknown arg: $1"; exit 2 ;;
esac
done
fi
# 並列度のデフォルト未指定ならCPUコア数
if [[ -z "${PROCS}" ]]; then
if command -v nproc >/dev/null 2>&1; then
PROCS="$(nproc)"
elif command -v sysctl >/dev/null 2>&1; then
PROCS="$(sysctl -n hw.ncpu)"
else
PROCS=2
fi
fi
# 出力先
mkdir -p -- "$OUT"
# ---- ワーカーサブディレクトリ1つを処理を一時ファイルに作成 ----
WORKER="$(mktemp -t ocrmypdf_worker.XXXXXX).zsh"
cat > "$WORKER" <<'ZSH'
#!/usr/bin/env zsh
emulate -L zsh
set -euo pipefail
setopt null_glob
ROOT="$1"
OUT="$2"
DIR="$3"
name="${${DIR#${ROOT%/}/}#/}" # ROOT/ 以降を相対名に(深い場合は "A/B"
base="${DIR:t}" # 末尾名(単純名)
# 画像拡張子
typeset -a exts; exts=( jpg jpeg png tif tiff JPG JPEG PNG TIF TIFF )
# 画像列挙
typeset -a imgs; imgs=()
for ext in $exts; do
imgs+=("$DIR"/*.${ext})
done
if (( ${#imgs} == 0 )); then
print -u2 -- "[skip] ${name} (no images)"
exit 0
fi
# 一時PDF
tmp_pdf="$(mktemp -t "ocr_${base}.XXXXXX").pdf"
# 1) 画像 → 無劣化PDF結合-o を先に、-- の後に入力)
img2pdf --auto-orient -o "$tmp_pdf" -- "${imgs[@]}"
# 出力ファイル名(ネストを __ でつなぐ)
out_pdf="$OUT/${name//\//__}.pdf"
mkdir -p -- "${out_pdf:h}"
# 2) OCRタイプ打ち英語向けチューニング
ocrmypdf \
-l eng \
--tesseract-oem 1 \
--tesseract-pagesegmode 6 \
--optimize 1 \
--output-type pdf \
"$tmp_pdf" "$out_pdf"
rm -f -- "$tmp_pdf"
print -r -- "✅ Wrote: $out_pdf"
ZSH
chmod +x "$WORKER"
# ---- 対象ディレクトリ列挙 → xargs 並列実行 ----
if (( RECURSIVE )); then
# ROOT 配下の全サブディレクトリROOT 自身は除外)
find "$ROOT" -mindepth 1 -type d -print0 \
| xargs -0 -I {} -n 1 -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {}
else
# 直下のサブディレクトリのみ
find "$ROOT" -mindepth 1 -maxdepth 1 -type d -print0 \
| xargs -0 -I {} -n 1 -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {}
fi
print -r -- "Batch complete. (parallel: $PROCS)"

424
ocr_pdf.py Normal file
View File

@@ -0,0 +1,424 @@
#!/usr/bin/env python3
"""
ocr_pdf.py — Convert JPGs of documents into OCRed PDFs.
Now supports **batch mode** where each subdirectory becomes one PDF.
Features
- Automatic trimming via document contour detection + perspective warp
- Deskew fallback if no clear document contour is found
- Image cleanup tuned for 1970s typewritten English pages (contrast/binarization)
- Tesseract-based OCR to embed a searchable text layer in the PDF
- Batch multiple images into one output PDF
- Batch mode: process a root folder; each subdirectory becomes its own PDF
USAGE
------
# Single-PDF mode (glob patterns allowed)
uv run ocr-pdf -o out.pdf scans/*.jpg
# Batch mode: each subdirectory under ROOT becomes a PDF
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs
# Batch mode with filters
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs --patterns "*.jpg,*.png" --recursive
DEPENDENCIES
------------
- Tesseract must be installed on your system and in PATH.
macOS (brew): brew install tesseract
Ubuntu/Debian: sudo apt-get install tesseract-ocr
Windows: Install from https://github.com/UB-Mannheim/tesseract/wiki
"""
import argparse
import os
import sys
from pathlib import Path
from typing import Optional, Tuple, List, Iterable, Sequence, Dict
import cv2
import numpy as np
import pytesseract
from pypdf import PdfReader, PdfWriter
from tempfile import TemporaryDirectory
import re
# -----------------------------
# Geometry helpers
# -----------------------------
def _order_quad_points(pts: np.ndarray) -> np.ndarray:
"""Order 4 points as (top-left, top-right, bottom-right, bottom-left)."""
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return rect
def four_point_transform(image: np.ndarray, pts: np.ndarray) -> np.ndarray:
    """Warp the quadrilateral ``pts`` of ``image`` onto an upright rectangle.

    Args:
        image: Source image.
        pts: Four ``(x, y)`` corner points of the region, in any order.

    Returns:
        The perspective-corrected crop. Its width and height are the longer
        of the two opposing edge lengths, truncated to integers.
    """
    corners = _order_quad_points(pts.astype("float32"))
    top_left, top_right, bottom_right, bottom_left = corners

    # Output width: the longer of the bottom and top edges.
    out_w = int(max(
        np.linalg.norm(bottom_right - bottom_left),
        np.linalg.norm(top_right - top_left),
    ))
    # Output height: the longer of the right and left edges.
    out_h = int(max(
        np.linalg.norm(top_right - bottom_right),
        np.linalg.norm(top_left - bottom_left),
    ))

    target = np.array(
        [
            [0, 0],
            [out_w - 1, 0],
            [out_w - 1, out_h - 1],
            [0, out_h - 1],
        ],
        dtype="float32",
    )
    transform = cv2.getPerspectiveTransform(corners, target)
    return cv2.warpPerspective(image, transform, (out_w, out_h))
# -----------------------------
# Document detection and cleanup
# -----------------------------
def detect_document_contour(image_bgr: np.ndarray) -> Optional[np.ndarray]:
    """Locate a four-cornered outline that looks like a document page.

    Edge detection runs on a copy scaled so that the longest side is about
    1000 px. The ten largest contours are examined in descending area order
    and the first one that simplifies to exactly four vertices is returned.

    Args:
        image_bgr: Colour image in BGR channel order.

    Returns:
        A ``(4, 2)`` float32 array of corner coordinates in the original
        image's pixel space, or ``None`` when no such contour is found.
    """
    full_h, full_w = image_bgr.shape[:2]
    ratio = 1000.0 / max(full_h, full_w)  # scale longest side to ~1000px for speed
    small = cv2.resize(
        image_bgr.copy(),
        (int(full_w * ratio), int(full_h * ratio)),
        interpolation=cv2.INTER_AREA,
    )

    blurred = cv2.GaussianBlur(cv2.cvtColor(small, cv2.COLOR_BGR2GRAY), (5, 5), 0)
    edge_map = cv2.Canny(blurred, 60, 180)
    # Morphological closing bridges small gaps in the page outline.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edge_map = cv2.morphologyEx(edge_map, cv2.MORPH_CLOSE, closing_kernel, iterations=1)

    found, _ = cv2.findContours(edge_map, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    by_area = sorted(found, key=cv2.contourArea, reverse=True)
    for candidate in by_area[:10]:
        perimeter = cv2.arcLength(candidate, True)
        polygon = cv2.approxPolyDP(candidate, 0.02 * perimeter, True)
        if len(polygon) != 4:
            continue
        # Map the corners back to the coordinates of the full-size image.
        return (polygon.reshape(4, 2) / ratio).astype(np.float32)
    return None
def deskew(image_gray: np.ndarray) -> np.ndarray:
    """Straighten a page by rotating it by its estimated skew angle.

    The skew is the median angle of up to 200 Hough lines detected on an
    Otsu-binarized, inverted copy of the image, with each angle folded into
    the range [-45, 45] degrees.

    Args:
        image_gray: Single-channel image.

    Returns:
        The input itself when the estimated skew is below 0.1 degrees (or no
        lines were found); otherwise a rotated image of the same size.
    """
    blurred = cv2.GaussianBlur(image_gray, (3, 3), 0)
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    edge_map = cv2.Canny(cv2.bitwise_not(binary), 50, 150, apertureSize=3)
    hough = cv2.HoughLines(edge_map, 1, np.pi / 180.0, 150)

    skew = 0.0
    if hough is not None:
        tilts = []
        for entry in hough[:200]:
            _rho, theta = entry[0]
            # Angle of the line relative to horizontal, in degrees.
            tilt = (theta * 180.0 / np.pi) - 90.0
            # Fold into [-45, 45] so near-vertical lines do not dominate.
            if tilt < -45:
                tilt += 90
            if tilt > 45:
                tilt -= 90
            tilts.append(tilt)
        if tilts:
            skew = float(np.median(tilts))

    if abs(skew) < 0.1:
        return image_gray  # no significant skew

    height, width = image_gray.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), skew, 1.0)
    return cv2.warpAffine(
        image_gray,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )
def cleanup_for_ocr(image_bgr: np.ndarray) -> np.ndarray:
    """Produce a contrast-enhanced, denoised, thresholded page for OCR.

    Steps, in order: grayscale conversion, CLAHE contrast equalization,
    bilateral denoising, deskewing, Gaussian adaptive thresholding, and a
    2x2 morphological opening.

    Args:
        image_bgr: Colour image in BGR channel order.

    Returns:
        The single-channel thresholded image.
    """
    mono = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # Local contrast equalization brings out faint ink without clipping
    # the highlights.
    equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    mono = equalizer.apply(mono)

    # The bilateral filter smooths noise while preserving glyph edges.
    mono = cv2.bilateralFilter(mono, d=7, sigmaColor=50, sigmaSpace=50)

    # Straighten the page once contrast and noise have been dealt with.
    mono = deskew(mono)

    binary = cv2.adaptiveThreshold(
        mono,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        35,
        15,
    )

    # Opening with a tiny kernel removes isolated speckles.
    speckle_kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(binary, cv2.MORPH_OPEN, speckle_kernel, iterations=1)
def prepare_page(image_path: Path) -> np.ndarray:
    """Load one page image, crop it to the detected document, and clean it.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The cleaned single-channel page produced by ``cleanup_for_ocr``.

    Raises:
        RuntimeError: If the file contents cannot be decoded as an image.
    """
    # Read the raw bytes and decode them in memory instead of handing the
    # path to OpenCV.
    raw = np.fromfile(str(image_path), dtype=np.uint8)
    page = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if page is None:
        raise RuntimeError(f"Failed to load image: {image_path}")

    outline = detect_document_contour(page)
    if outline is not None:
        page = four_point_transform(page, outline)
    # With no outline found, the original framing is kept as-is.
    return cleanup_for_ocr(page)
# -----------------------------
# OCR + PDF assembly
# -----------------------------
def image_to_ocr_pdf_bytes(image: np.ndarray, dpi: int, lang: str, oem: Optional[int], psm: Optional[int]) -> bytes:
    """Run Tesseract on one page image and return a searchable PDF.

    Args:
        image: Page image; a 2-D array is treated as grayscale, anything else
            as BGR.
        dpi: DPI hint passed to Tesseract.
        lang: Tesseract language string, passed via ``-l``.
        oem: OCR engine mode, or ``None`` to omit the ``--oem`` flag.
        psm: Page segmentation mode, or ``None`` to omit the ``--psm`` flag.

    Returns:
        The PDF document produced by Tesseract, as bytes.
    """
    # Tesseract is given an RGB image.
    to_rgb = cv2.COLOR_GRAY2RGB if len(image.shape) == 2 else cv2.COLOR_BGR2RGB
    rgb = cv2.cvtColor(image, to_rgb)

    # Enlarge small pages so that the longest side reaches 1500 px.
    height, width = rgb.shape[:2]
    longest = max(height, width)
    if longest < 1500:
        factor = 1500.0 / longest
        rgb = cv2.resize(
            rgb,
            (int(width * factor), int(height * factor)),
            interpolation=cv2.INTER_CUBIC,
        )

    flags = [f'--dpi {dpi}', f'-l {lang}']
    if oem is not None:
        flags.append(f'--oem {int(oem)}')
    if psm is not None:
        flags.append(f'--psm {int(psm)}')

    return pytesseract.image_to_pdf_or_hocr(rgb, extension='pdf', config=" ".join(flags))
def combine_pdfs(pdf_paths: List[Path], out_path: Path) -> None:
    """Concatenate the pages of several PDFs into a single file.

    Args:
        pdf_paths: Source PDFs, in the order their pages should appear.
        out_path: Destination file; it is created or overwritten.
    """
    merged = PdfWriter()
    for source in pdf_paths:
        for page in PdfReader(str(source)).pages:
            merged.add_page(page)
    with open(out_path, "wb") as sink:
        merged.write(sink)
# -----------------------------
# Batch helpers
# -----------------------------
# Lower-case file suffixes treated as images. Used by the extension fallback
# in find_image_files() and by the recursive scan in iter_target_dirs(), both
# of which lower-case the suffix before the lookup.
VALID_EXT = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}
def natural_key(s: str):
    """Return a sort key that orders embedded numbers by value.

    The string is split into alternating text and digit runs; digit runs
    become ``int`` and text runs are lower-cased, so ``page2`` sorts before
    ``page10`` and comparison is case-insensitive.

    Args:
        s: The string to build a key for (typically a file name).

    Returns:
        A list alternating ``str`` and ``int`` items, always starting with a
        (possibly empty) ``str``.
    """
    # re.split with a capturing group puts the matched digit runs at the odd
    # indices. Deciding by position rather than by str.isdigit() matters:
    # isdigit() is also true for characters such as "²" that int() rejects,
    # which used to raise ValueError and could misalign the str/int pattern.
    tokens = re.split(r'(\d+)', s)
    return [int(tok) if i % 2 else tok.lower() for i, tok in enumerate(tokens)]
def find_image_files(dir_path: Path, patterns: Sequence[str]) -> List[Path]:
    """List the image files of one directory in natural page order.

    Args:
        dir_path: Directory to search.
        patterns: Glob patterns evaluated relative to ``dir_path``.

    Returns:
        The regular files matching any pattern, each listed once, sorted by
        file name with ``natural_key``. If no pattern matches any file, every
        regular file directly inside ``dir_path`` whose lower-cased suffix is
        in ``VALID_EXT`` is returned instead, sorted the same way.
    """
    seen = set()
    matched: List[Path] = []
    for pat in patterns:
        for p in dir_path.glob(pat):
            # Overlapping patterns must not duplicate a page, and a directory
            # that happens to match (e.g. "backup.jpg/") is not an image.
            if p in seen or not p.is_file():
                continue
            seen.add(p)
            matched.append(p)

    # Fallback: if no patterns matched, include known image extensions.
    if not matched:
        matched = [
            p for p in dir_path.iterdir()
            if p.suffix.lower() in VALID_EXT and p.is_file()
        ]

    # Sort once across all patterns so page order follows the file names
    # rather than being grouped by extension.
    return sorted(matched, key=lambda p: natural_key(p.name))
def iter_target_dirs(root: Path, recursive: bool) -> List[Path]:
    """Return the directories under ``root`` that should each become a PDF.

    Args:
        root: Batch root directory.
        recursive: When false, return only the direct, non-hidden child
            directories of ``root``. When true, walk the tree and return
            every non-hidden directory below ``root`` that directly contains
            at least one file whose suffix is in ``VALID_EXT``.

    Returns:
        The selected directories in natural order of their path relative to
        ``root``. ``root`` itself is never included.
    """
    if not recursive:
        # Only direct children that are directories (ignore hidden).
        children = [
            p for p in root.iterdir()
            if p.is_dir() and not p.name.startswith(".")
        ]
        return sorted(children, key=lambda p: natural_key(p.name))

    found: List[Path] = []
    for d, subdirs, files in os.walk(root):
        # Prune hidden directories in place so os.walk never descends into
        # them. Only names *below* root are inspected: testing every
        # component of the full path would reject everything when root is
        # reached through "..", or lives inside a dot-directory.
        subdirs[:] = [name for name in subdirs if not name.startswith(".")]
        dpath = Path(d)
        if dpath == root:
            continue
        if any(Path(f).suffix.lower() in VALID_EXT for f in files):
            found.append(dpath)
    return sorted(found, key=lambda p: natural_key(str(p.relative_to(root))))
# -----------------------------
# Main CLI
# -----------------------------
def _ocr_images_to_pdf(images, out_pdf, args, tmp_prefix, progress, keep_dir=None) -> bool:
    """OCR each image into a one-page PDF and merge the pages into ``out_pdf``.

    Args:
        images: Image paths in page order.
        out_pdf: Path of the combined PDF to write.
        args: Parsed CLI namespace; ``dpi``, ``lang``, ``oem`` and ``psm``
            are read from it.
        tmp_prefix: Prefix for the temporary directory holding the per-page
            PDFs. Must not contain a path separator.
        progress: Callable ``(idx, total, img_path) -> str`` returning the
            progress line printed before each page is processed.
        keep_dir: If given, the per-page PDFs are copied into this directory
            after the combined PDF has been written.

    Returns:
        ``True`` if ``out_pdf`` was written, ``False`` if every page failed.
    """
    total = len(images)
    with TemporaryDirectory(prefix=tmp_prefix) as tmp:
        tmp_path = Path(tmp)
        page_pdf_paths: List[Path] = []
        for idx, img_path in enumerate(images, start=1):
            print(progress(idx, total, img_path))
            # Best effort by design: a page that fails is reported and
            # skipped so the remaining pages still end up in the PDF.
            try:
                page_img = prepare_page(img_path)
                pdf_bytes = image_to_ocr_pdf_bytes(
                    page_img, dpi=args.dpi, lang=args.lang, oem=args.oem, psm=args.psm
                )
                page_pdf = tmp_path / f"page_{idx:04d}.pdf"
                page_pdf.write_bytes(pdf_bytes)
                page_pdf_paths.append(page_pdf)
            except Exception as e:
                print(f"ERROR processing {img_path}: {e}", file=sys.stderr)

        if not page_pdf_paths:
            return False

        combine_pdfs(page_pdf_paths, out_pdf)

        # Copy the pages out before the temporary directory is deleted.
        if keep_dir is not None:
            keep_dir.mkdir(parents=True, exist_ok=True)
            for p in page_pdf_paths:
                (keep_dir / p.name).write_bytes(p.read_bytes())
    return True


def main():
    """Command-line entry point.

    Without ``--batch-root`` the positional inputs are combined into the
    single PDF named by ``--output``. With ``--batch-root`` each target
    directory under the root becomes one PDF in ``--out-dir`` (default:
    ``<root>/ocr_pdfs``).

    Exits with status 2 when Tesseract is unavailable and with status 1 on
    invalid arguments or when nothing could be processed.
    """
    # Function-scope import: only the single-PDF input expansion needs it.
    import glob

    ap = argparse.ArgumentParser(description="Convert images to OCRed PDF(s) with auto-trim/deskew. Supports batch by subdirectory.")
    # Single-PDF mode (default when inputs provided)
    ap.add_argument("inputs", nargs="*", help="Input image files (JPG/PNG/etc.). Glob patterns ok (quote them).")
    ap.add_argument("-o", "--output", help="Output PDF path, e.g., out.pdf (required in single mode).")
    # Batch mode
    ap.add_argument("--batch-root", type=str, help="Root directory containing subdirectories of images. Each subdirectory -> 1 PDF.")
    ap.add_argument("--out-dir", type=str, help="Directory to write PDFs in batch mode.")
    ap.add_argument("--patterns", type=str, default="*.jpg,*.jpeg,*.png,*.tif,*.tiff,*.bmp", help="Comma-separated glob patterns per subdir.")
    ap.add_argument("--recursive", action="store_true", help="Recurse into nested subdirectories in batch mode.")
    # Common OCR knobs
    ap.add_argument("--dpi", type=int, default=300, help="DPI hint for Tesseract (default: 300)")
    ap.add_argument("--lang", default="eng", help="Tesseract language(s), e.g., 'eng' (default)")
    ap.add_argument("--oem", type=int, choices=[0, 1, 2, 3], default=1, help="Tesseract OCR Engine Mode (default: 1)")
    ap.add_argument("--psm", type=int, choices=list(range(0, 14)), default=6, help="Tesseract Page Segmentation Mode (default: 6)")
    ap.add_argument("--keep-intermediate", action="store_true", help="Keep per-page PDFs in a temp folder for inspection.")
    args = ap.parse_args()

    # Validate tesseract availability early
    try:
        _ = pytesseract.get_tesseract_version()
    except Exception as e:
        print("ERROR: Tesseract is not available. Please install it and ensure it's in PATH.", file=sys.stderr)
        print(str(e), file=sys.stderr)
        sys.exit(2)

    # ---------------- Single-PDF mode ----------------
    if args.batch_root is None:
        if not args.inputs or not args.output:
            print("In single-PDF mode, provide inputs and --output. For batch mode, use --batch-root and --out-dir.", file=sys.stderr)
            sys.exit(1)

        # Expand inputs
        input_paths: List[Path] = []
        for pattern in args.inputs:
            literal = Path(pattern)
            if literal.is_file():
                # An existing file wins even if its name contains glob
                # characters such as "[" or "]".
                expanded = [literal]
            elif any(ch in pattern for ch in "*?[]"):
                # glob.glob accepts absolute patterns, which Path().glob()
                # rejects; natural ordering keeps page2 before page10.
                expanded = sorted(
                    (Path(m) for m in glob.glob(pattern)),
                    key=lambda p: natural_key(str(p)),
                )
            else:
                expanded = []
            input_paths.extend(p for p in expanded if p.is_file())

        if not input_paths:
            print("No valid input files found.", file=sys.stderr)
            sys.exit(1)

        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)

        keep_dir = None
        if args.keep_intermediate:
            # "<output stem>_pages" next to the output PDF.
            keep_dir = out_path.parent / (out_path.with_suffix("").name + "_pages")

        written = _ocr_images_to_pdf(
            input_paths,
            out_path,
            args,
            tmp_prefix="ocr_pdf_",
            progress=lambda idx, total, img: f"[{idx}/{total}] Processing {img} ...",
            keep_dir=keep_dir,
        )
        if not written:
            print("No pages were successfully processed; aborting.", file=sys.stderr)
            sys.exit(1)
        print(f"✅ Wrote OCRed PDF: {out_path}")
        if keep_dir is not None:
            print(f"Kept per-page PDFs in: {keep_dir}")
        return

    # ---------------- Batch mode ----------------
    root = Path(args.batch_root)
    if not root.is_dir():
        print(f"--batch-root is not a directory: {root}", file=sys.stderr)
        sys.exit(1)

    out_dir = Path(args.out_dir) if args.out_dir else root / "ocr_pdfs"
    out_dir.mkdir(parents=True, exist_ok=True)
    patterns = [p.strip() for p in args.patterns.split(",") if p.strip()]

    targets = iter_target_dirs(root, recursive=args.recursive)
    if not targets:
        print("No subdirectories with images found under batch root.", file=sys.stderr)
        sys.exit(1)

    for d in targets:
        rel = d.relative_to(root)
        images = find_image_files(d, patterns)
        if not images:
            print(f"[skip] {rel} — no images matching {patterns}", file=sys.stderr)
            continue

        # Flatten nested paths ("A/B" -> "A__B"). The same flattened name is
        # used for the temp-dir prefix, because a prefix containing a path
        # separator makes the temporary directory creation fail.
        flat_name = str(rel).replace(os.sep, "__")
        out_pdf = out_dir / (flat_name + ".pdf")
        out_pdf.parent.mkdir(parents=True, exist_ok=True)
        print(f"\n=== {rel} → {out_pdf.name} ({len(images)} pages) ===")

        written = _ocr_images_to_pdf(
            images,
            out_pdf,
            args,
            tmp_prefix=f"ocr_{flat_name}_",
            progress=lambda idx, total, img, rel=rel: f"[{rel}] [{idx}/{total}] {img.name}",
        )
        if not written:
            print(f"[skip] {rel} — no pages processed successfully.", file=sys.stderr)
            continue
        print(f"✅ Wrote: {out_pdf}")

    print("\nBatch complete.")
    return
# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

24
pyproject.toml Normal file
View File

@@ -0,0 +1,24 @@
[project]
name = "ocr-pdf"
version = "0.1.0"
description = "Convert smartphone JPGs to auto-trimmed, deskewed OCRed PDF (English, typewritten)."
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
"opencv-python>=4.9",
"numpy>=1.26",
"pytesseract>=0.3.10",
"pypdf>=5.0",
"Pillow>=10.0",
]
[project.scripts]
ocr-pdf = "ocr_pdf:main"
[build-system]
requires = ["setuptools>=69"]
build-backend = "setuptools.build_meta"
[tool.uv]
# You can pin python version or set custom indexes here if needed.