initial commit
.gitignore (vendored, new file, 57 lines)
@@ -0,0 +1,57 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environment
.venv/
venv/
ENV/
env/

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# UV
uv.lock

# Project specific
out_pdfs/

# Logs
*.log

# Testing
.pytest_cache/
.coverage
htmlcov/

# mypy
.mypy_cache/
.dmypy.json
dmypy.json
README.md (new file, 84 lines)
@@ -0,0 +1,84 @@
# jpegdoc2pdf

Convert smartphone JPGs of typewritten English documents into searchable **OCRed PDFs** using parallel batch processing.

## Prerequisites

Install the following tools:
- **Tesseract OCR** (ensure it's in PATH)
- **img2pdf** - lossless image to PDF converter
- **ocrmypdf** - adds OCR layer to PDFs

```bash
# macOS
brew install tesseract img2pdf ocrmypdf

# Linux (Debian/Ubuntu)
apt-get install tesseract-ocr img2pdf ocrmypdf
```
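To confirm the tools are actually on `PATH` before a long batch run, an optional quick check (these are standard flags of the installed CLIs, not part of this repo) is:

```bash
# Each command should print a version string if installation succeeded
tesseract --version | head -n 1
img2pdf --version
ocrmypdf --version
```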
## Usage

### Basic Usage

```bash
./convert.sh ROOT_DIR [OUT_DIR] [-P N] [--recursive]
```

### Examples

**Process subdirectories in ROOT with default settings:**
```bash
./convert.sh ./ROOT
```

**Specify custom output directory:**
```bash
./convert.sh ./ROOT ./my_output
```

**Use 4 parallel processes:**
```bash
./convert.sh ./ROOT ./out_pdfs -P 4
```

**Process nested subdirectories recursively:**
```bash
./convert.sh ./ROOT ./out_pdfs -P 4 --recursive
```

## Folder Structure

Organize your images with one subdirectory per PDF:

```
ROOT/
  CaseA/
    001.jpg
    002.jpg
  CaseB/
    page1.jpg
    page2.jpg
```

- Each subdirectory under `ROOT` becomes a single PDF
- Nested subfolders (with `--recursive`) are named like `Parent__Child.pdf` (see the example below)
- Output PDFs are saved to `out_pdfs/` (or your specified output directory)
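For example, running `./convert.sh ./ROOT` on the tree above yields `out_pdfs/CaseA.pdf` and `out_pdfs/CaseB.pdf`. With `--recursive`, a hypothetical nested folder such as `ROOT/CaseB/Inserts/` would additionally produce its own PDF:

```
out_pdfs/
  CaseA.pdf
  CaseB.pdf
  CaseB__Inserts.pdf   (only with --recursive)
```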
## Options

- **ROOT_DIR** (required): Root directory containing subdirectories of images
- **OUT_DIR** (optional): Output directory (default: `out_pdfs`)
- **-P N** (optional): Number of parallel processes (default: CPU core count)
- **--recursive** or **-r**: Process nested subdirectories recursively

## Supported Image Formats

jpg, jpeg, png, tif, tiff (case-insensitive)

## OCR Settings

- Language: English (`eng`)
- Tesseract OEM: 1 (LSTM neural net mode)
- Page segmentation mode: 6 (uniform text block)
- Optimization level: 1
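These settings mirror the flags that `convert.sh` passes to `ocrmypdf` for each merged PDF, roughly equivalent to running:

```bash
# input.pdf / output.pdf stand in for the temporary merged PDF and the final output path
ocrmypdf \
  -l eng \
  --tesseract-oem 1 \
  --tesseract-pagesegmode 6 \
  --optimize 1 \
  --output-type pdf \
  input.pdf output.pdf
```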
convert.sh (executable, new file, 114 lines)
@@ -0,0 +1,114 @@
#!/usr/bin/env zsh
# Parallel batch OCR (zsh + xargs)
# Dependencies: img2pdf, ocrmypdf, Tesseract
# Usage:
#   ./convert.sh ROOT_DIR [OUT_DIR] [-P N] [--recursive]
# Example:
#   ./convert.sh ./ROOT ./out_pdfs -P 4 --recursive

emulate -L zsh
set -euo pipefail

# ---- Argument handling ----
ROOT="${1:-}"
[[ -z "${ROOT}" ]] && { print -u2 -- "Usage: $0 ROOT_DIR [OUT_DIR] [-P N] [--recursive]"; exit 2; }
shift

OUT="out_pdfs"
PROCS=""
RECURSIVE=0

if (( $# )); then
  # If the second argument looks like a directory name, use it as OUT
  if [[ -n "${1:-}" && "${1:-}" != "-P" && "${1:-}" != "--recursive" && "${1:-}" != "-r" ]]; then
    OUT="$1"; shift
  fi
  # Remaining options
  while (( $# )); do
    case "$1" in
      -P) PROCS="${2:-}"; shift 2 ;;
      --recursive|-r) RECURSIVE=1; shift ;;
      *) print -u2 -- "Unknown arg: $1"; exit 2 ;;
    esac
  done
fi

# Default parallelism (CPU core count if not specified)
if [[ -z "${PROCS}" ]]; then
  if command -v nproc >/dev/null 2>&1; then
    PROCS="$(nproc)"
  elif command -v sysctl >/dev/null 2>&1; then
    PROCS="$(sysctl -n hw.ncpu)"
  else
    PROCS=2
  fi
fi

# Output directory
mkdir -p -- "$OUT"

# ---- Write the worker (processes one subdirectory) to a temporary file ----
WORKER="$(mktemp -t ocrmypdf_worker.XXXXXX).zsh"
cat > "$WORKER" <<'ZSH'
#!/usr/bin/env zsh
emulate -L zsh
set -euo pipefail
setopt null_glob

ROOT="$1"
OUT="$2"
DIR="$3"

name="${${DIR#${ROOT%/}/}#/}"   # path relative to ROOT/ ("A/B" when nested)
base="${DIR:t}"                 # last path component (simple name)

# Image extensions
typeset -a exts; exts=( jpg jpeg png tif tiff JPG JPEG PNG TIF TIFF )

# Enumerate images
typeset -a imgs; imgs=()
for ext in $exts; do
  imgs+=("$DIR"/*.${ext})
done

if (( ${#imgs} == 0 )); then
  print -u2 -- "[skip] ${name} (no images)"
  exit 0
fi

# Temporary PDF
tmp_pdf="$(mktemp -t "ocr_${base}.XXXXXX").pdf"

# 1) Images -> lossless PDF merge (-o first, inputs after --)
img2pdf --auto-orient -o "$tmp_pdf" -- "${imgs[@]}"

# Output file name (join nested path components with __)
out_pdf="$OUT/${name//\//__}.pdf"
mkdir -p -- "${out_pdf:h}"

# 2) OCR (tuned for typewritten English)
ocrmypdf \
  -l eng \
  --tesseract-oem 1 \
  --tesseract-pagesegmode 6 \
  --optimize 1 \
  --output-type pdf \
  "$tmp_pdf" "$out_pdf"

rm -f -- "$tmp_pdf"
print -r -- "✅ Wrote: $out_pdf"
ZSH
chmod +x "$WORKER"

# ---- Enumerate target directories -> run in parallel with xargs ----
if (( RECURSIVE )); then
  # All subdirectories under ROOT (excluding ROOT itself)
  find "$ROOT" -mindepth 1 -type d -print0 \
    | xargs -0 -I {} -n 1 -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {}
else
  # Direct subdirectories only
  find "$ROOT" -mindepth 1 -maxdepth 1 -type d -print0 \
    | xargs -0 -I {} -n 1 -P "$PROCS" "$WORKER" "$ROOT" "$OUT" {}
fi

print -r -- "Batch complete. (parallel: $PROCS)"
ocr_pdf.py (new file, 424 lines)
@@ -0,0 +1,424 @@
#!/usr/bin/env python3
"""
ocr_pdf.py — Convert JPGs of documents into OCRed PDFs.
Now supports **batch mode** where each subdirectory becomes one PDF.

Features
- Automatic trimming via document contour detection + perspective warp
- Deskew fallback if no clear document contour is found
- Image cleanup tuned for 1970s typewritten English pages (contrast/binarization)
- Tesseract-based OCR to embed a searchable text layer in the PDF
- Batch multiple images into one output PDF
- Batch mode: process a root folder; each subdirectory becomes its own PDF

USAGE
------
# Single-PDF mode (glob patterns allowed)
uv run ocr-pdf -o out.pdf scans/*.jpg

# Batch mode: each subdirectory under ROOT becomes a PDF
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs

# Batch mode with filters
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs --patterns "*.jpg,*.png" --recursive

DEPENDENCIES
------------
- Tesseract must be installed on your system and in PATH.
  macOS (brew):   brew install tesseract
  Ubuntu/Debian:  sudo apt-get install tesseract-ocr
  Windows:        Install from https://github.com/UB-Mannheim/tesseract/wiki
"""

import argparse
import os
import sys
from pathlib import Path
from typing import Optional, Tuple, List, Iterable, Sequence, Dict

import cv2
import numpy as np
import pytesseract
from pypdf import PdfReader, PdfWriter
from tempfile import TemporaryDirectory
import re


# -----------------------------
# Geometry helpers
# -----------------------------

def _order_quad_points(pts: np.ndarray) -> np.ndarray:
    """Order 4 points as (top-left, top-right, bottom-right, bottom-left)."""
    rect = np.zeros((4, 2), dtype="float32")
    s = pts.sum(axis=1)
    rect[0] = pts[np.argmin(s)]
    rect[2] = pts[np.argmax(s)]
    diff = np.diff(pts, axis=1)
    rect[1] = pts[np.argmin(diff)]
    rect[3] = pts[np.argmax(diff)]
    return rect


def four_point_transform(image: np.ndarray, pts: np.ndarray) -> np.ndarray:
    rect = _order_quad_points(pts.astype("float32"))
    (tl, tr, br, bl) = rect

    # Compute the width of the new image
    widthA = np.linalg.norm(br - bl)
    widthB = np.linalg.norm(tr - tl)
    maxWidth = int(max(widthA, widthB))

    # Compute the height of the new image
    heightA = np.linalg.norm(tr - br)
    heightB = np.linalg.norm(tl - bl)
    maxHeight = int(max(heightA, heightB))

    dst = np.array([
        [0, 0],
        [maxWidth - 1, 0],
        [maxWidth - 1, maxHeight - 1],
        [0, maxHeight - 1]], dtype="float32")

    M = cv2.getPerspectiveTransform(rect, dst)
    warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
    return warped


# -----------------------------
# Document detection and cleanup
# -----------------------------

def detect_document_contour(image_bgr: np.ndarray) -> Optional[np.ndarray]:
    """Find the largest 4-point contour that looks like a document page."""
    image = image_bgr.copy()
    ratio = 1000.0 / max(image.shape[:2])  # scale longest side to ~1000px for speed
    small = cv2.resize(image, (int(image.shape[1]*ratio), int(image.shape[0]*ratio)),
                       interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(gray, 60, 180)

    # Close gaps
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    edges = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel, iterations=1)

    contours, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)

    for c in contours[:10]:
        peri = cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, 0.02 * peri, True)
        if len(approx) == 4:
            # Scale contour back to original image coords
            approx = (approx.reshape(4, 2) / ratio).astype(np.float32)
            return approx
    return None


def deskew(image_gray: np.ndarray) -> np.ndarray:
    """Estimate skew angle with Hough transform on text lines; rotate to correct."""
    # Binary for line detection
    g = cv2.GaussianBlur(image_gray, (3, 3), 0)
    _, bw = cv2.threshold(g, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    inv = cv2.bitwise_not(bw)

    edges = cv2.Canny(inv, 50, 150, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi / 180.0, 150)

    angle_deg = 0.0
    if lines is not None:
        angles = []
        for rho_theta in lines[:200]:
            rho, theta = rho_theta[0]
            # Convert to degrees relative to horizontal
            deg = (theta * 180.0 / np.pi) - 90.0
            # Normalize to [-45, 45] to avoid vertical lines
            if deg < -45: deg += 90
            if deg > 45: deg -= 90
            angles.append(deg)
        if angles:
            angle_deg = float(np.median(angles))

    if abs(angle_deg) < 0.1:
        return image_gray  # no significant skew

    (h, w) = image_gray.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle_deg, 1.0)
    rotated = cv2.warpAffine(image_gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated


def cleanup_for_ocr(image_bgr: np.ndarray) -> np.ndarray:
    """Return a high-contrast, noise-reduced grayscale image suitable for OCR."""
    gray = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2GRAY)

    # CLAHE to recover typewriter ink contrast without blowing highlights
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    gray = clahe.apply(gray)

    # Gentle denoise to preserve glyph edges
    gray = cv2.bilateralFilter(gray, d=7, sigmaColor=50, sigmaSpace=50)

    # Deskew after contrast/denoise
    gray = deskew(gray)

    # Adaptive threshold tends to work well on aged paper; keep grayscale if needed
    th = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 15
    )

    # Remove small speckles
    kernel = np.ones((2, 2), np.uint8)
    th = cv2.morphologyEx(th, cv2.MORPH_OPEN, kernel, iterations=1)

    return th


def prepare_page(image_path: Path) -> np.ndarray:
    """Load, auto-trim (if possible), and clean up a single page image. Returns grayscale uint8 image."""
    bgr = cv2.imdecode(np.fromfile(str(image_path), dtype=np.uint8), cv2.IMREAD_COLOR)
    if bgr is None:
        raise RuntimeError(f"Failed to load image: {image_path}")

    cnt = detect_document_contour(bgr)
    if cnt is not None:
        warped = four_point_transform(bgr, cnt)
    else:
        warped = bgr  # fall back to original framing

    cleaned = cleanup_for_ocr(warped)
    return cleaned


# -----------------------------
# OCR + PDF assembly
# -----------------------------

def image_to_ocr_pdf_bytes(image: np.ndarray, dpi: int, lang: str, oem: Optional[int], psm: Optional[int]) -> bytes:
    """Use Tesseract to produce searchable PDF bytes for one image page."""
    # Tesseract prefers RGB
    if len(image.shape) == 2:
        rgb = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    else:
        rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Hint DPI via config; upsample if very small to approach ~300 dpi text size
    h, w = rgb.shape[:2]
    scale = 1.0
    if max(h, w) < 1500:
        scale = 1500.0 / max(h, w)
        rgb = cv2.resize(rgb, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_CUBIC)

    config_parts = [f'--dpi {dpi}', f'-l {lang}']
    if oem is not None:
        config_parts.append(f'--oem {int(oem)}')
    if psm is not None:
        config_parts.append(f'--psm {int(psm)}')
    config = " ".join(config_parts)

    pdf_bytes = pytesseract.image_to_pdf_or_hocr(rgb, extension='pdf', config=config)
    return pdf_bytes


def combine_pdfs(pdf_paths: List[Path], out_path: Path) -> None:
    writer = PdfWriter()
    for p in pdf_paths:
        reader = PdfReader(str(p))
        for page in reader.pages:
            writer.add_page(page)
    with open(out_path, "wb") as f:
        writer.write(f)


# -----------------------------
# Batch helpers
# -----------------------------

VALID_EXT = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}

def natural_key(s: str):
    """Sort key that groups numbers naturally (e.g., page2 < page10)."""
    return [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)]

def find_image_files(dir_path: Path, patterns: Sequence[str]) -> List[Path]:
    files: List[Path] = []
    for pat in patterns:
        files.extend(sorted(dir_path.glob(pat), key=lambda p: natural_key(p.name)))
    # fallback: if no patterns matched, include known image extensions
    if not files:
        for p in sorted(dir_path.iterdir(), key=lambda p: natural_key(p.name)):
            if p.suffix.lower() in VALID_EXT and p.is_file():
                files.append(p)
    return files

def iter_target_dirs(root: Path, recursive: bool) -> List[Path]:
    if not recursive:
        # only direct children that are directories (ignore hidden)
        return [p for p in sorted(root.iterdir()) if p.is_dir() and not p.name.startswith(".")]
    # walk recursively; include any directory that contains at least one image
    out = []
    for d, subdirs, files in os.walk(root):
        dpath = Path(d)
        if dpath == root:
            continue
        # skip hidden directories (checked relative to root so a hidden component in root itself is ignored)
        if any(part.startswith(".") for part in dpath.relative_to(root).parts):
            continue
        for f in files:
            if Path(f).suffix.lower() in VALID_EXT:
                out.append(dpath)
                break
    out = sorted(set(out), key=lambda p: natural_key(str(p.relative_to(root))))
    return out


# -----------------------------
# Main CLI
# -----------------------------

def main():
    ap = argparse.ArgumentParser(description="Convert images to OCRed PDF(s) with auto-trim/deskew. Supports batch by subdirectory.")
    mode = ap.add_mutually_exclusive_group(required=False)
    # Single-PDF mode (default when inputs provided)
    ap.add_argument("inputs", nargs="*", help="Input image files (JPG/PNG/etc.). Glob patterns ok (quote them).")
    ap.add_argument("-o", "--output", help="Output PDF path, e.g., out.pdf (required in single mode).")

    # Batch mode
    ap.add_argument("--batch-root", type=str, help="Root directory containing subdirectories of images. Each subdirectory -> 1 PDF.")
    ap.add_argument("--out-dir", type=str, help="Directory to write PDFs in batch mode.")
    ap.add_argument("--patterns", type=str, default="*.jpg,*.jpeg,*.png,*.tif,*.tiff,*.bmp", help="Comma-separated glob patterns per subdir.")
    ap.add_argument("--recursive", action="store_true", help="Recurse into nested subdirectories in batch mode.")

    # Common OCR knobs
    ap.add_argument("--dpi", type=int, default=300, help="DPI hint for Tesseract (default: 300)")
    ap.add_argument("--lang", default="eng", help="Tesseract language(s), e.g., 'eng' (default)")
    ap.add_argument("--oem", type=int, choices=[0, 1, 2, 3], default=1, help="Tesseract OCR Engine Mode (default: 1)")
    ap.add_argument("--psm", type=int, choices=list(range(0, 14)), default=6, help="Tesseract Page Segmentation Mode (default: 6)")
    ap.add_argument("--keep-intermediate", action="store_true", help="Keep per-page PDFs in a temp folder for inspection.")
    args = ap.parse_args()

    # Validate tesseract availability early
    try:
        _ = pytesseract.get_tesseract_version()
    except Exception as e:
        print("ERROR: Tesseract is not available. Please install it and ensure it's in PATH.", file=sys.stderr)
        print(str(e), file=sys.stderr)
        sys.exit(2)

    # ---------------- Single-PDF mode ----------------
    if args.batch_root is None:
        if not args.inputs or not args.output:
            print("In single-PDF mode, provide inputs and --output. For batch mode, use --batch-root and --out-dir.", file=sys.stderr)
            sys.exit(1)

        # Expand inputs
        input_paths: List[Path] = []
        for pattern in args.inputs:
            if any(ch in pattern for ch in "*?[]"):
                expanded = [Path(p) for p in sorted(map(str, Path().glob(pattern)))]
            else:
                expanded = [Path(pattern)]
            for p in expanded:
                if p.exists() and p.is_file():
                    input_paths.append(p)

        if not input_paths:
            print("No valid input files found.", file=sys.stderr)
            sys.exit(1)

        out_path = Path(args.output)
        out_path.parent.mkdir(parents=True, exist_ok=True)

        with TemporaryDirectory(prefix="ocr_pdf_") as tmpdir:
            tmpdir = Path(tmpdir)
            page_pdf_paths: List[Path] = []

            for idx, img_path in enumerate(input_paths, start=1):
                print(f"[{idx}/{len(input_paths)}] Processing {img_path} ...")
                try:
                    page_img = prepare_page(img_path)
                    pdf_bytes = image_to_ocr_pdf_bytes(
                        page_img, dpi=args.dpi, lang=args.lang, oem=args.oem, psm=args.psm
                    )
                    page_pdf = tmpdir / f"page_{idx:04d}.pdf"
                    with open(page_pdf, "wb") as f:
                        f.write(pdf_bytes)
                    page_pdf_paths.append(page_pdf)
                except Exception as e:
                    print(f"ERROR processing {img_path}: {e}", file=sys.stderr)

            if not page_pdf_paths:
                print("No pages were successfully processed; aborting.", file=sys.stderr)
                sys.exit(1)

            combine_pdfs(page_pdf_paths, out_path)
            print(f"✅ Wrote OCRed PDF: {out_path}")

            if args.keep_intermediate:
                keep_dir = out_path.with_suffix("")
                keep_dir = keep_dir.parent / (keep_dir.name + "_pages")
                keep_dir.mkdir(parents=True, exist_ok=True)
                for p in page_pdf_paths:
                    dest = keep_dir / p.name
                    dest.write_bytes(p.read_bytes())
                print(f"Kept per-page PDFs in: {keep_dir}")
        return

    # ---------------- Batch mode ----------------
    root = Path(args.batch_root)
    if not root.is_dir():
        print(f"--batch-root is not a directory: {root}", file=sys.stderr)
        sys.exit(1)

    out_dir = Path(args.out_dir) if args.out_dir else root / "ocr_pdfs"
    out_dir.mkdir(parents=True, exist_ok=True)

    patterns = [p.strip() for p in args.patterns.split(",") if p.strip()]
    targets = iter_target_dirs(root, recursive=args.recursive)

    if not targets:
        print("No subdirectories with images found under batch root.", file=sys.stderr)
        sys.exit(1)

    for d in targets:
        rel = d.relative_to(root)
        images = find_image_files(d, patterns)
        if not images:
            print(f"[skip] {rel} — no images matching {patterns}", file=sys.stderr)
            continue

        out_pdf = out_dir / (str(rel).replace(os.sep, "__") + ".pdf")
        out_pdf.parent.mkdir(parents=True, exist_ok=True)
        print(f"\n=== {rel} → {out_pdf.name} ({len(images)} pages) ===")

        # Replace path separators in the prefix; mkdtemp would fail on a prefix containing os.sep
        with TemporaryDirectory(prefix=f"ocr_{str(rel).replace(os.sep, '_')}_") as tmpdir:
            tmpdir = Path(tmpdir)
            page_pdf_paths: List[Path] = []

            for idx, img_path in enumerate(images, start=1):
                print(f"[{rel}] [{idx}/{len(images)}] {img_path.name}")
                try:
                    page_img = prepare_page(img_path)
                    pdf_bytes = image_to_ocr_pdf_bytes(
                        page_img, dpi=args.dpi, lang=args.lang, oem=args.oem, psm=args.psm
                    )
                    page_pdf = tmpdir / f"page_{idx:04d}.pdf"
                    with open(page_pdf, "wb") as f:
                        f.write(pdf_bytes)
                    page_pdf_paths.append(page_pdf)
                except Exception as e:
                    print(f"ERROR processing {img_path}: {e}", file=sys.stderr)

            if not page_pdf_paths:
                print(f"[skip] {rel} — no pages processed successfully.", file=sys.stderr)
                continue

            combine_pdfs(page_pdf_paths, out_pdf)
            print(f"✅ Wrote: {out_pdf}")

    print("\nBatch complete.")
    return


if __name__ == "__main__":
    main()
pyproject.toml (new file, 24 lines)
@@ -0,0 +1,24 @@
[project]
name = "ocr-pdf"
version = "0.1.0"
description = "Convert smartphone JPGs to auto-trimmed, deskewed OCRed PDF (English, typewritten)."
readme = "README.md"
requires-python = ">=3.9"
dependencies = [
    "opencv-python>=4.9",
    "numpy>=1.26",
    "pytesseract>=0.3.10",
    "pypdf>=5.0",
    "Pillow>=10.0",
]

[project.scripts]
ocr-pdf = "ocr_pdf:main"

[build-system]
requires = ["setuptools>=69"]
build-backend = "setuptools.build_meta"

[tool.uv]
# You can pin python version or set custom indexes here if needed.
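With the `ocr-pdf` entry point declared above, a typical local workflow with uv (assuming `uv` itself is installed; the commands echo the usage examples in ocr_pdf.py's docstring) looks roughly like:

```bash
# Resolve and install the dependencies into a local virtual environment
uv sync

# Single-PDF mode
uv run ocr-pdf -o out.pdf scans/*.jpg

# Batch mode: one PDF per subdirectory under ROOT_DIR
uv run ocr-pdf --batch-root ROOT_DIR --out-dir out_pdfs
```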