"""
convert_to_pdf.py — JPEG/PNG → Searchable PDF (OCR)

Usage:
    python3 convert_to_pdf.py [input_dir] [output.pdf] [--lang chi_tra+eng]

Requires:
    pip install Pillow pytesseract reportlab
    sudo apt install tesseract-ocr tesseract-ocr-chi-tra
"""

import sys
import os
import argparse
from PIL import Image
import pytesseract
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
import io

# ── CLI ───────────────────────────────────────────────────────────────────────
# Two optional positionals plus tuning flags; defaults target the original
# scan directory this script was written for.
parser = argparse.ArgumentParser(description="Convert images to searchable PDF via OCR")
parser.add_argument("input_dir", nargs="?", default="jpeg/懷孕指南2",
                    help="Directory containing JPEG/PNG images")
parser.add_argument("output_pdf", nargs="?", default=None,
                    help="Output PDF path (default: <input_dir_name>.pdf)")
parser.add_argument("--lang", default="chi_tra+eng",
                    help="Tesseract language(s), e.g. chi_tra+eng (default: chi_tra+eng)")
parser.add_argument("--dpi", type=int, default=150,
                    help="Image DPI for PDF page size calculation (default: 150)")
parser.add_argument("--ocr-scale", type=float, default=0.5,
                    help="Scale factor for OCR image to speed up processing (default: 0.5)")
parser.add_argument("--max-pages", type=int, default=None,
                    help="Process only first N pages (for testing)")
args = parser.parse_args()

# Unpack into module-level settings consumed by the rest of the script.
input_dir = args.input_dir
if args.output_pdf:
    output_pdf = args.output_pdf
else:
    # Derive "<dirname>.pdf" from the input directory (trailing slash stripped
    # so "foo/" still yields "foo.pdf").
    output_pdf = os.path.basename(input_dir.rstrip("/")) + ".pdf"
lang = args.lang
dpi = args.dpi
ocr_scale = args.ocr_scale
max_pages = args.max_pages

# ── Collect images ────────────────────────────────────────────────────────────
# Page order comes from the lexicographic sort of the filenames.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')
image_files = sorted(
    name for name in os.listdir(input_dir)
    if name.lower().endswith(IMAGE_EXTENSIONS)
)

if not image_files:
    print("No image files found.")
    sys.exit(1)

print(f"Found {len(image_files)} images in '{input_dir}'")
print(f"OCR language: {lang}")
print(f"Output: {output_pdf}")
print()

# Optional cap for quick test runs (applied after the count is reported).
if max_pages:
    image_files = image_files[:max_pages]
    print(f"(limited to first {max_pages} pages)")

# ── Build searchable PDF ──────────────────────────────────────────────────────
# Local imports for the Unicode font registration below (kept with this
# section so the change is self-contained in this script).
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.cidfonts import UnicodeCIDFont

# BUGFIX: Helvetica is Latin-1 only, so CJK words cannot be encoded into the
# invisible text layer — which defeats the purpose of OCR-ing chi_tra pages.
# Register ReportLab's built-in CID font for Traditional Chinese; fall back to
# Helvetica (Latin-only searchability) if the CID data is unavailable.
try:
    pdfmetrics.registerFont(UnicodeCIDFont("MSung-Light"))
    text_font = "MSung-Light"
except Exception:
    text_font = "Helvetica"

c = canvas.Canvas(output_pdf)

total = len(image_files)
for idx, filename in enumerate(image_files, 1):
    path = os.path.join(input_dir, filename)
    img = Image.open(path).convert("RGB")
    width_px, height_px = img.size

    # Points (1 pt = 1/72 inch).  Assume `dpi` for the px→pt conversion so each
    # PDF page keeps the physical size of the scanned original.
    pt_w = width_px * 72.0 / dpi
    pt_h = height_px * 72.0 / dpi

    c.setPageSize((pt_w, pt_h))

    # Draw the scan as a full-page background image.
    img_reader = ImageReader(img)
    c.drawImage(img_reader, 0, 0, width=pt_w, height=pt_h)

    # OCR — word level.
    # BUGFIX: report the actual file being processed, not the literal
    # "(unknown)" that the original f-string printed.
    print(f"[{idx}/{total}] OCR: {filename} ...", flush=True)
    try:
        # Optionally downscale the image for faster OCR; bounding boxes are
        # scaled back up to original-image pixels afterwards.  max(1, ...)
        # guards against a zero-dimension resize on very small images.
        if ocr_scale != 1.0:
            ocr_img = img.resize(
                (max(1, int(width_px * ocr_scale)),
                 max(1, int(height_px * ocr_scale))),
                Image.LANCZOS
            )
            scale_factor = 1.0 / ocr_scale
        else:
            ocr_img = img
            scale_factor = 1.0
        data = pytesseract.image_to_data(
            ocr_img, lang=lang, output_type=pytesseract.Output.DICT
        )
    except Exception as e:
        # Best-effort: keep the image-only page even when OCR fails.
        print(f"  WARNING: OCR failed ({e}), skipping text layer for this page")
        c.showPage()
        continue

    n = len(data["text"])
    words_placed = 0
    for i in range(n):
        word = data["text"][i]
        # BUGFIX: depending on the tesseract/pytesseract version, conf may be
        # an int, an int string, or a float string such as "96.0" — int(...)
        # raises ValueError on the latter.  Parse via float and skip entries
        # whose confidence cannot be read at all.
        try:
            conf = float(data["conf"][i])
        except (TypeError, ValueError):
            continue
        if not word.strip() or conf < 30:
            continue

        # Tesseract bbox (pixels, top-left origin).
        x_px = data["left"][i]
        y_px = data["top"][i]
        w_px = data["width"][i]
        h_px = data["height"][i]

        if w_px <= 0 or h_px <= 0:
            continue

        # Scale back to original image coordinates if OCR ran on a
        # downscaled copy.
        x_px = x_px * scale_factor
        y_px = y_px * scale_factor
        w_px = w_px * scale_factor
        h_px = h_px * scale_factor

        # Convert to PDF points.  ReportLab's origin is bottom-left while
        # tesseract's y=0 is the top of the image, hence the pt_h flip.
        x_pt = x_px * 72.0 / dpi
        y_pt = pt_h - (y_px + h_px) * 72.0 / dpi
        # Font size tracks the word's pixel height (floor of 1pt).
        font_size = max(h_px * 72.0 / dpi, 1)

        # Invisible text layer: fully transparent fill, so words are
        # selectable/searchable without being visible over the scan.
        c.setFillColorRGB(1, 1, 1, alpha=0)  # fully transparent
        try:
            c.setFont(text_font, font_size)
        except Exception:
            c.setFont(text_font, 6)

        # Skip words the active font cannot encode (only relevant on the
        # Helvetica fallback path) instead of aborting the whole run.
        try:
            c.drawString(x_pt, y_pt, word)
        except Exception:
            continue
        words_placed += 1

    print(f"  → {words_placed} words embedded")
    c.showPage()

c.save()
print(f"\n✅ Done! Searchable PDF saved: {output_pdf} ({len(image_files)} pages)")
