#!/usr/bin/env python3
"""
scrub_coa.py - COA scrubbing tool for Wingman Solutions

Removes identifying information from lab Certificates of Analysis:
  - Company name, address, license numbers
  - METRC / seed-to-sale numbers
  - All dates and times
  - Analysis method / analyzed date lines
  - QR codes, signature images, product photos

Usage:
    python3 scrub_coa.py <input.pdf> <LABEL>
    python3 scrub_coa.py <input.pdf> <LABEL> --dry-run   # prints findings, no output file

Output:
    coa/scrubbed/LABEL_MMDDYY.pdf
    coa/originals/LABEL_MMDDYY_orig.pdf
"""

import fitz  # PyMuPDF
import json
import re
import os
import sys
import shutil
from datetime import datetime
from pathlib import Path

# ── Directories ────────────────────────────────────────────────────────────────
# All paths are anchored to this script's own directory so the tool behaves the
# same regardless of the current working directory.
BASE_DIR      = Path(__file__).parent
TEMPLATES_DIR = BASE_DIR / "templates"   # per-lab layout fingerprints (JSON)
SCRUBBED_DIR  = BASE_DIR / "scrubbed"    # redacted output PDFs
ORIGINALS_DIR = BASE_DIR / "originals"   # untouched copies of the inputs

# Create the working directories up front; exist_ok makes reruns harmless.
for _dir in (TEMPLATES_DIR, SCRUBBED_DIR, ORIGINALS_DIR):
    _dir.mkdir(parents=True, exist_ok=True)

# ── Exact strings to redact (case-sensitive search) ───────────────────────────
# Each entry is searched independently on every page via page.search_for()
# (see redact_exact) and every hit is covered with a white box.
EXACT_STRINGS = [
    "Wingman Solutions LLC",
    "Wingman Solutions",
    "Wingman",
    "Romulus, NY, 14541, US",
    "Romulus, NY, 14541",
    "LIQUID DIAMOND",
    "FOR R&D",
    "XXX",   # street address placeholder
]

# ── Line-level labels: if a line STARTS WITH any of these, redact the whole line
# Prefix matching is case-insensitive (see redact_lines_by_label); the entire
# line from its own x0 to the right page edge is masked.
LINE_LABELS = [
    # ── Kaycha Labs ───────────────────────────────────────────────────────────
    "Analysis Method",
    "Analyzed Date",
    "Sampled Date",
    "Sample Collection Time",
    "Sampling End",
    "Sampling Method",
    "Sample Size",
    "Manifest #",
    "Batch #",
    "Lab ID",
    "Sample:",
    "Seed to sale",
    "Ordered:",
    "Sampled:",
    "Completed:",
    "Production Method",
    "Total Amount",
    "Retail Product Size",
    "Retail Serving Size",
    "Servings:",
    "Metrc Package #",
    "Metrc Source Package",
    "Laboratory License #",
    "Matrix:",
    "Classification:",
    "Type:",
    "License #",
    # ── Green Analytics NY ────────────────────────────────────────────────────
    # Section headers (all pages): "TestType: Sample Name (SampleID)"
    "Cannabinoids:",
    "Terpenes:",
    "Residual Solvents:",
    "Pesticides:",
    "Mycotoxins:",
    "Heavy Metals:",
    "Microbiology",
    # Test metadata lines (all pages)
    # "Test ID: #XXXXX | Date Tested: ..."  (single test)
    # "Test IDs: #X, #X, #X | Date"        (multiple tests, wraps across 3 lines)
    # "Tested: MM/DD/YYYY ..."              (continuation of multi-line Test IDs)
    "Test ID:",
    "Test IDs:",
    "Tested:",
    # Client/sample field labels (page 1 right column)
    "Date Reported:",
    "Client Name:",
    "Sampling Location:",
    "Contact Name:",
    "Contact Email:",
    "License Number:",
    "Medical/Adult Use:",
    "Sampling Date:",
    "METRC Test Tag ID:",
    "Sample ID:",
    "Sample Name:",
    "Sample Matrix:",
    "Package ID:",
    "Batch Lot ID:",
    "Batch Size:",
    "Serving Size (g):",
    "METRC Source ID:",
    "Sample Sub Type:",
]

# ── Word-level regex patterns (match individual whitespace-delimited tokens) ──
# Each token extracted by page.get_text("words") is tested against every
# pattern (full-token match via re.match anchored with ^...$).
WORD_PATTERNS = [
    re.compile(r"^\d{2}/\d{2}/\d{2,4}$"),                   # dates: MM/DD/YY or MM/DD/YYYY
    re.compile(r"^\d{1,2}/\d{1,2}/\d{4}$"),                  # dates: M/D/YYYY (e.g. 1/31/2026)
    re.compile(r"^\d{2}:\d{2}:\d{2}$"),                      # times: HH:MM:SS
    re.compile(r"^\d{2}:\d{2}$"),                            # times: HH:MM
    re.compile(r"^1A[A-F0-9]{22}$", re.IGNORECASE),          # METRC seed-to-sale tag
    re.compile(r"^OCM-PROC-\d{2}-\d{6}$"),                   # OCM processor license
    re.compile(r"^OCM-PT3B-\d{2}-\d{6}$"),                   # OCM tier-3B license
    re.compile(r"^OCM-[A-Z0-9]+-\d{2}-\d{5,6}$"),            # OCM license (general)
    re.compile(r"^[A-Z]{2}\d{8}-\d{3}$"),                    # Lab ID (e.g. AL60219004-001)
    re.compile(r"^OCM-CPL-\d{2}-\d{5,6}-[A-Z]\d+$"),         # Lab compliance license
    re.compile(r"^\d{8}-[A-Z]{2,6}-\d{3}$"),                 # GA sample ID: 20260122-FLXE-002
    re.compile(r"^\(\d{8}-[A-Z]{2,6}-\d{3}\)$"),             # parenthetical: (20260122-FLXE-002)
    re.compile(r"^\(\d{8}-$"),                                # wrapped open: (20260122-
    re.compile(r"^[A-Z]{2,6}-\d{3}\)$"),                     # wrapped close: FLXE-002)
    # GA package ID: EC-C1-EC-260122 style. The middle segment may itself
    # contain hyphens, so the char class must include "-" (without it the
    # documented example would never match).
    re.compile(r"^[A-Z]{2}-[A-Z0-9-]+-\d{6}$"),
    re.compile(r"^(?:AM|PM),?$"),                             # time suffix fragments from wrapped test ID lines
]

# ── Known lab identifiers ─────────────────────────────────────────────────────
# Maps internal lab key -> substrings searched (case-sensitively) in the
# page-1 text by detect_lab(). Lowercase variants catch lowercased
# occurrences such as URLs or email addresses in the COA footer.
KNOWN_LABS = {
    "kaycha": ["Kaycha Labs", "Kaycha"],
    "green_analytics": ["Green Analytics NY", "green analytics ny", "greenanalyticsllc"],
}


# ──────────────────────────────────────────────────────────────────────────────
# Redaction helpers
# ──────────────────────────────────────────────────────────────────────────────

def redact_exact(page, text):
    """Queue a white redaction box over every occurrence of *text* on *page*.

    Boxes are padded by 1pt on each side so glyph edges are fully covered.
    Returns the number of occurrences found.
    """
    matches = page.search_for(text, quads=False)
    for box in matches:
        padded = fitz.Rect(box.x0 - 1, box.y0 - 1, box.x1 + 1, box.y1 + 1)
        page.add_redact_annot(padded, fill=(1, 1, 1))
    return len(matches)


# Address/contact line patterns (matched against each full line's text).
# Compiled once at import time instead of on every call.
_ADDRESS_PATTERNS = [
    re.compile(r"\(\d{3}\)\s*\d{3}[-.\s]\d{4}"),             # phone: (NNN) NNN-NNNN
    re.compile(r"\d{3}[-.\s]\d{3}[-.\s]\d{4}"),               # phone: NNN-NNN-NNNN
    re.compile(r"\b\d{5}\b.*\bUS\b"),                          # zip + US (address line)
    re.compile(r"[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5}"),         # City, ST, 12345
    re.compile(r"\b\d+\s+[A-Z][a-z]+\s+"
               r"(Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|"
               r"Drive|Dr|Lane|Ln|Way|Court|Ct|Circle|Place|Pl)\b",
               re.IGNORECASE),                                  # N Street Type
]


def redact_lines_by_label(page):
    """
    Redact any line whose stripped text starts with a known label string,
    or that matches a phone-number / city-state-zip address pattern.

    The redaction rect extends from the line's own x0 to the page width --
    this avoids bleeding into adjacent left-column content that shares the
    same y-band (e.g. lab address vs product panel).

    Returns the number of lines redacted.
    """
    pw = page.rect.width
    count = 0

    # str.startswith accepts a tuple, so a single call tests every label;
    # both sides are lowercased for a case-insensitive prefix match.
    label_prefixes = tuple(label.lower() for label in LINE_LABELS)

    blocks = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)["blocks"]
    for block in blocks:
        if block.get("type") != 0:  # 0 = text block; skip image blocks
            continue
        for line in block.get("lines", []):
            line_text = "".join(span["text"] for span in line.get("spans", []))
            stripped = line_text.strip()
            bbox = line["bbox"]

            hit = stripped.lower().startswith(label_prefixes)
            if not hit:
                hit = any(pat.search(stripped) for pat in _ADDRESS_PATTERNS)

            if hit:
                # Start from line's own x0, not 0 -- prevents left-column bleed
                r = fitz.Rect(bbox[0] - 2, bbox[1] - 1, pw, bbox[3] + 2)
                page.add_redact_annot(r, fill=(1, 1, 1))
                count += 1

    return count


def redact_word_patterns(page):
    """Redact individual word tokens matching variable-format patterns (dates, METRC, etc.).

    Returns the number of tokens redacted.
    """
    hits = 0
    for entry in page.get_text("words"):
        x0, y0, x1, y1 = entry[0], entry[1], entry[2], entry[3]
        token = entry[4].strip()
        if any(pat.match(token) for pat in WORD_PATTERNS):
            rect = fitz.Rect(x0 - 1, y0 - 1, x1 + 1, y1 + 1)
            page.add_redact_annot(rect, fill=(1, 1, 1))
            hits += 1
    return hits


def redact_images_by_position(page, page_num, template=None):
    """
    Detect and mask embedded images using positional heuristics:
      - QR code: roughly square, small; top-right (Kaycha) or
        bottom-left (Green Analytics)
      - Signature: wider than tall, bottom portion, right half
      - Product photo: page 1 only, left half, mid-page, larger

    *template* is accepted for interface compatibility but is not currently
    consulted -- classification is purely heuristic.
    # TODO(review): wire stored template image positions into the comparison.

    Returns list of (classification, rect) for each masked image.
    """
    pw, ph = page.rect.width, page.rect.height
    masked = []

    for img in page.get_images(full=True):
        for rect in page.get_image_rects(img[0]):
            width = rect.x1 - rect.x0
            height = rect.y1 - rect.y0
            cx = (rect.x0 + rect.x1) / 2
            cy = (rect.y0 + rect.y1) / 2
            aspect = width / height if height > 0 else 1
            squarish = 0.65 < aspect < 1.55

            label = None
            if squarish and cy < ph * 0.15 and cx > pw * 0.5 and width < pw * 0.25:
                # Roughly square, top 15% of page, right half (Kaycha style)
                label = "qr_code"
            elif squarish and cy > ph * 0.80 and cx < pw * 0.25 and width < pw * 0.20:
                # Roughly square, bottom 20%, left quarter (Green Analytics style)
                label = "qr_code"
            elif aspect > 1.8 and cy > ph * 0.82 and cx > pw * 0.45:
                # Wider than tall, bottom 15-ish%, right half
                label = "signature"
            elif page_num == 0 and cx < pw * 0.45 and 0.10 < cy / ph < 0.65 and width > 40:
                # Page 1 only, left half, between 10-65% height, larger area
                label = "product_photo"

            if label is not None:
                padded = fitz.Rect(rect.x0 - 2, rect.y0 - 2, rect.x1 + 2, rect.y1 + 2)
                page.add_redact_annot(padded, fill=(1, 1, 1))
                masked.append((label, rect))

    return masked


def mask_green_analytics(page, page_num):
    """
    Supplemental coordinate-based masking for Green Analytics NY COAs.

    These PDFs are text-based; LINE_LABELS and WORD_PATTERNS handle most
    identifying content. This function only covers the areas that text
    patterns cannot reliably reach:

      All pages: report number in the bottom-right corner
                 (measured x=548-567, y=813 on an 841pt-high page).
      Page 1:    the client/sample info VALUE block -- raw values without
                 adjacent labels (client name, contact, sample name, matrix,
                 package IDs). Measured span y=130-265 in two columns at
                 x~134-265 and x~302-550; masked with margin as y=125-270.
      Page 5:    lab director signature image + name/title text
                 ("Matthew Elmes / Lab Director" at y~620-675).
    """
    pw, ph = page.rect.width, page.rect.height

    def cover(x0, y0, x1, y1):
        page.add_redact_annot(fitz.Rect(x0, y0, x1, y1), fill=(1, 1, 1))

    # Report number: bottom-right corner, every page.
    cover(pw - 80, ph - 36, pw, ph - 20)

    if page_num == 0:
        # Full-width mask over the client/sample info value block, starting
        # just below the "Sample Result: PASS" banner and ending before the
        # test status legend.
        cover(0, 125, pw, 270)
    elif page_num == 4:
        # Signature image sits above the name text (name at y~656-672);
        # start at y=600 to cover any image above it.
        cover(0, 600, 300, 680)


def mask_header_info_block(page, lab, page_num):
    """
    Coordinate-based masks for lab-specific layout blocks that sit in the
    same position on every COA from that lab.

    Green Analytics NY is delegated to mask_green_analytics().

    Kaycha Labs:
      - Product info panel (right column header, all pages): "Kaycha Labs"
        label + LIQUID DIAMOND + Matrix / Classification / Type.
        Measured positions: x 335-555, y 25-95.
      - FOR R&D + TESTED banner (page 1 only): y 122-156, full width.
        Removes the large "FOR R&D" (red) and "TESTED" (green) banner row.
      - Sample info box outline (page 1): gray stroked rect (251,154)-(552,258),
        masked with a slightly larger white rect.

    Add entries for other labs as they are encountered.
    """
    if lab == "green_analytics":
        mask_green_analytics(page, page_num)
        return
    if lab != "kaycha":
        return

    pw = page.rect.width

    # Product info panel: right column header, present on every page.
    page.add_redact_annot(fitz.Rect(335, 25, 560, 95), fill=(1, 1, 1))

    if page_num == 0:
        # FOR R&D + TESTED banner row; also covers the top edge of the sample
        # info box (y=154) and the gray separator line at y=125.
        page.add_redact_annot(fitz.Rect(0, 122, pw, 156), fill=(1, 1, 1))
        # Sample info box edges.
        page.add_redact_annot(fitz.Rect(245, 150, 558, 262), fill=(1, 1, 1))


# ──────────────────────────────────────────────────────────────────────────────
# Lab detection & template management
# ──────────────────────────────────────────────────────────────────────────────

def detect_lab(doc):
    """
    Identify the lab by scanning page-1 text for known lab name strings.
    Falls back to structural detection for image-only PDFs.

    Returns a KNOWN_LABS key, or "unknown".
    """
    first = doc[0]
    page_text = first.get_text("text")

    # Text-based detection (works for text/hybrid PDFs like Kaycha).
    for lab_key, identifiers in KNOWN_LABS.items():
        if any(ident in page_text for ident in identifiers):
            return lab_key

    # Structural fallback for image-only PDFs.
    # Green Analytics NY: A4 pages (height ~841.89 pt) carrying a single
    # full-page-width image each.
    pw, ph = first.rect.width, first.rect.height
    if 838 < ph < 845:
        imgs = first.get_images(full=True)
        if len(imgs) == 1:
            rects = first.get_image_rects(imgs[0][0])
            if rects and (rects[0].x1 - rects[0].x0) > pw * 0.95:
                return "green_analytics"

    return "unknown"


def make_fingerprint(doc):
    """Build a structural fingerprint of the document for template comparison.

    Captures: detected lab, page-1 image count/positions (normalized to page
    size), and which test-section names appear anywhere in the document text.
    """
    first = doc[0]
    pw, ph = first.rect.width, first.rect.height

    images = [
        {
            "cx": round((rect.x0 + rect.x1) / 2 / pw, 3),
            "cy": round((rect.y0 + rect.y1) / 2 / ph, 3),
            "aspect": round((rect.x1 - rect.x0) / max(rect.y1 - rect.y0, 1), 2),
        }
        for img in first.get_images(full=True)
        for rect in first.get_image_rects(img[0])
    ]

    full_text = " ".join(doc[i].get_text("text") for i in range(len(doc)))
    section_names = ["Cannabinoid", "Terpenes", "Pesticide", "Residual Solvents",
                     "Microbial", "Mycotoxins", "Heavy Metals"]
    sections = [sec for sec in section_names if sec in full_text]

    return {
        "lab": detect_lab(doc),
        "page_1_image_count": len(images),
        "page_1_images": images,
        "sections_present": sections,
    }


def check_template(doc, lab):
    """
    Compare doc against the stored fingerprint template for this lab.

    First encounter of a lab: saves the fingerprint as the template and
    returns (True, []) -- auto-trust.
    Subsequent runs: compares page-1 image count and positions (10%
    tolerance) and the set of test sections present.

    Returns (match: bool, diff_notes: list of str).
    """
    tpath = TEMPLATES_DIR / f"{lab}.json"
    fp = make_fingerprint(doc)

    if not tpath.exists():
        with open(tpath, "w") as f:
            json.dump(fp, f, indent=2)
        return True, []  # first time -- auto-trust

    with open(tpath) as f:
        stored = json.load(f)

    diffs = []

    # Use .get with defaults throughout (the original mixed direct indexing
    # with .get) so an older or hand-edited template file with missing keys
    # produces a diff note instead of a KeyError.
    stored_count = stored.get("page_1_image_count", 0)
    if fp["page_1_image_count"] != stored_count:
        diffs.append(
            f"Page 1 image count changed: was {stored_count}, now {fp['page_1_image_count']}"
        )

    # Check image positions (10% tolerance -- coordinate shift > 10% flags review).
    # zip() truncates to the shorter list; a count mismatch is reported above.
    for i, (cur, ref) in enumerate(zip(fp["page_1_images"], stored.get("page_1_images", []))):
        dx = abs(cur["cx"] - ref["cx"])
        dy = abs(cur["cy"] - ref["cy"])
        if dx > 0.10 or dy > 0.10:
            diffs.append(
                f"Image {i+1} position shifted from ({ref['cx']},{ref['cy']}) "
                f"to ({cur['cx']},{cur['cy']})"
            )

    # New sections appearing on this COA but absent from the template.
    # Sorted so the diff message is deterministic across runs.
    new_secs = set(fp["sections_present"]) - set(stored.get("sections_present", []))
    if new_secs:
        diffs.append(f"New sections on this COA: {', '.join(sorted(new_secs))}")

    return len(diffs) == 0, diffs


# ──────────────────────────────────────────────────────────────────────────────
# Main scrub function
# ──────────────────────────────────────────────────────────────────────────────

def scrub_pdf(input_path: str, label: str, dry_run: bool = False):
    """
    Scrub a COA PDF and return a result dict with output path + diagnostics.

    With dry_run=True only lab detection and the template check are run;
    no redactions are applied and no files are written.

    NOTE(review): the module docstring advertises "LABEL_MMDDYY.pdf" but the
    filename built here has no underscore (LABELMMDDYY.pdf) -- confirm which
    form downstream consumers expect before changing either.
    """
    date_str = datetime.now().strftime("%m%d%y")
    safe_label = re.sub(r"[^A-Z0-9]", "", label.upper())
    filename = f"{safe_label}{date_str}.pdf"

    doc = fitz.open(input_path)
    lab = detect_lab(doc)
    template_match, template_diffs = check_template(doc, lab)

    stats = {
        "lab": lab,
        "pages": len(doc),
        "template_match": template_match,
        "template_diffs": template_diffs,
        "redactions": {"exact": 0, "line": 0, "word": 0, "image": 0},
        "output": None,
    }

    if dry_run:
        doc.close()
        return stats

    tally = stats["redactions"]
    for page_num, page in enumerate(doc):
        for s in EXACT_STRINGS:
            tally["exact"] += redact_exact(page, s)
        tally["line"] += redact_lines_by_label(page)
        tally["word"] += redact_word_patterns(page)
        tally["image"] += len(redact_images_by_position(page, page_num))
        mask_header_info_block(page, lab, page_num)
        # Apply every redaction queued on this page in one pass.
        page.apply_redactions(
            images=fitz.PDF_REDACT_IMAGE_PIXELS,
            graphics=fitz.PDF_REDACT_LINE_ART_REMOVE_IF_COVERED,
        )

    # Preserve an untouched copy of the input next to the scrubbed output.
    orig_dest = ORIGINALS_DIR / filename.replace(".pdf", "_orig.pdf")
    shutil.copy2(input_path, orig_dest)

    out_path = SCRUBBED_DIR / filename
    doc.save(str(out_path), garbage=4, deflate=True)
    doc.close()

    stats["output"] = str(out_path)
    stats["original"] = str(orig_dest)
    return stats


# ──────────────────────────────────────────────────────────────────────────────
# CLI entry point
# ──────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Minimal argv handling: any --flag is consumed, the rest are positional.
    dry_run = "--dry-run" in sys.argv
    positional = [arg for arg in sys.argv[1:] if not arg.startswith("--")]

    if len(positional) < 2:
        print("Usage: python3 scrub_coa.py <input.pdf> <LABEL> [--dry-run]")
        sys.exit(1)

    input_pdf, label = positional[0], positional[1]

    if not os.path.exists(input_pdf):
        print(f"Error: file not found: {input_pdf}")
        sys.exit(1)

    result = scrub_pdf(input_pdf, label, dry_run=dry_run)

    print(f"Lab detected:     {result['lab']}")
    print(f"Pages:            {result['pages']}")
    print(f"Template match:   {result['template_match']}")
    if result["template_diffs"]:
        print("Template diffs:")
        for note in result["template_diffs"]:
            print(f"  - {note}")

    if dry_run:
        print("(dry run -- no output written)")
    else:
        r = result["redactions"]
        print(f"Redactions:       {r['exact']} exact  {r['line']} lines  {r['word']} words  {r['image']} images")
        print(f"Output:           {result['output']}")
        print(f"Original saved:   {result['original']}")
