#!/usr/bin/env python3
"""
Split customer statement PDF by detecting customer account numbers.
Detects account numbers from the text in the top-right region of each page
(text layer first, OCR fallback for scanned pages) and groups consecutive
pages with the same account.
"""
import sys
import json
import os
import re

try:
    import fitz  # PyMuPDF
except Exception as e:
    print(json.dumps({
        "ok": False,
        "error": "PyMuPDF is not available. Install with: pip3 install pymupdf",
        "details": str(e)
    }))
    sys.exit(1)

try:
    import pytesseract
    from PIL import Image
    import io
except Exception as e:
    print(json.dumps({
        "ok": False,
        "error": "OCR libraries not available. Install with: pip3 install pytesseract pillow",
        "details": str(e)
    }))
    sys.exit(1)


def extract_text_from_page(page):
    """
    Return the text found in the top-right corner of ``page``.

    Only the right-hand 60% of the width and the top 40% of the height are
    examined, so text elsewhere on the page is never returned.

    Strategy:
      * read the embedded text layer clipped to that region;
      * if that yields fewer than 4 non-whitespace-trimmed characters,
        render the region at 200 DPI and run Tesseract OCR on it.

    If OCR fails for any reason, whatever the text layer produced (possibly
    an empty string) is returned instead.
    """
    bounds = page.rect
    left_edge = bounds.x0 + bounds.width * 0.40
    bottom_edge = bounds.y0 + bounds.height * 0.40
    region = fitz.Rect(left_edge, bounds.y0, bounds.x1, bottom_edge)

    # Pass 1: embedded text layer (best effort - any failure means "no text").
    try:
        layer_text = page.get_text("text", clip=region) or ""
    except Exception:
        layer_text = ""

    if len(layer_text.strip()) >= 4:
        return layer_text

    # Pass 2: OCR of the same region, for scanned pages.
    try:
        zoom = 200 / 72  # PDF units are 1/72 inch; render at 200 DPI
        pixmap = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=region)
        png_stream = io.BytesIO(pixmap.tobytes("png"))
        image = Image.open(png_stream)
        ocr_text = pytesseract.image_to_string(image, lang="eng", config="--psm 6")
    except Exception:
        return layer_text

    return ocr_text or ""


# Words that are labels/headers, never account codes.
_ACCOUNT_JUNK_WORDS = frozenset({
    'NO', 'NUMBER', 'NAME', 'REF', 'CODE', 'DATE', 'PAGE', 'PAGES',
    'STATEMENT', 'ACCOUNT', 'CUSTOMER', 'INVOICE', 'TOTAL', 'BALANCE',
    'VAT', 'TEL', 'FAX', 'EMAIL', 'LTD', 'LIMITED', 'TO', 'FROM', 'OF',
})

# Labelled form: "Account Ref <newline> ALS", "A/C No. X" etc.
# ('&' is allowed inside the code for values like "D&B").
_LABELLED_ACCOUNT_RE = re.compile(
    r'(?:ACCOUNT|ACC|A/C|CUSTOMER|CUST)\s*(?:NO|NUMBER|REF|CODE)?\s*[:.\-]?\s+'
    r'([A-Z0-9][A-Z0-9&\-]{0,14})',
    re.IGNORECASE,
)

# Bare code occupying a whole line: "MCLT", "ABC-12345", "ABC 123".
_BARE_ACCOUNT_RE = re.compile(r'^([A-Z]{1,6}(?:\s*-?\s*\d{1,6})?)$', re.IGNORECASE)

# Two short numbers joined by '/', '.' or '-' - treated as part of a date.
_DATE_FRAGMENT_RE = re.compile(r'\d{1,2}\s*[/.\-]\s*\d{1,2}')


def extract_account_number(text):
    """
    Extract a customer account code from page text.

    Two heuristics are tried in order:

    1. A label (ACCOUNT / ACC / A/C / CUSTOMER / CUST, optionally followed by
       NO / NUMBER / REF / CODE and one of ':', '.', '-') followed by
       whitespace and then a code of up to 15 characters.
    2. Within the first 15 lines, a line consisting solely of a short code
       (1-6 letters, optionally followed by up to 6 digits).

    Candidates that are known label words or look like date fragments are
    rejected.

    Args:
        text: Text extracted from a page; may be empty or None.

    Returns:
        The upper-cased account code, or None when nothing plausible is found.
    """
    if not text:
        return None

    # 1. Labelled patterns.
    # A manual search loop is used instead of finditer() so that a rejected
    # candidate can itself be re-read as a label: in "Customer Account No: X"
    # the first hit is CUSTOMER -> "ACCOUNT" (junk); resuming at the start of
    # "Account" lets the real "Account No: X" match be found.
    pos = 0
    while True:
        m = _LABELLED_ACCOUNT_RE.search(text, pos)
        if m is None:
            break
        cand = m.group(1).strip().upper().rstrip('.,:')
        if (cand and cand not in _ACCOUNT_JUNK_WORDS
                and not _DATE_FRAGMENT_RE.search(cand)):
            return cand
        # The candidate always starts after the label, so this advances.
        pos = m.start(1)

    # 2. Generic per-line fallback (top of region only), skipping label/junk
    #    lines and anything containing a date fragment.
    for line in text.splitlines()[:15]:
        line = line.strip()
        if not line or _DATE_FRAGMENT_RE.search(line):
            continue
        first_word = re.split(r'[\s:]+', line)[0].upper().rstrip('.,:')
        if first_word in _ACCOUNT_JUNK_WORDS:
            continue
        m = _BARE_ACCOUNT_RE.match(line)
        if m:
            cand = m.group(1).strip().upper()
            if cand not in _ACCOUNT_JUNK_WORDS:
                return cand

    return None


# D/M/Y with a 2- or 4-digit year. The digit guards on both ends stop the
# pattern from matching inside a longer number (e.g. the tail of 2026/06/11).
_STATEMENT_DATE_RE = re.compile(
    r'(?<!\d)(\d{1,2})\s*/\s*(\d{1,2})\s*/\s*(\d{4}|\d{2})(?!\d)'
)


def extract_statement_date(text):
    """
    Find the first valid day/month/year date in ``text``.

    Dates are read as D/M/Y separated by '/' (e.g. 11/06/2026). Years below
    100 are taken to be 20xx. Matches that are not real calendar dates
    (e.g. 31/02/2026 or 99/99/99) are skipped and the search continues with
    the next match rather than giving up.

    Args:
        text: Text extracted from a page; may be empty or None.

    Returns:
        The date formatted as YYYY-MM-DD, or None if no valid date is found.
    """
    from datetime import date  # local import: only this function needs it

    if not text:
        return None

    for m in _STATEMENT_DATE_RE.finditer(text):
        day, month, year = (int(part) for part in m.groups())
        if year < 100:
            year += 2000
        try:
            # date() validates month range and days-in-month for us.
            return date(year, month, day).isoformat()
        except ValueError:
            continue

    return None


def _group_pages_by_account(page_accounts):
    """
    Turn per-page detection results into consecutive page groups.

    Rules:
    - page with the SAME account as the current group -> continuation
    - page with NO detected account -> continuation of the current
      statement (continuation pages often lack the account header)
    - page with a DIFFERENT account -> new statement

    Safety net: if no account was detected on ANY page, one group per page
    is returned so each page can be identified manually in review.

    Returns a list of dicts with 'account', 'start_page' and 'end_page'
    (0-based, inclusive).
    """
    if not any(info['account'] for info in page_accounts):
        return [
            {'account': None, 'start_page': info['page_num'], 'end_page': info['page_num']}
            for info in page_accounts
        ]

    groups = []
    current_group = None
    for info in page_accounts:
        account = info['account']
        if current_group is not None and (
            account is None or account == current_group['account']
        ):
            current_group['end_page'] = info['page_num']
            continue
        if current_group is not None:
            groups.append(current_group)
        current_group = {
            'account': account,
            'start_page': info['page_num'],
            'end_page': info['page_num'],
        }

    if current_group is not None:
        groups.append(current_group)
    return groups


def _write_statement(doc, group, idx, page_accounts, output_dir):
    """Save one page group as its own PDF and return its summary record."""
    account = group['account'] or f"UNKNOWN_{idx}"
    start = group['start_page']
    end = group['end_page']
    page_range = f"{start + 1}-{end + 1}"  # 1-based for humans

    # Statement date: first date found within the group's pages.
    statement_date = next(
        (
            page_accounts[p]['statement_date']
            for p in range(start, end + 1)
            if page_accounts[p].get('statement_date')
        ),
        None,
    )

    # Sanitize account name for filename.
    safe_account = re.sub(r'[^\w\-\.]', '_', str(account))
    filename = f"statement_{safe_account}_pages_{page_range}.pdf"
    output_path = os.path.join(output_dir, filename)

    new_doc = fitz.open()
    try:
        new_doc.insert_pdf(doc, from_page=start, to_page=end)
        new_doc.save(output_path)
    finally:
        # Release the new document even if insert/save fails.
        new_doc.close()

    return {
        'account': str(account),
        'statement_date': statement_date,
        'page_count': end - start + 1,
        'page_range': page_range,
        'filename': filename,
        'path': output_path,
    }


def main():
    """
    Command-line entry point: split ``input.pdf`` into per-account PDFs.

    Usage: split_statement_pdf.py input.pdf output_dir

    Exactly one JSON object is printed to stdout. On success it has
    "ok": true plus "statement_count", "page_count", "statements" and
    "page_details". On failure it has "ok": false and an "error" message
    (with "details" where available) and the process exits with status 1.
    """
    if len(sys.argv) < 3:
        print(json.dumps({
            "ok": False,
            "error": "Usage: split_statement_pdf.py input.pdf output_dir"
        }))
        sys.exit(1)

    input_pdf = sys.argv[1]
    output_dir = sys.argv[2]

    if not os.path.isfile(input_pdf):
        print(json.dumps({
            "ok": False,
            "error": f"Input PDF not found: {input_pdf}"
        }))
        sys.exit(1)

    try:
        # Inside the try so a failure here is reported as JSON like any other.
        os.makedirs(output_dir, exist_ok=True)

        doc = fitz.open(input_pdf)
        try:
            total_pages = len(doc)

            if total_pages == 0:
                print(json.dumps({
                    "ok": False,
                    "error": "PDF has no pages"
                }))
                # SystemExit is not an Exception subclass, so it passes the
                # handler below; the finally clause still closes the document.
                sys.exit(1)

            # Extract account number + statement date from each page.
            page_accounts = []
            for page_num in range(total_pages):
                text = extract_text_from_page(doc[page_num])
                page_accounts.append({
                    'page_num': page_num,
                    'account': extract_account_number(text),
                    'statement_date': extract_statement_date(text),
                    'text_sample': text[:200] if text else ''
                })

            groups = _group_pages_by_account(page_accounts)

            # Create a PDF for each group.
            statements = [
                _write_statement(doc, group, idx, page_accounts, output_dir)
                for idx, group in enumerate(groups)
            ]
        finally:
            # Always release the source document, including on failure.
            doc.close()

        print(json.dumps({
            "ok": True,
            "statement_count": len(statements),
            "page_count": total_pages,
            "statements": statements,
            "page_details": page_accounts
        }))

    except Exception as e:
        print(json.dumps({
            "ok": False,
            "error": "Failed to split PDF",
            "details": str(e)
        }))
        sys.exit(1)


# Run the splitter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
