#!/usr/bin/env python3
"""
Split a customer statement PDF into one PDF per customer account.

For every page the top-right region is read (embedded text layer first,
OCR as a fallback for scanned pages), an account code and a statement date
are extracted from it, and consecutive pages belonging to the same account
are written out as a separate PDF.

Usage: split_statement_pdf.py input.pdf output_dir

The script always reports its result as a single JSON object on stdout
({"ok": true, ...} on success, {"ok": false, "error": ...} on failure) and
exits with status 1 on failure.
"""

import datetime
import io
import json
import os
import re
import sys

# Third-party dependencies are imported defensively: a failure is recorded
# here and reported (as JSON, exit status 1) as the very first thing main()
# does.  Importing this module for its pure helpers therefore never
# terminates the interpreter.
_DEPENDENCY_ERROR = None

try:
    import fitz  # PyMuPDF
except Exception as e:
    fitz = None
    _DEPENDENCY_ERROR = {
        "ok": False,
        "error": "PyMuPDF is not available. Install with: pip3 install pymupdf",
        "details": str(e),
    }

try:
    import pytesseract
    from PIL import Image
except Exception as e:
    pytesseract = None
    Image = None
    # A PyMuPDF failure takes precedence, so only record the first error.
    if _DEPENDENCY_ERROR is None:
        _DEPENDENCY_ERROR = {
            "ok": False,
            "error": "OCR libraries not available. Install with: pip3 install pytesseract pillow",
            "details": str(e),
        }


# Words that are labels/headers, never account codes.
_JUNK_WORDS = frozenset({
    'NO', 'NUMBER', 'NAME', 'REF', 'CODE', 'DATE', 'PAGE', 'PAGES',
    'STATEMENT', 'ACCOUNT', 'CUSTOMER', 'INVOICE', 'TOTAL', 'BALANCE',
    'VAT', 'TEL', 'FAX', 'EMAIL', 'LTD', 'LIMITED', 'TO', 'FROM', 'OF',
})

# Two numbers separated by "/", "." or "-" - treated as (part of) a date.
_DATELIKE_RE = re.compile(r'\d{1,2}\s*[/.\-]\s*\d{1,2}')

# Labelled patterns: "Account Ref ALS", "A/C No. X" etc.
# ("&" is allowed inside the code for names like "D&B".)
_LABELLED_ACCOUNT_RE = re.compile(
    r'(?:ACCOUNT|ACC|A/C|CUSTOMER|CUST)\s*(?:NO|NUMBER|REF|CODE)?\s*[:.\-]?\s+([A-Z0-9][A-Z0-9&\-]{0,14})',
    re.IGNORECASE,
)

# A whole line that is just a short code: "MCLT", "ABC-12345", "ABC 123".
_BARE_CODE_RE = re.compile(r'^([A-Z]{1,6}(?:\s*-?\s*\d{1,6})?)$', re.IGNORECASE)

# day / month / year with a 2-4 digit year.
_STATEMENT_DATE_RE = re.compile(r'(\d{1,2})\s*/\s*(\d{1,2})\s*/\s*(\d{2,4})')

# How many lines of the region the unlabelled fallback inspects.
_FALLBACK_LINE_LIMIT = 15


def extract_text_from_page(page):
    """
    Return the text found in the TOP-RIGHT region of ``page``.

    Only the region covering the right 60% of the width and the top 40% of
    the height is read - that is where the customer account sits on a
    statement.  Restricting to the region stops constant header text
    (company details etc.) matching on every page.

    1. Try the built-in text layer for that region (fast, exact).
    2. If the region yields fewer than 4 non-blank characters (scanned
       statement), OCR the same region rendered at 200 dpi.

    Extraction is best-effort: if OCR fails, whatever the text layer
    produced (possibly an empty string) is returned instead of raising.
    """
    rect = page.rect
    top_right = fitz.Rect(
        rect.x0 + rect.width * 0.40,
        rect.y0,
        rect.x1,
        rect.y0 + rect.height * 0.40,
    )

    try:
        text = page.get_text("text", clip=top_right) or ""
    except Exception:
        text = ""

    if len(text.strip()) >= 4:
        return text

    # OCR fallback for scanned pages (same top-right region).
    try:
        pix = page.get_pixmap(matrix=fitz.Matrix(200 / 72, 200 / 72), clip=top_right)
        img = Image.open(io.BytesIO(pix.tobytes("png")))
        return pytesseract.image_to_string(img, lang="eng", config="--psm 6") or ""
    except Exception:
        return text


def _is_datelike(s):
    """Return True if ``s`` contains something shaped like a date."""
    return bool(_DATELIKE_RE.search(s))


def extract_account_number(text):
    """
    Extract the customer account code from page text.

    Two strategies are tried in order:

    1. Labelled: the token following "Account", "Acc", "A/C", "Customer" or
       "Cust" (optionally followed by "No", "Number", "Ref" or "Code" and a
       ":", "." or "-" separator).
    2. Unlabelled fallback: the first of the top 15 lines that consists
       solely of a short code such as "MCLT", "ABC-12345" or "ABC 123".

    Label/header words and date-like values are never accepted.

    Returns the code upper-cased, or None when nothing suitable is found.
    """
    if not text:
        return None

    # 1. Labelled patterns.
    for m in _LABELLED_ACCOUNT_RE.finditer(text):
        cand = m.group(1).upper()
        if cand not in _JUNK_WORDS and not _is_datelike(cand):
            return cand

    # 2. Generic per-line fallback (top of region), skipping label/junk
    #    lines and dates.
    for line in text.splitlines()[:_FALLBACK_LINE_LIMIT]:
        line = line.strip()
        if not line or _is_datelike(line):
            continue
        first_word = re.split(r'[\s:]+', line)[0].upper().rstrip('.,:')
        if first_word in _JUNK_WORDS:
            continue
        m = _BARE_CODE_RE.match(line)
        if m:
            cand = m.group(1).strip().upper()
            if cand not in _JUNK_WORDS:
                return cand

    return None


def extract_statement_date(text):
    """
    Find the statement date (e.g. 11/06/2026) and return it as YYYY-MM-DD.

    Dates are read as day/month/year; a year below 100 is taken to be in
    the 2000s.  The first candidate that is a real calendar date wins, so
    an impossible value such as 31/02/2026 or 45/13/2020 is skipped rather
    than returned or allowed to hide a valid date later in the text.

    Returns None when ``text`` is empty or contains no valid date.
    """
    if not text:
        return None

    for m in _STATEMENT_DATE_RE.finditer(text):
        day, month, year = (int(part) for part in m.groups())
        if year < 100:
            year += 2000
        try:
            return datetime.date(year, month, day).isoformat()
        except ValueError:
            # Not a real calendar date - keep looking.
            continue

    return None


def _fail(error, details=None):
    """Print a JSON failure object to stdout and exit with status 1."""
    payload = {"ok": False, "error": error}
    if details is not None:
        payload["details"] = details
    print(json.dumps(payload))
    sys.exit(1)


def _scan_pages(doc):
    """Return one dict per page with its account, statement date and a text sample."""
    page_accounts = []
    for page_num, page in enumerate(doc):
        text = extract_text_from_page(page)
        page_accounts.append({
            'page_num': page_num,
            'account': extract_account_number(text),
            'statement_date': extract_statement_date(text),
            'text_sample': text[:200] if text else '',
        })
    return page_accounts


def _group_pages(page_accounts):
    """
    Group consecutive pages into statements.

    Rules:
      - page with the SAME account as the current group -> continuation
      - page with NO detected account -> continuation of the current
        statement (continuation pages often lack the account header)
      - page with a DIFFERENT account -> new statement

    Safety net: if NO account could be detected on ANY page, lumping the
    whole document into a single statement is useless, so every page
    becomes its own statement and can be identified manually in review.
    """
    if not any(p['account'] for p in page_accounts):
        return [
            {'account': None, 'start_page': p['page_num'], 'end_page': p['page_num']}
            for p in page_accounts
        ]

    groups = []
    current = None
    for info in page_accounts:
        account = info['account']
        if current is not None and (account is None or account == current['account']):
            # Continuation page (no header found, or same account).
            current['end_page'] = info['page_num']
            continue
        current = {
            'account': account,
            'start_page': info['page_num'],
            'end_page': info['page_num'],
        }
        groups.append(current)
    return groups


def _write_statements(doc, groups, page_accounts, output_dir):
    """Write one PDF per group into ``output_dir`` and return their descriptions."""
    statements = []
    for idx, group in enumerate(groups):
        account = group['account'] or f"UNKNOWN_{idx}"
        start = group['start_page']
        end = group['end_page']
        page_range = f"{start + 1}-{end + 1}"

        # Statement date: first date found within the group's pages.
        statement_date = next(
            (
                p['statement_date']
                for p in page_accounts[start:end + 1]
                if p.get('statement_date')
            ),
            None,
        )

        # Sanitize account name for filename.
        safe_account = re.sub(r'[^\w\-\.]', '_', str(account))
        filename = f"statement_{safe_account}_pages_{page_range}.pdf"
        output_path = os.path.join(output_dir, filename)

        # Create new PDF with these pages; always release it, even if
        # copying or saving fails.
        new_doc = fitz.open()
        try:
            new_doc.insert_pdf(doc, from_page=start, to_page=end)
            new_doc.save(output_path)
        finally:
            new_doc.close()

        statements.append({
            'account': str(account),
            'statement_date': statement_date,
            'page_count': end - start + 1,
            'page_range': page_range,
            'filename': filename,
            'path': output_path,
        })
    return statements


def main():
    """
    Command-line entry point: split ``argv[1]`` into per-account PDFs in ``argv[2]``.

    Prints a JSON result object to stdout.  Exits with status 1 (after
    printing a JSON error object) when a dependency is missing, the
    arguments are wrong, the input does not exist, the PDF is empty, or
    splitting fails.
    """
    if _DEPENDENCY_ERROR is not None:
        print(json.dumps(_DEPENDENCY_ERROR))
        sys.exit(1)

    if len(sys.argv) < 3:
        _fail("Usage: split_statement_pdf.py input.pdf output_dir")

    input_pdf = sys.argv[1]
    output_dir = sys.argv[2]

    if not os.path.isfile(input_pdf):
        _fail(f"Input PDF not found: {input_pdf}")

    try:
        # Inside the try so that an unwritable output location is reported
        # as JSON like every other failure.
        os.makedirs(output_dir, exist_ok=True)

        doc = fitz.open(input_pdf)
        try:
            total_pages = len(doc)
            if total_pages == 0:
                # SystemExit is not an Exception, so it passes the handler
                # below; the finally clause still closes the document.
                _fail("PDF has no pages")

            page_accounts = _scan_pages(doc)
            groups = _group_pages(page_accounts)
            statements = _write_statements(doc, groups, page_accounts, output_dir)
        finally:
            doc.close()

        print(json.dumps({
            "ok": True,
            "statement_count": len(statements),
            "page_count": total_pages,
            "statements": statements,
            "page_details": page_accounts,
        }))
    except Exception as e:
        _fail("Failed to split PDF", str(e))


if __name__ == "__main__":
    main()