import sys
import pdfplumber
import csv
import re

# Two positional arguments are mandatory: the PDF to read and the CSV to
# write.  Anything after them is ignored.
if len(sys.argv) < 3:
    print("Usage: python extract_tables.py input.pdf output.csv")
    sys.exit(1)

pdf_path, csv_path = sys.argv[1:3]

def clean_text(text):
    """Return ``text`` normalised for CSV output.

    Carriage returns are dropped, bullet characters (U+2022) become ``-``
    and surrounding whitespace is trimmed.  A falsy value (``None`` or an
    empty string) yields ``""``.
    """
    if text:
        # One pass: delete '\r', map the bullet to a hyphen.
        table = str.maketrans({'\r': None, '\u2022': '-'})
        return text.translate(table).strip()
    return ""

def extract_weight(text):
    """Return the first standalone 1-3 digit number in ``text`` as a string.

    "Standalone" means delimited by word boundaries, so a digit run of four
    or more characters is not matched.  Returns ``""`` when ``text`` is
    falsy or contains no such number.
    """
    if text:
        found = re.search(r'\b(\d{1,3})\b', text)
        if found is not None:
            return found.group(1)
    return ""

def split_criterion_and_description(cell):
    """Split a combined cell into ``(criterion, description)``.

    The criterion is everything before the first newline, ``"- "`` or bullet
    character (U+2022); the description is the remainder with leading
    newlines, hyphens/spaces and bullets removed.  Both parts are
    whitespace-trimmed.  A falsy cell yields ``("", "")`` and a cell without
    any separator yields ``(cell.strip(), "")``.
    """
    if not cell:
        return "", ""

    # Split by first line break or bullet
    pieces = re.split(r'\n|- |\u2022', cell, maxsplit=1)
    head = pieces[0]
    if len(pieces) == 1:
        return head.strip(), ""

    # Re-slice the original cell so the separator itself is still present,
    # then peel the marker characters off the front one kind at a time.
    tail = cell[len(head):]
    for marker_chars in ('\n', '- ', '\u2022'):
        tail = tail.lstrip(marker_chars)
    return head.strip(), tail.strip()


def extract_heading_near_table(page, bbox, max_lines=3, band_height=120, line_tolerance=4):
    """Return the text found just above a table, as lines joined by " | ".

    Words that lie entirely inside the band of ``band_height`` units directly
    above the table's top edge are grouped into visual lines and each line is
    read left to right.  The ``max_lines`` lines closest to the table are
    returned.

    Args:
        page: Object whose ``extract_words(...)`` returns word dicts with
            "text", "top", "bottom" and "x0" keys (the script passes
            pdfplumber pages).
        bbox: Table bounding box; only ``bbox[1]`` is read, as the table's
            top edge.  A falsy value yields ``""``.
        max_lines: Maximum number of heading lines to return.  Values
            ``<= 0`` yield ``""``.
        band_height: Height of the search band above the table.  The band is
            clamped so it never starts above coordinate 0.
        line_tolerance: Maximum difference in "top" between a word and the
            first word of a line for both to count as the same line.

    Returns:
        The selected lines joined with " | ", or ``""`` when there is no
        bbox, word extraction fails, or no text lies in the band.
    """
    if not bbox or max_lines <= 0:
        return ""

    try:
        words = page.extract_words(use_text_flow=True, keep_blank_chars=False)
    except Exception:
        # Best effort: the heading is optional, so a failure to extract words
        # must not abort the table export.
        return ""

    if not words:
        return ""

    table_top = float(bbox[1])
    band_top = max(0.0, table_top - float(band_height))

    # Keep (top, x0, text) for every non-empty word fully inside the band.
    candidates = []
    for word in words:
        top = float(word.get("top", 0.0))
        bottom = float(word.get("bottom", top))
        if bottom > table_top or top < band_top:
            continue
        text = clean_text(word.get("text", ""))
        if text:
            candidates.append((top, float(word.get("x0", 0.0)), text))

    if not candidates:
        return ""

    # Order top-to-bottom only to form the lines.  Horizontal order is
    # applied per line afterwards: words on one visual line can differ
    # slightly in "top", and sorting on (top, x0) globally would then emit
    # them out of reading order.
    candidates.sort(key=lambda item: (item[0], item[1]))

    grouped = []
    line_top = None
    for top, x0, text in candidates:
        if line_top is None or abs(top - line_top) > line_tolerance:
            # The first word of a line anchors the tolerance window.
            grouped.append([])
            line_top = top
        grouped[-1].append((x0, text))

    lines = [
        " ".join(text for _, text in sorted(line_words, key=lambda item: item[0]))
        for line_words in grouped
    ]

    return " | ".join(lines[-max_lines:])

# Walk every page of the PDF, collect its tables and stream them to the CSV.
# Each table is introduced by a marker row
#   ["__TABLE_START__", page number, table index on that page, heading]
# followed by one [criterion, weight, description] row per usable table row.
with pdfplumber.open(pdf_path) as pdf:
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        for page_num, page in enumerate(pdf.pages, start=1):
            # (table_index, heading, rows) for every non-empty table on the page.
            table_entries = []

            try:
                found_tables = page.find_tables()
            except Exception:
                found_tables = []

            if found_tables:
                for table_index, table_obj in enumerate(found_tables, start=1):
                    try:
                        rows = table_obj.extract() or []
                    except Exception:
                        rows = []
                    if not rows:
                        # Indices are not renumbered, so a skipped table
                        # leaves a gap in table_index.
                        continue
                    heading = extract_heading_near_table(page, getattr(table_obj, "bbox", None))
                    table_entries.append((table_index, heading, rows))
            else:
                # No table objects: fall back to plain extraction, which has
                # no bounding box and therefore no heading.  Guarded like
                # find_tables() above so one unreadable page cannot abort the
                # whole export.
                try:
                    raw_tables = page.extract_tables() or []
                except Exception:
                    raw_tables = []
                for table_index, rows in enumerate(raw_tables, start=1):
                    if not rows:
                        continue
                    table_entries.append((table_index, "", rows))

            for table_index, heading, rows in table_entries:
                writer.writerow(["__TABLE_START__", str(page_num), str(table_index), clean_text(heading)])
                for row in rows:
                    # Case 1: 3 columns (criterion, weight, description)
                    if len(row) == 3:
                        criterion = clean_text(row[0])
                        weight = extract_weight(row[1])
                        description = clean_text(row[2])
                    # Case 2: 2 columns (criterion+desc, weight)
                    elif len(row) == 2:
                        crit_and_desc = clean_text(row[0])
                        weight = extract_weight(row[1])
                        criterion, description = split_criterion_and_description(crit_and_desc)
                    # Case 3: 1 column (all in one)
                    elif len(row) == 1:
                        cell = clean_text(row[0])
                        weight = extract_weight(cell)
                        if weight:
                            # Remove the standalone number that was taken as
                            # the weight.  A plain str.replace() would hit the
                            # first substring occurrence instead, e.g. the
                            # "25" inside "1250".
                            weight_token = r'\b' + re.escape(weight) + r'\b'
                            cell_wo_weight = re.sub(weight_token, '', cell, count=1).strip()
                        else:
                            cell_wo_weight = cell
                        criterion, description = split_criterion_and_description(cell_wo_weight)
                    else:
                        # Any other column count is not recognised; the row
                        # is dropped by the emptiness check below.
                        criterion = description = weight = ""

                    if not (criterion or weight or description):
                        continue

                    writer.writerow([criterion, weight, description])