#!/usr/bin/env python3
"""
Filter a diploma CSV into shipping-ready CSVs plus light validation.

Inputs should contain these columns, or obvious aliases:
  CWID, First Name, Last Name, Suffix, Diploma Hold Issue, Program, Holds,
  Diploma Notification Sent, Metal Diploma Picked Up, Metal Diploma Mailed,
  Student Email, Street Address 1, Street Address 2, Street Address 3,
  City, State, Zip or Zipcode, Country, Intl Phone

Outputs to ./Filtered_CSVs
  - CSV_undergraduate_domestic.csv
  - CSV_graduate_domestic.csv
  - CSV_undergraduate_international.csv    (adds Country and Phone; Country is after Zipcode)
  - CSV_graduate_international.csv         (adds Country and Phone; Country is after Zipcode)
  - CSV_holds_undergraduate.csv
  - CSV_holds_graduate.csv
  - CSV_undergraduate_invalid_address.csv
  - CSV_graduate_invalid_address.csv
  - CSV_filter_notes.txt

Validation to ./Error_CSVs
  - duplicate_cwid_UG.csv
  - duplicate_cwid_G.csv
  - inconsistent_country_state_rows_ALL.csv
"""

from __future__ import annotations
import sys
from pathlib import Path
from typing import List
import pandas as pd

INPUT_FILENAME = "diploma_data.csv"

US_STATE_ABBRS = {
    "AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA",
    "HI","ID","IL","IN","IA","KS","KY","LA","ME","MD",
    "MA","MI","MN","MS","MO","MT","NE","NV","NH","NJ",
    "NM","NY","NC","ND","OH","OK","OR","PA","RI","SC",
    "SD","TN","TX","UT","VT","VA","WA","WV","WI","WY","DC"
}

US_ALIASES = {"US","USA","U.S.","U.S.A.","UNITED STATES","UNITED STATES OF AMERICA"}


# ---------------- helpers ----------------

def norm_str(x) -> str:
    if x is None:
        return ""
    return str(x).strip()

def is_truthy(x) -> bool:
    s = norm_str(x).lower()
    return s in {"true","t","yes","y","1","x","✓","shipped","picked","mailed","delivered"}

def is_yes(x) -> bool:
    s = norm_str(x).lower()
    return s in {"yes","y","true","1"}

def full_name(first: str, last: str, suffix: str) -> str:
    f = norm_str(first)
    l = norm_str(last)
    s = norm_str(suffix)
    return f"{f} {l}, {s}" if s else f"{f} {l}".strip()

def detect_invalid_address(street1: str) -> bool:
    """
    Returns True for rows that should be treated as INVALID ADDRESS.
    """
    s = norm_str(street1).lower()
    if not s:
        return True

    INVALID_MARKERS = {
        "po box address invalid for diploma shipping",
        "no diploma address on file",
        "no address on file",
        "address not provided",
    }
    return any(token in s for token in INVALID_MARKERS)

def ensure_column_alias(df: pd.DataFrame, new_name: str, candidates: List[str]) -> pd.DataFrame:
    if new_name in df.columns:
        return df
    for c in candidates:
        if c in df.columns:
            df[new_name] = df[c]
            return df
    df[new_name] = ""
    return df

def domestic_country_and_state(country: str, state: str, zipcode: str | None = None) -> bool:
    """
    Domestic if:
      - Country is a US alias, or
      - Country empty and State is a valid US abbreviation, or
      - Country empty and Zip looks like a US ZIP (5 or 9 digits)
    """
    c = norm_str(country).upper()
    s = norm_str(state).upper()
    z = norm_str(zipcode)
    if c in US_ALIASES:
        return True
    if c == "" and s in US_STATE_ABBRS:
        return True
    zr = z.replace("-", "")
    if c == "" and zr.isdigit() and len(zr) in (5, 9):
        return True
    return False

def project_view(sub: pd.DataFrame, attach_phone: bool = False, include_country: bool = False) -> pd.DataFrame:
    """
    Build the outbound view with stable column ordering.

    For international outputs (include_country=True), place Country immediately after Zipcode.
    Phone is included only when attach_phone=True.
    """
    # Ensure optional address columns exist
    for col in ["Street Address 2", "Street Address 3"]:
        if col not in sub.columns:
            sub[col] = ""

    df_all = pd.DataFrame({
        "Full Name": [full_name(f, l, s) for f, l, s in zip(sub["First Name"], sub["Last Name"], sub["Suffix"])],
        "Address": sub["Street Address 1"],
        "Address Line 2": sub["Street Address 2"],
        "Address Line 3": sub["Street Address 3"],
        "City": sub["City"],
        "State": sub["State"].astype(str).str.upper(),
        "Zipcode": sub["Zipcode"],
        "Country": sub.get("Country", "").astype(str).str.upper(),  # may be dropped from order for domestic
        "Email Address": sub["Student Email"],
        "Phone": sub.get("Intl Phone", ""),                         # included only when requested
    })

    # Base order through Zipcode
    order = [
        "Full Name",
        "Address",
        "Address Line 2",
        "Address Line 3",
        "City",
        "State",
        "Zipcode",
    ]

    # Insert Country right after Zipcode for international outputs only
    if include_country:
        order.append("Country")

    # Then email
    order.append("Email Address")

    # Phone only when requested
    if attach_phone:
        order.append("Phone")

    return df_all[order]


# ---------------- main ----------------

def main():
    script_dir = Path(__file__).resolve().parent
    out_filtered = script_dir / "Filtered_CSVs"
    out_errors = script_dir / "Error_CSVs"
    out_filtered.mkdir(parents=True, exist_ok=True)
    out_errors.mkdir(parents=True, exist_ok=True)

    # CLI override: allow absolute or relative CSV path
    if len(sys.argv) > 1 and sys.argv[1]:
        input_csv = Path(sys.argv[1]).expanduser().resolve()
    else:
        input_csv = script_dir / INPUT_FILENAME

    if not input_csv.exists():
        print(f"ERROR: Input file not found: {input_csv}")
        return

    print(f"Reading input file: {input_csv.name}")
    print(f"Working directory: {script_dir}")

    df = pd.read_csv(input_csv, dtype=str, keep_default_na=False).copy()
    df.columns = [c.strip() for c in df.columns]

    # Normalize aliases
    df = ensure_column_alias(df, "Zipcode", ["Zip","ZIP","Postal Code","Postal"])
    df = ensure_column_alias(df, "CWID", ["CWID","Student ID","ID"])
    df = ensure_column_alias(df, "Intl Phone", ["Intl Phone","International Phone","Phone Intl","International Phone Number"])
    df = ensure_column_alias(df, "Street Address 2", ["Street Address 2","Address 2","Addr2"])
    df = ensure_column_alias(df, "Street Address 3", ["Street Address 3","Address 3","Addr3"])

    required_cols = [
        "First Name","Last Name","Suffix","Diploma Hold Issue","Program","Holds",
        "Diploma Notification Sent","Metal Diploma Picked Up","Metal Diploma Mailed",
        "Student Email","Street Address 1","Street Address 2","Street Address 3",
        "City","State","Zipcode","Country","Intl Phone","CWID"
    ]
    soft_optional = {"Street Address 2","Street Address 3","Intl Phone","CWID"}
    missing = [c for c in required_cols if c not in df.columns]
    truly_missing = [c for c in missing if c not in soft_optional]
    if truly_missing:
        raise ValueError(f"Missing required columns: {truly_missing}")

    print(f"Total rows in input: {len(df)}")

    # Standardize capitalization
    if "State" in df.columns:
        df["State"] = df["State"].astype(str).str.upper()
    if "Country" in df.columns:
        df["Country"] = df["Country"].astype(str).str.upper()

    # Keep only rows that still need to ship
    need_ship = ~df["Metal Diploma Picked Up"].apply(is_truthy) & ~df["Metal Diploma Mailed"].apply(is_truthy)
    df_ship = df[need_ship].copy()

    # UG vs G pools
    is_bs = df_ship["Program"].astype(str).str.contains("Bachelor of Science", case=False, na=False)
    ug_pool = df_ship[is_bs].copy()
    g_pool = df_ship[~is_bs].copy()

    # Holds and invalid address
    holds_ug = ug_pool[ug_pool["Holds"].apply(is_yes)].copy()
    holds_g  = g_pool[g_pool["Holds"].apply(is_yes)].copy()
    invalid_ug = ug_pool[ug_pool["Street Address 1"].apply(detect_invalid_address)].copy()
    invalid_g  = g_pool[g_pool["Street Address 1"].apply(detect_invalid_address)].copy()

    # Exclude holds and invalid from shipping subsets
    ug_pool = ug_pool[~ug_pool.index.isin(holds_ug.index) & ~ug_pool.index.isin(invalid_ug.index)].copy()
    g_pool  = g_pool[~g_pool.index.isin(holds_g.index)   & ~g_pool.index.isin(invalid_g.index)].copy()

    # Domestic vs international, using US alias and ZIP heuristic
    domestic_flag = df_ship.apply(lambda r: domestic_country_and_state(
        r.get("Country",""), r.get("State",""), r.get("Zipcode","")), axis=1)
    international_flag = ~domestic_flag

    ug_domestic      = ug_pool[domestic_flag[ug_pool.index]].copy()
    ug_international = ug_pool[international_flag[ug_pool.index]].copy()
    g_domestic       = g_pool[domestic_flag[g_pool.index]].copy()
    g_international  = g_pool[international_flag[g_pool.index]].copy()

    # Build outputs
    outputs = {
        "CSV_undergraduate_domestic.csv":       project_view(ug_domestic, attach_phone=False, include_country=False),
        "CSV_graduate_domestic.csv":            project_view(g_domestic, attach_phone=False, include_country=False),
        "CSV_undergraduate_international.csv":  project_view(ug_international, attach_phone=True,  include_country=True),
        "CSV_graduate_international.csv":       project_view(g_international,  attach_phone=True,  include_country=True),
        "CSV_holds_undergraduate.csv":          project_view(holds_ug, attach_phone=False, include_country=False),
        "CSV_holds_graduate.csv":               project_view(holds_g, attach_phone=False, include_country=False),
        "CSV_undergraduate_invalid_address.csv":project_view(invalid_ug, attach_phone=False, include_country=False),
        "CSV_graduate_invalid_address.csv":     project_view(invalid_g, attach_phone=False, include_country=False),
    }

    print("\nWriting output files to Filtered_CSVs ...")
    for name, df_out in outputs.items():
        (out_filtered / name).parent.mkdir(parents=True, exist_ok=True)
        df_out.to_csv(out_filtered / name, index=False)
        print(f"  ✓ {name} ({len(df_out)} rows)")

    # ---------------- validation ----------------

    # Duplicate CWIDs by group, for internal reconciliation only
    for tag, subset in [("UG", pd.concat([ug_domestic, ug_international, holds_ug, invalid_ug], ignore_index=True)),
                        ("G",  pd.concat([g_domestic, g_international, holds_g,  invalid_g ], ignore_index=True))]:
        if "CWID" in subset.columns:
            cwid = subset["CWID"].astype(str).str.strip()
            dupe_mask = cwid.duplicated(keep=False) & cwid.ne("")
            if dupe_mask.any():
                subset.loc[dupe_mask, [
                    "CWID","First Name","Last Name","Student Email",
                    "Street Address 1","City","State","Zipcode","Country"
                ]].to_csv(out_errors / f"duplicate_cwid_{tag}.csv", index=False)

    # Country and State consistency across entire input
    weird_rows = []
    for idx, r in df.iterrows():
        c = norm_str(r.get("Country","")).upper()
        s = norm_str(r.get("State","")).upper()
        if c and c not in US_ALIASES and s in US_STATE_ABBRS:
            weird_rows.append((idx+2, r.get("First Name",""), r.get("Last Name",""), c, s, r.get("Street Address 1","")))
        if c in US_ALIASES and s and s not in US_STATE_ABBRS:
            weird_rows.append((idx+2, r.get("First Name",""), r.get("Last Name",""), c, s, r.get("Street Address 1","")))
    if weird_rows:
        pd.DataFrame(weird_rows, columns=["Row","First Name","Last Name","Country","State","Address1"]).to_csv(
            out_errors / "inconsistent_country_state_rows_ALL.csv", index=False
        )

    # Notes for quick triage
    notes = []
    def add_count(label, xdf): notes.append(f"{label}: {len(xdf)}")
    add_count("UG domestic (shipping)", ug_domestic)
    add_count("G domestic (shipping)", g_domestic)
    add_count("UG international (shipping)", ug_international)
    add_count("G international (shipping)", g_international)
    add_count("Holds UG", holds_ug)
    add_count("Holds G", holds_g)
    add_count("Invalid address UG", invalid_ug)
    add_count("Invalid address G", invalid_g)
    notes.append(f"Inconsistent Country or State rows: {len(weird_rows)}")
    with open(out_filtered / "CSV_filter_notes.txt", "w", encoding="utf-8") as f:
        f.write("=== FILTER NOTES ===\n")
        for line in notes:
            f.write(line + "\n")
        if weird_rows:
            f.write("\nSample inconsistent Country/State rows (first 25):\n")
            for tup in weird_rows[:25]:
                idx, fn, ln, c, s, a1 = tup
                f.write(f"- Row {idx}: {fn} {ln} | Country='{c}' State='{s}' | Address1='{a1}'\n")

    print("\nNotes saved to: Filtered_CSVs/CSV_filter_notes.txt")
    print(f"Filtered CSVs saved to: {out_filtered}")
    print(f"Validation CSVs saved to: {out_errors}")
    print("Done.")

if __name__ == "__main__":
    main()
