"""
Generator for 03_format_standardizer test fixtures.

Seven fixtures in test_data/formats/:
  24_format_dates.csv       (FD-prefixed cases)
  25_format_phones.csv      (FP-prefixed cases)
  26_format_emails.csv      (FE-prefixed cases)
  27_format_addresses.csv   (FA-prefixed cases)
  28_format_names.csv       (FN-prefixed cases)
  29_format_currencies.csv  (FC-prefixed cases)
  30_format_integration.csv (FI-prefixed; cross-domain rows)

Plus expected outputs in expected/formats/ where policy is clear enough
to diff against.

Each fixture has the same column shape:
  case_id,category,description,input

Expected outputs:
  case_id,output                       (single-policy domains)
  case_id,output_default,output_<policy>    (multi-policy domains, e.g.
                                             output_gmail_canonical,
                                             output_expand_abbrev)

Default-policy choices are documented in FORMATS-CASES.md. Re-running
the generator is the canonical way to refresh fixtures after a policy
change; do not hand-edit the expected files.
"""
import csv
from pathlib import Path

# All paths are anchored to this script's directory so the generator can
# be run from any working directory. mkdir is idempotent (exist_ok=True),
# so re-running the generator to refresh fixtures is always safe.
ROOT = Path(__file__).parent
TD = ROOT / "test_data" / "formats"  # fixture inputs
EX = ROOT / "expected" / "formats"  # expected outputs
TD.mkdir(parents=True, exist_ok=True)
EX.mkdir(parents=True, exist_ok=True)


def write_input(filename, rows, *, base_dir=None):
    """Write one fixture-input CSV with the standard column shape.

    Args:
        filename: bare file name; created under ``base_dir`` (default: TD).
        rows: list of (case_id, category, description, input_value) tuples.
        base_dir: optional directory override; lets tests exercise the
            writer without touching the real fixture tree.

    Returns:
        Path to the file written.
    """
    target = TD if base_dir is None else Path(base_dir)
    path = target / filename
    # newline="" per the csv docs: the writer emits its own line endings.
    with path.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["case_id", "category", "description", "input"])
        w.writerows(rows)
    return path


def write_expected(filename, rows, columns, *, base_dir=None):
    """Write one expected-output CSV.

    Args:
        filename: bare file name; created under ``base_dir`` (default: EX).
        rows: list of tuples, each matching ``columns`` in length.
        columns: list of header field names (varies by policy count).
        base_dir: optional directory override; lets tests exercise the
            writer without touching the real expected-output tree.

    Returns:
        Path to the file written.
    """
    target = EX if base_dir is None else Path(base_dir)
    path = target / filename
    # newline="" per the csv docs: the writer emits its own line endings.
    with path.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(columns)
        w.writerows(rows)
    return path


# ============================================================================
# 24 - DATES
# ============================================================================
# Default output policy: ISO 8601 (YYYY-MM-DD). For datetime values, ISO with
# T separator. Time zones preserved if present, never auto-converted.
#
# Locale ambiguity (M/D/Y vs D/M/Y): per-column inspection. The cleaner
# scans all values in the column; if any value has day > 12 the locale is
# unambiguously D/M/Y; if any value has month > 12 (impossible) the locale
# is unambiguously M/D/Y; if no disambiguating value exists, error and ask
# user to specify.
#
# Two-digit year cutoff: Python default for %y (00-68 -> 2000-2068,
# 69-99 -> 1969-1999). Documented as a flag the user can override.

# Input corpus for fixture 24 (dates). Column shape:
# (case_id, category, description, input). The `category` groups related
# parsing scenarios so test failures can be triaged by family.
date_rows = [
    # ISO formats
    ("FD01", "iso",          "ISO date plain",                            "2024-01-15"),
    ("FD02", "iso",          "ISO datetime no zone",                      "2024-01-15T10:30:00"),
    ("FD03", "iso",          "ISO datetime UTC",                          "2024-01-15T10:30:00Z"),
    ("FD04", "iso",          "ISO datetime offset",                       "2024-01-15T10:30:00+05:00"),
    ("FD05", "iso",          "ISO datetime with millis",                  "2024-01-15T10:30:00.123Z"),
    ("FD06", "iso",          "ISO datetime space separator",              "2024-01-15 10:30:00"),
    # US formats
    ("FD07", "us",           "US slash 4-digit year",                     "01/15/2024"),
    ("FD08", "us",           "US slash 2-digit year",                     "1/15/24"),
    ("FD09", "us",           "US slash no leading zero",                  "1/5/2024"),
    ("FD10", "us",           "US slash unambiguous (day > 12)",           "5/30/2024"),
    # European formats
    ("FD11", "eu",           "EU dot 4-digit year",                       "15.01.2024"),
    ("FD12", "eu",           "EU dot 2-digit year",                       "15.01.24"),
    ("FD13", "eu",           "EU slash 4-digit year",                     "15/01/2024"),
    ("FD14", "eu",           "EU slash unambiguous (day > 12)",           "30/05/2024"),
    ("FD15", "eu",           "EU dash format",                            "15-01-2024"),
    # Long-form
    ("FD16", "longform",     "Month name long",                           "January 15, 2024"),
    ("FD17", "longform",     "Month name short",                          "Jan 15, 2024"),
    ("FD18", "longform",     "Day-month-year long",                       "15 January 2024"),
    ("FD19", "longform",     "Day-month-year short",                      "15 Jan 2024"),
    ("FD20", "longform",     "With weekday",                              "Monday, January 15, 2024"),
    ("FD21", "longform",     "All caps month",                            "JAN 15 2024"),
    # Excel-origin (serial 45306 corresponds to the same anchor date,
    # 2024-01-15, used across this corpus)
    ("FD22", "excel",        "Excel serial date",                         "45306"),
    ("FD23", "excel",        "Excel serial with fractional time",         "45306.4375"),
    # Unix
    ("FD24", "unix",         "Unix timestamp seconds",                    "1705320000"),
    ("FD25", "unix",         "Unix timestamp milliseconds",               "1705320000000"),
    # Partial / coarse
    ("FD26", "partial",      "Year-month only ISO",                       "2024-01"),
    ("FD27", "partial",      "Year-month text",                           "January 2024"),
    ("FD28", "partial",      "Quarter notation",                          "Q1 2024"),
    ("FD29", "partial",      "Year only",                                 "2024"),
    # Edge / pathological
    ("FD30", "edge",         "Two-digit year ambiguity (1969 vs 2069)",   "1/15/69"),
    ("FD31", "edge",         "Leap day valid",                            "2024-02-29"),
    ("FD32", "edge",         "Leap day invalid (not a leap year)",        "2023-02-29"),
    ("FD33", "edge",         "Excel 1900 leap year bug",                  "1900-02-29"),
    ("FD34", "edge",         "Invalid month",                             "2024-13-15"),
    ("FD35", "edge",         "Invalid day",                               "2024-04-31"),
    ("FD36", "edge",         "Date with extraneous text",                 "Date: 2024-01-15"),
    ("FD37", "edge",         "Date in parens annotation",                 "2024-01-15 (verified)"),
    ("FD38", "edge",         "Empty",                                     ""),
    ("FD39", "edge",         "Whitespace-only",                           "   "),
    ("FD40", "edge",         "Garbage",                                   "not a date"),
    # Localized month names
    ("FD41", "locale",       "French month name",                         "15 janvier 2024"),
    ("FD42", "locale",       "German month name",                         "15. Januar 2024"),
    # Time zones
    ("FD43", "timezone",     "Datetime with named tz",                    "2024-01-15 10:30:00 EST"),
    ("FD44", "timezone",     "Datetime with offset and DST ambiguity",    "2024-03-10 02:30:00-05:00"),
    # Padding / quotes
    ("FD45", "padding",      "Already-clean: pass through",               "2024-01-15"),
]

write_input("24_format_dates.csv", date_rows)

# Expected outputs for dates. Default policy: ISO 8601, errors as the
# literal string "<error: reason>" so they round-trip in CSV but are
# clearly distinguishable from valid dates.
date_expected = [
    ("FD01", "2024-01-15"),
    ("FD02", "2024-01-15T10:30:00"),
    ("FD03", "2024-01-15T10:30:00+00:00"),  # Z normalized to +00:00
    ("FD04", "2024-01-15T10:30:00+05:00"),
    ("FD05", "2024-01-15T10:30:00.123+00:00"),
    ("FD06", "2024-01-15T10:30:00"),         # space -> T
    ("FD07", "2024-01-15"),                  # US 4-digit
    ("FD08", "2024-01-15"),                  # US 2-digit, 24 -> 2024
    ("FD09", "2024-01-05"),
    ("FD10", "2024-05-30"),                  # US unambiguous (30 > 12, can't be month)
    ("FD11", "2024-01-15"),                  # EU dot
    ("FD12", "2024-01-15"),
    ("FD13", "2024-01-15"),                  # EU slash; will require column-level locale detection
    ("FD14", "2024-05-30"),
    ("FD15", "2024-01-15"),
    ("FD16", "2024-01-15"),
    ("FD17", "2024-01-15"),
    ("FD18", "2024-01-15"),
    ("FD19", "2024-01-15"),
    ("FD20", "2024-01-15"),                  # weekday discarded
    ("FD21", "2024-01-15"),                  # all-caps month
    ("FD22", "2024-01-15"),                  # Excel serial 45306
    ("FD23", "2024-01-15T10:30:00"),         # serial with .4375 = 10:30 AM
    ("FD24", "2024-01-15T12:00:00+00:00"),   # unix seconds
    ("FD25", "2024-01-15T12:00:00+00:00"),   # unix millis
    ("FD26", "2024-01"),                     # partial preserved (NOT padded to day 01)
    ("FD27", "2024-01"),
    ("FD28", "2024-Q1"),
    ("FD29", "2024"),
    # Python's default %y pivot: 00-68 -> 2000-2068, 69-99 -> 1969-1999,
    # so "69" parses as 1969, NOT 2069.
    ("FD30", "1969-01-15"),
    ("FD31", "2024-02-29"),
    ("FD32", "<error: invalid date - 2023 not a leap year>"),
    ("FD33", "<error: 1900-02-29 does not exist (Excel leap year bug)>"),
    ("FD34", "<error: invalid month 13>"),
    ("FD35", "<error: invalid day - April has 30 days>"),
    ("FD36", "2024-01-15"),                  # extract date from "Date: 2024-01-15"
    ("FD37", "2024-01-15"),                  # extract date, drop annotation
    ("FD38", ""),                            # empty stays empty (04's territory)
    ("FD39", ""),                            # whitespace-only becomes empty (02's territory but defensive)
    ("FD40", "<error: unparseable - 'not a date'>"),
    ("FD41", "2024-01-15"),                  # French
    ("FD42", "2024-01-15"),                  # German
    ("FD43", "2024-01-15T10:30:00-05:00"),   # EST -> -05:00
    ("FD44", "2024-03-10T02:30:00-05:00"),
    ("FD45", "2024-01-15"),                  # already clean
]

# Fixture 24 is single-policy: one `output` column per case.
write_expected("24_format_dates_expected.csv", date_expected, ["case_id", "output"])


# ============================================================================
# 25 - PHONES
# ============================================================================
# Default output policy: E.164 (+<country><number>). Extensions preserved
# via RFC 3966 ;ext=<digits>. Letters in numbers (1-800-FLOWERS) converted
# to digits using standard phone keypad mapping.
#
# Default country: detected from column-level pattern, OR explicitly set
# via --default-country=US. For this fixture, default = US.

# Input corpus for fixture 25 (phones). Column shape:
# (case_id, category, description, input). US cases all resolve to the
# same canonical number so formatting variants are directly comparable.
phone_rows = [
    # US standard
    ("FP01", "us",       "Plain digits 10",                       "5551234567"),
    ("FP02", "us",       "Standard formatting",                   "(555) 123-4567"),
    ("FP03", "us",       "Dashes",                                "555-123-4567"),
    ("FP04", "us",       "Dots",                                  "555.123.4567"),
    ("FP05", "us",       "Spaces",                                "555 123 4567"),
    ("FP06", "us",       "With country code +1",                  "+1 555 123 4567"),
    ("FP07", "us",       "With country code 1- prefix",           "1-555-123-4567"),
    ("FP08", "us",       "With 001 prefix",                       "001 555 123 4567"),
    # US extensions
    ("FP09", "ext",      "Extension ext keyword",                 "555-123-4567 ext 123"),
    ("FP10", "ext",      "Extension x abbreviation",              "555-123-4567 x123"),
    ("FP11", "ext",      "Extension hash",                        "555-123-4567 #123"),
    # Vanity / letters
    ("FP12", "vanity",   "Vanity number 1-800-FLOWERS",           "1-800-FLOWERS"),
    ("FP13", "vanity",   "Mixed letters and digits",              "555-CALL-NOW"),
    # International
    ("FP14", "intl",     "UK with +44",                           "+44 20 7946 0958"),
    ("FP15", "intl",     "UK domestic",                           "020 7946 0958"),
    ("FP16", "intl",     "Germany with +49",                      "+49 30 12345678"),
    ("FP17", "intl",     "France with +33",                       "+33 1 23 45 67 89"),
    ("FP18", "intl",     "Japan with +81",                        "+81-3-1234-5678"),
    ("FP19", "intl",     "Australia with +61",                    "+61 2 1234 5678"),
    # Already E.164
    ("FP20", "e164",     "Already E.164 format",                  "+15551234567"),
    # Edge cases (FP27/FP31 use \u escapes: NBSP and curly apostrophe)
    ("FP21", "edge",     "Too few digits (local-only)",           "555-1234"),
    ("FP22", "edge",     "Too many digits",                       "1-555-123-4567-extra-99"),
    ("FP23", "edge",     "All-zeros placeholder",                 "000-000-0000"),
    ("FP24", "edge",     "All-nines placeholder",                 "999-999-9999"),
    ("FP25", "edge",     "Multiple numbers in cell",              "555-123-4567 / 555-987-6543"),
    ("FP26", "edge",     "Mismatched parens",                     "555-(123)-4567"),
    ("FP27", "edge",     "NBSP in number",                        "555\u00a0123\u00a04567"),
    ("FP28", "edge",     "Very spaced",                           "5 5 5 1 2 3 4 5 6 7"),
    ("FP29", "edge",     "Empty",                                 ""),
    ("FP30", "edge",     "Non-phone string",                      "TBD"),
    ("FP31", "edge",     "Smart-apostrophe contamination",        "555\u2019s 123-4567"),
]

write_input("25_format_phones.csv", phone_rows)

# Expected outputs for phones. Single policy: E.164, extensions carried
# as RFC 3966 ";ext=<digits>", errors as "<error: reason>" strings.
phone_expected = [
    ("FP01", "+15551234567"),
    ("FP02", "+15551234567"),
    ("FP03", "+15551234567"),
    ("FP04", "+15551234567"),
    ("FP05", "+15551234567"),
    ("FP06", "+15551234567"),
    ("FP07", "+15551234567"),
    ("FP08", "+15551234567"),                # 001 = international prefix from US -> drop, then +1
    ("FP09", "+15551234567;ext=123"),
    ("FP10", "+15551234567;ext=123"),
    ("FP11", "+15551234567;ext=123"),
    ("FP12", "+18003569377"),                # FLOWERS = 3569377
    ("FP13", "+15552255669"),                # CALL=2255, NOW=669
    ("FP14", "+442079460958"),
    ("FP15", "+442079460958"),               # detected as UK; 0 is trunk prefix, dropped
    ("FP16", "+493012345678"),
    ("FP17", "+33123456789"),
    ("FP18", "+81312345678"),
    ("FP19", "+61212345678"),
    ("FP20", "+15551234567"),                # already E.164, idempotent
    ("FP21", "<error: insufficient digits - need 10 for US>"),
    ("FP22", "<error: too many digits>"),
    ("FP23", "<error: invalid - all zeros>"),
    ("FP24", "<error: invalid - reserved/placeholder pattern>"),
    ("FP25", "<error: multiple numbers detected - split into separate cells first>"),
    ("FP26", "+15551234567"),                # parens are noise, regardless of placement
    ("FP27", "+15551234567"),                # NBSP stripped (02 should have done this; defensive)
    ("FP28", "+15551234567"),
    ("FP29", ""),                            # empty stays empty
    ("FP30", "<error: unparseable - 'TBD'>"),
    ("FP31", "<error: unparseable - too few digits after cleanup>"),
]

write_expected("25_format_phones_expected.csv", phone_expected, ["case_id", "output"])


# ============================================================================
# 26 - EMAILS
# ============================================================================
# SCOPE NOTE: per TECHNICAL.md Section 10.1 item 8, email normalization is
# spec'd inside 01_deduplicator. This corpus tests it AS IF it lives in 03,
# on the assumption that 03 owns format standardization across all domains
# and 01 calls into it for matching. If you disagree, drop this fixture
# without affecting the others.
#
# Default policy:
#   - Lowercase both local and domain
#   - Strip outer whitespace
#   - Strip mailto: prefix
#   - Strip wrapping <>
#   - Extract from "Display Name <email>" format
#   - PRESERVE dots in local-part (Gmail-equivalence is opt-in)
#   - PRESERVE +tag (also opt-in)
# Aggressive policy (--gmail-canonical):
#   - For @gmail.com: strip dots, strip +tag

# Input corpus for fixture 26 (emails). Column shape:
# (case_id, category, description, input). IDN/Unicode cases use \u
# escapes so the generator source stays ASCII-safe.
email_rows = [
    # Basic
    ("FE01", "basic",      "Plain ASCII",                            "alice@example.com"),
    ("FE02", "basic",      "Mixed case",                             "Alice@Example.COM"),
    ("FE03", "basic",      "All caps",                               "ALICE@EXAMPLE.COM"),
    ("FE04", "basic",      "Whitespace padding",                     "  alice@example.com  "),
    # Display name forms
    ("FE05", "displayname","Display name no quotes",                 "Alice Smith <alice@example.com>"),
    ("FE06", "displayname","Display name with quotes",               '"Alice Smith" <alice@example.com>'),
    ("FE07", "displayname","Wrapped in angle brackets only",         "<alice@example.com>"),
    # Prefix variants
    ("FE08", "prefix",     "mailto: prefix",                         "mailto:alice@example.com"),
    ("FE09", "prefix",     "MAILTO: caps",                           "MAILTO:Alice@Example.com"),
    # Gmail-specific normalizations (opt-in)
    ("FE10", "gmail",      "Gmail with dots",                        "a.l.i.c.e@gmail.com"),
    ("FE11", "gmail",      "Gmail with +tag",                        "alice+newsletter@gmail.com"),
    ("FE12", "gmail",      "Gmail with both",                        "a.l.i.c.e+work@gmail.com"),
    ("FE13", "gmail",      "Non-Gmail with dots (don't touch)",      "a.l.i.c.e@example.com"),
    ("FE14", "gmail",      "Non-Gmail with +tag (don't touch)",      "alice+newsletter@example.com"),
    # IDN / Unicode
    ("FE15", "idn",        "Unicode in domain",                      "alice@m\u00fcnchen.de"),
    ("FE16", "idn",        "Unicode in local",                       "\u30a2\u30ea\u30b9@example.jp"),
    # Trailing punctuation contamination
    ("FE17", "trailing",   "Trailing comma",                         "alice@example.com,"),
    ("FE18", "trailing",   "Trailing period",                        "alice@example.com."),
    ("FE19", "trailing",   "Trailing closing paren",                 "alice@example.com)"),
    ("FE20", "trailing",   "Trailing semicolon",                     "alice@example.com;"),
    # Smart-quote contamination (assumes 02 didn't run)
    ("FE21", "smartquote", "Wrapped in curly quotes",                "\u201Calice@example.com\u201D"),
    # Invalid / error cases
    ("FE22", "invalid",    "Missing @",                              "aliceexample.com"),
    ("FE23", "invalid",    "Double @",                               "alice@@example.com"),
    ("FE24", "invalid",    "Multiple @",                             "alice@example@com"),
    ("FE25", "invalid",    "Spaces inside",                          "alice @ example.com"),
    ("FE26", "invalid",    "TLD-less local network",                 "alice@localhost"),
    # Multiple emails in one cell
    ("FE27", "multiple",   "Two comma-separated",                    "alice@example.com, bob@example.com"),
    ("FE28", "multiple",   "Two semicolon-separated",                "alice@example.com; bob@example.com"),
    # Edge
    ("FE29", "edge",       "Empty",                                  ""),
    ("FE30", "edge",       "Whitespace-only",                        "   "),
    ("FE31", "edge",       "Already perfect",                        "alice@example.com"),
]

write_input("26_format_emails.csv", email_rows)

# Two-policy expected output for emails: default (preserve Gmail-specific
# variants) and aggressive (--gmail-canonical strips dots and +tag for
# @gmail.com only).
# Two-policy expected rows for emails: (case_id, default output,
# --gmail-canonical output). Error cases are identical in both columns.
email_expected = [
    # case_id            default                              gmail_canonical
    ("FE01", "alice@example.com",                             "alice@example.com"),
    ("FE02", "alice@example.com",                             "alice@example.com"),
    ("FE03", "alice@example.com",                             "alice@example.com"),
    ("FE04", "alice@example.com",                             "alice@example.com"),
    ("FE05", "alice@example.com",                             "alice@example.com"),
    ("FE06", "alice@example.com",                             "alice@example.com"),
    ("FE07", "alice@example.com",                             "alice@example.com"),
    ("FE08", "alice@example.com",                             "alice@example.com"),
    ("FE09", "alice@example.com",                             "alice@example.com"),
    ("FE10", "a.l.i.c.e@gmail.com",                           "alice@gmail.com"),
    ("FE11", "alice+newsletter@gmail.com",                    "alice@gmail.com"),
    ("FE12", "a.l.i.c.e+work@gmail.com",                      "alice@gmail.com"),
    ("FE13", "a.l.i.c.e@example.com",                         "a.l.i.c.e@example.com"),  # not Gmail
    ("FE14", "alice+newsletter@example.com",                  "alice+newsletter@example.com"),
    ("FE15", "alice@m\u00fcnchen.de",                         "alice@m\u00fcnchen.de"),
    ("FE16", "\u30a2\u30ea\u30b9@example.jp",                 "\u30a2\u30ea\u30b9@example.jp"),
    ("FE17", "alice@example.com",                             "alice@example.com"),
    ("FE18", "alice@example.com",                             "alice@example.com"),
    ("FE19", "alice@example.com",                             "alice@example.com"),
    ("FE20", "alice@example.com",                             "alice@example.com"),
    ("FE21", "alice@example.com",                             "alice@example.com"),
    ("FE22", "<error: missing @>",                            "<error: missing @>"),
    ("FE23", "<error: invalid - double @>",                   "<error: invalid - double @>"),
    ("FE24", "<error: multiple @>",                           "<error: multiple @>"),
    ("FE25", "<error: whitespace inside email>",              "<error: whitespace inside email>"),
    ("FE26", "<error: missing TLD>",                          "<error: missing TLD>"),
    ("FE27", "<error: multiple emails - split into separate cells first>",
                                                              "<error: multiple emails - split into separate cells first>"),
    ("FE28", "<error: multiple emails - split into separate cells first>",
                                                              "<error: multiple emails - split into separate cells first>"),
    ("FE29", "",                                              ""),
    ("FE30", "",                                              ""),
    ("FE31", "alice@example.com",                             "alice@example.com"),
]

write_expected("26_format_emails_expected.csv", email_expected,
               ["case_id", "output_default", "output_gmail_canonical"])


# ============================================================================
# 27 - ADDRESSES
# ============================================================================
# Scope: US addresses get full normalization. Non-US addresses get
# whitespace + capitalization only.
#
# Default policy for US:
#   - Title case if input is ALL CAPS or all lowercase
#   - Preserve mixed case otherwise
#   - Normalize street types to USPS abbreviations (Street -> St, Avenue -> Ave)
#   - Normalize directionals (NORTH -> N, North -> N)
#   - Normalize unit indicators (Apartment -> Apt, # -> Apt, Suite -> Ste)
#   - Normalize state to 2-letter code
#   - ZIP: preserve as-is (don't restore lost leading zeros - that's risky)
# Aggressive policy (--expand-abbrev): expand St -> Street, Ave -> Avenue.
#   Some downstream systems prefer expanded forms. Off by default.

# Input corpus for fixture 27 (addresses). Column shape:
# (case_id, category, description, input). US cases exercise USPS-style
# normalization; non-US cases exist to verify minimal handling.
address_rows = [
    # Already clean
    ("FA01", "clean",      "Already USPS-formatted",          "123 Main St, New York, NY 10001"),
    # Capitalization
    ("FA02", "case",       "All caps",                        "123 MAIN STREET, NEW YORK, NY 10001"),
    ("FA03", "case",       "All lowercase",                   "123 main street, new york, ny 10001"),
    ("FA04", "case",       "Mixed case (preserve)",           "123 Main Street, New York, NY 10001"),
    # Street type abbreviations
    ("FA05", "abbrev",     "Street spelled out",              "123 Main Street, New York, NY 10001"),
    ("FA06", "abbrev",     "Avenue spelled out",              "456 Park Avenue, New York, NY 10001"),
    ("FA07", "abbrev",     "Boulevard spelled out",           "789 Sunset Boulevard, Los Angeles, CA 90028"),
    ("FA08", "abbrev",     "St with period",                  "123 Main St., New York, NY 10001"),
    # Directionals
    ("FA09", "directional","North spelled out",               "123 North Main St, City, ST 12345"),
    ("FA10", "directional","NORTH all caps",                  "123 NORTH Main St, City, ST 12345"),
    ("FA11", "directional","NE compound",                     "123 NE Main St, City, ST 12345"),
    # Units
    ("FA12", "unit",       "Apartment spelled out",           "123 Main St, Apartment 4B, City, ST 12345"),
    ("FA13", "unit",       "Hash sign",                       "123 Main St, # 4B, City, ST 12345"),
    ("FA14", "unit",       "Suite spelled out",               "123 Main St, Suite 200, City, ST 12345"),
    # States
    ("FA15", "state",      "State spelled out",               "123 Main St, New York, New York 10001"),
    ("FA16", "state",      "State all caps spelled out",      "123 Main St, New York, NEW YORK 10001"),
    # ZIP
    ("FA17", "zip",        "ZIP+4",                           "123 Main St, New York, NY 10001-1234"),
    ("FA18", "zip",        "Leading-zero ZIP (MA)",           "123 Main St, Boston, MA 02101"),
    # Multi-line (embedded \n survives the csv round-trip via quoting)
    ("FA19", "multiline",  "Multi-line address",              "123 Main St\nApt 4B\nNew York, NY 10001"),
    # PO Box
    ("FA20", "pobox",      "PO Box with periods",             "P.O. Box 123, City, ST 12345"),
    ("FA21", "pobox",      "PO Box without periods",          "PO Box 123, City, ST 12345"),
    ("FA22", "pobox",      "Post Office Box spelled out",     "Post Office Box 123, City, ST 12345"),
    # House-number quirks
    ("FA23", "housenum",   "Letter suffix",                   "123A Main St, City, ST 12345"),
    ("FA24", "housenum",   "Hyphen number",                   "123-1 Main St, City, ST 12345"),
    ("FA25", "housenum",   "Half number",                     "123 1/2 Main St, City, ST 12345"),
    # Non-US (minimal handling)
    ("FA26", "non_us",     "UK postcode address",             "10 Downing Street, London, SW1A 2AA"),
    ("FA27", "non_us",     "Canada postal code",              "1 Yonge St, Toronto, ON M5E 1W7"),
    ("FA28", "non_us",     "Japan reverse-order",             "100-0001, Tokyo, Chiyoda, Marunouchi 1-1"),
    # Edge
    ("FA29", "edge",       "Empty",                           ""),
    ("FA30", "edge",       "Just a city",                     "New York"),
    ("FA31", "edge",       "Trailing comma",                  "123 Main St, New York, NY 10001,"),
]

write_input("27_format_addresses.csv", address_rows)

# Two-policy: default (USPS abbreviations) and expand (--expand-abbrev).
# Two-policy expected rows for addresses: (case_id, default USPS-abbrev
# output, --expand-abbrev output). Rows are written as 3-tuples split
# across two source lines for readability.
address_expected = [
    # case_id  default                                                    expand_abbrev
    ("FA01", "123 Main St, New York, NY 10001",
             "123 Main Street, New York, NY 10001"),
    ("FA02", "123 Main St, New York, NY 10001",
             "123 Main Street, New York, NY 10001"),
    ("FA03", "123 Main St, New York, NY 10001",
             "123 Main Street, New York, NY 10001"),
    ("FA04", "123 Main St, New York, NY 10001",
             "123 Main Street, New York, NY 10001"),  # mixed-case Street -> St in default
    ("FA05", "123 Main St, New York, NY 10001",
             "123 Main Street, New York, NY 10001"),
    ("FA06", "456 Park Ave, New York, NY 10001",
             "456 Park Avenue, New York, NY 10001"),
    ("FA07", "789 Sunset Blvd, Los Angeles, CA 90028",
             "789 Sunset Boulevard, Los Angeles, CA 90028"),
    ("FA08", "123 Main St, New York, NY 10001",
             "123 Main Street, New York, NY 10001"),
    ("FA09", "123 N Main St, City, ST 12345",
             "123 North Main Street, City, ST 12345"),
    ("FA10", "123 N Main St, City, ST 12345",
             "123 North Main Street, City, ST 12345"),
    ("FA11", "123 NE Main St, City, ST 12345",
             "123 Northeast Main Street, City, ST 12345"),
    ("FA12", "123 Main St, Apt 4B, City, ST 12345",
             "123 Main Street, Apartment 4B, City, ST 12345"),
    ("FA13", "123 Main St, Apt 4B, City, ST 12345",
             "123 Main Street, Apartment 4B, City, ST 12345"),
    ("FA14", "123 Main St, Ste 200, City, ST 12345",
             "123 Main Street, Suite 200, City, ST 12345"),
    ("FA15", "123 Main St, New York, NY 10001",
             "123 Main Street, New York, New York 10001"),  # state expanded only in expand mode
    ("FA16", "123 Main St, New York, NY 10001",
             "123 Main Street, New York, New York 10001"),
    ("FA17", "123 Main St, New York, NY 10001-1234",
             "123 Main Street, New York, NY 10001-1234"),
    ("FA18", "123 Main St, Boston, MA 02101",
             "123 Main Street, Boston, MA 02101"),
    ("FA19", "123 Main St, Apt 4B, New York, NY 10001",
             "123 Main Street, Apartment 4B, New York, NY 10001"),    # multi-line collapsed to single
    ("FA20", "PO Box 123, City, ST 12345",
             "Post Office Box 123, City, ST 12345"),
    ("FA21", "PO Box 123, City, ST 12345",
             "Post Office Box 123, City, ST 12345"),
    ("FA22", "PO Box 123, City, ST 12345",
             "Post Office Box 123, City, ST 12345"),
    ("FA23", "123A Main St, City, ST 12345",
             "123A Main Street, City, ST 12345"),
    ("FA24", "123-1 Main St, City, ST 12345",
             "123-1 Main Street, City, ST 12345"),
    ("FA25", "123 1/2 Main St, City, ST 12345",
             "123 1/2 Main Street, City, ST 12345"),
    ("FA26", "10 Downing Street, London, SW1A 2AA",                  # non-US: minimal handling
             "10 Downing Street, London, SW1A 2AA"),
    ("FA27", "1 Yonge St, Toronto, ON M5E 1W7",                      # St abbrev still applies (Canadian also uses USPS-likes)
             "1 Yonge Street, Toronto, ON M5E 1W7"),
    ("FA28", "100-0001, Tokyo, Chiyoda, Marunouchi 1-1",             # Japan: minimal
             "100-0001, Tokyo, Chiyoda, Marunouchi 1-1"),
    ("FA29", "",                                                     ""),
    ("FA30", "New York",                                             "New York"),
    ("FA31", "123 Main St, New York, NY 10001",                      # trailing comma stripped
             "123 Main Street, New York, NY 10001"),
]

write_expected("27_format_addresses_expected.csv", address_expected,
               ["case_id", "output_default", "output_expand_abbrev"])


# ============================================================================
# 28 - NAMES
# ============================================================================
# Default policy: very conservative. Title-case ONLY when input is ALL CAPS
# or all lowercase. Preserve mixed-case input unchanged. Special handling
# for Mc/Mac, O', hyphenated, particles (van, von, de, du, la, le, di).
# Titles and suffixes: normalize period (Mr. -> Mr) and capitalization.
# Non-Latin scripts: pass through unchanged.

# Input fixture rows for 28_format_names.csv.
# Tuple shape matches write_input: (case_id, category, description, input).
# Non-Latin inputs are written as \u escapes so this generator stays ASCII-safe.
name_rows = [
    # Capitalization
    ("FN01", "case",       "All caps",                          "ALICE SMITH"),
    ("FN02", "case",       "All lowercase",                     "alice smith"),
    ("FN03", "case",       "Already title case (preserve)",     "Alice Smith"),
    ("FN04", "case",       "Random case (preserve)",            "aLiCe SmItH"),
    # Mc / Mac
    ("FN05", "scots",      "McDonald lowercase",                "mcdonald"),
    ("FN06", "scots",      "MCDONALD all caps",                 "MCDONALD"),
    ("FN07", "scots",      "MacDonald",                         "macdonald"),
    ("FN08", "scots",      "McTaggart already correct",         "McTaggart"),
    # O'
    ("FN09", "irish",      "O'Connor lowercase",                "o'connor"),
    ("FN10", "irish",      "O'CONNOR all caps",                 "O'CONNOR"),
    ("FN11", "irish",      "O'Brien preserve",                  "O'Brien"),
    # Hyphenated
    ("FN12", "hyphen",     "Mary-Jane lowercase",               "mary-jane smith"),
    ("FN13", "hyphen",     "Smith-Jones",                       "smith-jones"),
    # Particles
    ("FN14", "particle",   "von Trapp",                         "von trapp"),
    ("FN15", "particle",   "Vincent van Gogh",                  "vincent van gogh"),
    ("FN16", "particle",   "Charles de Gaulle",                 "charles de gaulle"),
    ("FN17", "particle",   "Leonardo da Vinci",                 "leonardo da vinci"),
    # Titles
    ("FN18", "title",      "Mr period",                         "Mr. John Smith"),
    ("FN19", "title",      "DR caps",                           "DR JANE DOE"),
    ("FN20", "title",      "Prof preserve",                     "Prof Alice Williams"),
    # Suffixes
    ("FN21", "suffix",     "Jr period",                         "John Smith Jr."),
    ("FN22", "suffix",     "III roman numeral",                 "John Smith III"),
    ("FN23", "suffix",     "PhD",                               "Jane Doe PhD"),
    # Comma format
    ("FN24", "comma",      "Last, First",                       "Smith, John"),
    ("FN25", "comma",      "LAST, FIRST",                       "SMITH, JOHN"),
    ("FN26", "comma",      "Last, First Middle",                "Smith, John Andrew"),
    # Initials
    ("FN27", "initial",    "Middle initial",                    "John A. Smith"),
    ("FN28", "initial",    "Multi-initial author",              "j.k. rowling"),
    # Non-Latin
    ("FN29", "nonlatin",   "Korean",                            "\uae40\ucca0\uc218"),
    ("FN30", "nonlatin",   "Japanese",                          "\u7530\u4e2d\u592a\u90ce"),
    ("FN31", "nonlatin",   "Russian",                           "\u0418\u0432\u0430\u043d \u0418\u0432\u0430\u043d\u043e\u0432"),
    # Edge
    ("FN32", "edge",       "Single name",                       "Madonna"),
    ("FN33", "edge",       "Empty",                             ""),
    ("FN34", "edge",       "Whitespace-only",                   "   "),
]

write_input("28_format_names.csv", name_rows)

# Expected default-policy outputs for names. Single-policy domain, so the
# expected file has one `output` column per case_id. The policy summary is
# in the section header above; FORMATS-CASES.md is the authoritative source.
name_expected = [
    ("FN01", "Alice Smith"),
    ("FN02", "Alice Smith"),
    ("FN03", "Alice Smith"),                          # already title - preserve
    ("FN04", "aLiCe SmItH"),                          # mixed case - PRESERVE (don't guess)
    ("FN05", "McDonald"),
    ("FN06", "McDonald"),
    ("FN07", "MacDonald"),                            # default convention: MacD style
    ("FN08", "McTaggart"),
    ("FN09", "O'Connor"),
    ("FN10", "O'Connor"),
    ("FN11", "O'Brien"),
    ("FN12", "Mary-Jane Smith"),
    ("FN13", "Smith-Jones"),
    ("FN14", "von Trapp"),                            # particle stays lowercase
    ("FN15", "Vincent van Gogh"),
    ("FN16", "Charles de Gaulle"),
    ("FN17", "Leonardo da Vinci"),
    ("FN18", "Mr John Smith"),                        # period stripped
    ("FN19", "Dr Jane Doe"),
    ("FN20", "Prof Alice Williams"),
    ("FN21", "John Smith Jr"),
    ("FN22", "John Smith III"),                       # roman numerals stay all-caps
    ("FN23", "Jane Doe PhD"),                         # PhD stays mixed-case
    ("FN24", "John Smith"),                           # comma format reversed to natural order
    ("FN25", "John Smith"),
    ("FN26", "John Andrew Smith"),
    ("FN27", "John A. Smith"),                        # middle initial preserved
    ("FN28", "J.K. Rowling"),
    ("FN29", "\uae40\ucca0\uc218"),                   # non-Latin: pass through
    ("FN30", "\u7530\u4e2d\u592a\u90ce"),
    ("FN31", "\u0418\u0432\u0430\u043d \u0418\u0432\u0430\u043d\u043e\u0432"),
    ("FN32", "Madonna"),
    ("FN33", ""),
    ("FN34", ""),                                     # whitespace-only collapses to empty
]

write_expected("28_format_names_expected.csv", name_expected, ["case_id", "output"])


# ============================================================================
# 29 - CURRENCIES
# ============================================================================
# Default policy:
#   - Parse the numeric value
#   - Preserve currency symbol/code if present
#   - Normalize internal formatting: dot decimal, no thousand separators,
#     leading sign for negatives
#   - Locale detection per-column (US vs EU thousand/decimal conventions)
#   - Accounting parens become leading minus: ($100) -> -$100

# Input fixture rows for 29_format_currencies.csv.
# Tuple shape matches write_input: (case_id, category, description, input).
# Currency symbols are written as \u escapes (euro, pound, yen, rupee).
currency_rows = [
    # US format
    ("FC01", "us",         "Standard US dollar",                "$1,234.56"),
    ("FC02", "us",         "US no comma",                       "$1234.56"),
    ("FC03", "us",         "US space after symbol",             "$ 1,234.56"),
    ("FC04", "us",         "US no symbol",                      "1,234.56"),
    ("FC05", "us",         "US with code suffix",               "1,234.56 USD"),
    ("FC06", "us",         "US with code prefix",               "USD 1,234.56"),
    ("FC07", "us",         "US trailing symbol",                "1234.56$"),
    # European format
    ("FC08", "eu",         "Euro standard",                     "\u20ac1.234,56"),
    ("FC09", "eu",         "Euro space thousand",               "\u20ac1 234,56"),
    ("FC10", "eu",         "Euro code suffix",                  "1.234,56 EUR"),
    ("FC11", "eu",         "Swiss apostrophe thousand",         "1'234.56"),
    # Other currencies
    ("FC12", "intl",       "GBP",                               "\u00a31,234.56"),
    ("FC13", "intl",       "JPY no decimal",                    "\u00a51,234"),
    ("FC14", "intl",       "Indian rupees lakhs",               "\u20b91,23,456.78"),
    # Negatives
    ("FC15", "negative",   "Leading minus",                     "-$100.00"),
    ("FC16", "negative",   "Accounting parens",                 "($100.00)"),
    ("FC17", "negative",   "Sign after symbol",                 "$-100.00"),
    # Edge
    ("FC18", "edge",       "Zero",                              "$0.00"),
    ("FC19", "edge",       "Scientific notation",               "1.5e6"),
    ("FC20", "edge",       "Percentage",                        "15.5%"),
    ("FC21", "edge",       "Range (not normalizable)",          "$50-$100"),
    ("FC22", "edge",       "Word value",                        "Free"),
    ("FC23", "edge",       "TBD placeholder",                   "TBD"),
    ("FC24", "edge",       "Empty",                             ""),
    ("FC25", "edge",       "Already clean",                     "1234.56"),
    # Locale-ambiguous
    ("FC26", "ambig",      "1,234 - could be US 1234 or EU 1.234",  "1,234"),
    ("FC27", "ambig",      "1.234 - could be US 1.234 or EU 1234",  "1.234"),
]

write_input("29_format_currencies.csv", currency_rows)

# Default output: <symbol_or_code><normalized_number>. Normalized number
# uses dot decimal, no thousand separators, leading minus sign for negatives.
# Currency symbol position preserved if it was there; if no symbol, pure number.
# NOTE(review): "<error: ...>" entries look like sentinel strings the tests
# compare verbatim — keep their wording in sync with the standardizer; confirm.
currency_expected = [
    ("FC01", "$1234.56"),
    ("FC02", "$1234.56"),
    ("FC03", "$1234.56"),
    ("FC04", "1234.56"),                                  # no symbol -> pure number
    ("FC05", "1234.56 USD"),
    ("FC06", "USD 1234.56"),
    ("FC07", "1234.56$"),                                 # trailing symbol preserved
    ("FC08", "\u20ac1234.56"),                            # EU comma->dot
    ("FC09", "\u20ac1234.56"),
    ("FC10", "1234.56 EUR"),
    ("FC11", "1234.56"),                                  # Swiss apostrophe stripped
    ("FC12", "\u00a31234.56"),
    ("FC13", "\u00a51234"),
    ("FC14", "\u20b9123456.78"),                          # lakhs grouping flattened
    ("FC15", "-$100.00"),
    ("FC16", "-$100.00"),                                 # parens -> leading minus
    ("FC17", "-$100.00"),                                 # sign normalized to leading
    ("FC18", "$0.00"),
    ("FC19", "1500000"),                                  # scientific expanded
    ("FC20", "<error: percentage not currency>"),
    ("FC21", "<error: range not normalizable - split first>"),
    ("FC22", "<error: non-numeric - 'Free'>"),
    ("FC23", "<error: non-numeric - 'TBD'>"),
    ("FC24", ""),
    ("FC25", "1234.56"),
    ("FC26", "<error: ambiguous separator - specify --locale us|eu>"),
    ("FC27", "<error: ambiguous separator - specify --locale us|eu>"),
]

write_expected("29_format_currencies_expected.csv", currency_expected, ["case_id", "output"])


# ============================================================================
# 30 - INTEGRATION (cross-domain rows)
# ============================================================================
# A single row with multiple format issues across columns. Tests that
# format standardization applied to multiple columns in one pass produces
# the right output and doesn't drop or scramble fields.
# Schema: case_id, name, email, phone, date, amount, address

# The integration fixture has a wider, multi-column shape, so it cannot go
# through write_input/write_expected (both assume the 4-column layout).
# The input and expected files share one header; write both through a single
# helper instead of two hand-rolled copies of the same CSV stanza.
_INTEGRATION_COLUMNS = ["case_id", "name", "email", "phone", "date", "amount", "address"]


def _write_integration_csv(path, rows):
    """Write header + `rows` of the integration schema to `path` as UTF-8 CSV."""
    with path.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(_INTEGRATION_COLUMNS)
        w.writerows(rows)


# Schema per row: case_id, name, email, phone, date, amount, address.
integration_input_rows = [
    ("FI01", "ALICE SMITH",   "Alice@Example.COM",  "(555) 123-4567",
            "1/15/24",       "$1,234.56",       "123 main street, new york, ny 10001"),
    ("FI02", "mcdonald, john","mailto:John@gmail.com","+44 20 7946 0958",
            "15.01.2024",    "\u20ac1.234,56",  "10 DOWNING STREET, LONDON, SW1A 2AA"),
    ("FI03", "DR JANE DOE PHD","\"Jane Doe\" <jane@example.com>","555-1234",
            "Jan 15, 2024",  "($100.00)",       "456 Park Avenue, Apt 12, New York, NEW YORK 10001"),
    ("FI04", "",              "",               "",
            "",              "",                ""),
    ("FI05", "Already Clean", "alice@example.com","+15551234567",
            "2024-01-15",    "1234.56",         "123 Main St, New York, NY 10001"),
]

path = TD / "30_format_integration.csv"
_write_integration_csv(path, integration_input_rows)

# Expected output for integration: each cell normalized per its column's
# domain, all in one pass. Used to verify that running 03 across multiple
# columns produces consistent output and doesn't drop or scramble fields.
integration_expected_rows = [
    ("FI01", "Alice Smith",       "alice@example.com",  "+15551234567",
            "2024-01-15",        "$1234.56",        "123 Main St, New York, NY 10001"),
    ("FI02", "John McDonald",     "john@gmail.com",     "+442079460958",
            "2024-01-15",        "\u20ac1234.56",   "10 Downing Street, London, SW1A 2AA"),
    ("FI03", "Dr Jane Doe PhD",   "jane@example.com",   "<error: insufficient digits - need 10 for US>",
            "2024-01-15",        "-$100.00",        "456 Park Ave, Apt 12, New York, NY 10001"),
    ("FI04", "",                  "",                   "",
            "",                  "",                ""),
    ("FI05", "Already Clean",     "alice@example.com",  "+15551234567",
            "2024-01-15",        "1234.56",         "123 Main St, New York, NY 10001"),
]

path_ex = EX / "30_format_integration_expected.csv"
_write_integration_csv(path_ex, integration_expected_rows)


# ============================================================================
# Summary
# ============================================================================
# Final report: enumerate everything this run (re)generated so the console
# log shows exactly which fixture files exist after regeneration.
input_files = sorted(TD.glob("*.csv"))
print(f"Wrote {len(input_files)} input fixtures in {TD}:")
for fixture in input_files:
    print(f"  {fixture.name}")

expected_files = sorted(EX.glob("*.csv"))
print(f"\nWrote {len(expected_files)} expected-output files in {EX}:")
for fixture in expected_files:
    print(f"  {fixture.name}")
