"""
Generates deduplicator test CSVs.
One file per use case in BUSINESS.md Section 4a, plus edge-case files
for Tier 1 spec items not naturally covered by the use cases.

Each file targets ONE primary dedup challenge so test results are diagnostic.
Row counts kept small (15-30) so expected output can be eyeballed.
"""

import csv
import os
from pathlib import Path

OUT = Path(".")


def write_csv(name, header, rows, encoding="utf-8", delimiter=","):
    path = OUT / name
    with open(path, "w", newline="", encoding=encoding) as f:
        w = csv.writer(f, delimiter=delimiter)
        w.writerow(header)
        w.writerows(rows)
    print(f"wrote {name} ({len(rows)} rows)")


# ---------------------------------------------------------------------------
# UC1: Shopify customer list - email/phone variant duplicates
# Tests: email canonicalization (Gmail dots, +tags, case), phone normalization,
#        same-person-different-email-same-phone detection
# ---------------------------------------------------------------------------
write_csv(
    "uc01_shopify_customer_list.csv",
    ["customer_id", "first_name", "last_name", "email", "phone", "signup_date"],
    [
        # Gmail dot + case + plus-tag variants of the same person
        ["C001", "John", "Smith", "john.smith@gmail.com", "(555) 123-4567", "2025-01-15"],
        ["C002", "John", "Smith", "JohnSmith@Gmail.com", "555-123-4567", "2025-02-03"],
        ["C003", "John", "Smith", "j.o.h.n.smith+shop@gmail.com", "+15551234567", "2025-02-19"],
        # Different person, similar name (should NOT match)
        ["C004", "Jon", "Smith", "jon.smith@yahoo.com", "(555) 987-6543", "2025-01-22"],
        # Phone-format variants, exact same person
        ["C005", "Maria", "Garcia", "maria@example.com", "555.222.3333", "2025-03-01"],
        ["C006", "Maria", "Garcia", "maria@example.com", "5552223333", "2025-03-10"],
        ["C007", "Maria", "Garcia", "maria@example.com", "+1 (555) 222-3333", "2025-03-15"],
        # Same person, two emails, same phone
        ["C008", "David", "Lee", "david.lee@work.com", "(555) 444-1111", "2025-02-01"],
        ["C009", "David", "Lee", "dlee@personal.com", "555-444-1111", "2025-02-28"],
        # Trailing whitespace pollution in email
        ["C010", "Sarah", "Jones", "sarah@example.com   ", "(555) 666-7777", "2025-01-08"],
        ["C011", "Sarah", "Jones", "sarah@example.com", "(555) 666-7777", "2025-01-09"],
        # Genuine unique customers (control rows)
        ["C012", "Alice", "Wong", "alice.wong@example.com", "(555) 333-9999", "2025-03-22"],
        ["C013", "Robert", "Brown", "rob.brown@example.com", "(555) 888-2222", "2025-03-25"],
        # Same email different name spelling (data entry typo case)
        ["C014", "Catherine", "Doe", "cdoe@example.com", "(555) 111-0000", "2025-01-30"],
        ["C015", "Katherine", "Doe", "cdoe@example.com", "(555) 111-0000", "2025-04-02"],
    ],
)


# ---------------------------------------------------------------------------
# UC2: Product catalog dedup - SKU whitespace/case + near-identical names
# Tests: per-column normalization on SKU, fuzzy match on product_name
# ---------------------------------------------------------------------------
write_csv(
    "uc02_product_catalog.csv",
    ["sku", "product_name", "price", "stock", "category"],
    [
        # SKU trailing whitespace + case variants (should all collapse)
        ["DOG-001", "Dog Collar - Red - Large", "12.99", "45", "Collars"],
        ["DOG-001 ", "Dog Collar Red Large", "12.99", "12", "Collars"],
        ["dog-001", "Dog Collar, Red, L", "12.99", "8", "Collars"],
        ["DOG-001\t", "Red Dog Collar (Large)", "12.99", "3", "Collars"],
        # Genuine variant with size only (should NOT match the above)
        ["DOG-002", "Dog Collar - Red - Medium", "11.99", "20", "Collars"],
        ["DOG-002", "Dog Collar Red Medium", "11.99", "15", "Collars"],
        # Near-identical names, same SKU prefix, abbreviation differences
        ["CAT-100", "Cat Scratching Post 36 inch", "29.99", "10", "Furniture"],
        ["CAT-100", "Cat Scratching Post 36\"", "29.99", "5", "Furniture"],
        ["CAT-100", "Cat Scratch Post 36in", "29.99", "2", "Furniture"],
        # Different SKU, identical-looking name (should NOT match)
        ["CAT-101", "Cat Scratching Post 36 inch", "31.99", "8", "Furniture"],
        # Whitespace inside product name
        ["BIRD-50", "Parrot   Cage  Large", "89.00", "4", "Cages"],
        ["BIRD-50", "Parrot Cage Large", "89.00", "1", "Cages"],
        # Unique products (control)
        ["FISH-22", "Aquarium Filter 20gal", "45.50", "12", "Aquatics"],
        ["RABT-7", "Rabbit Hutch Outdoor", "199.99", "3", "Cages"],
        ["DOG-555", "Dog Bed Memory Foam XL", "75.00", "8", "Beds"],
    ],
)


# ---------------------------------------------------------------------------
# UC3: Abandoned cart cleanup
# Tests: dedupe by email across multiple cart events (don't email same person 3x)
# ---------------------------------------------------------------------------
write_csv(
    "uc03_abandoned_carts.csv",
    ["cart_id", "email", "abandoned_at", "cart_value", "items_count"],
    [
        # Same customer, 3 separate cart events in 5 days
        ["AC1001", "buyer1@example.com", "2026-04-10 14:23:00", "87.50", "3"],
        ["AC1018", "buyer1@example.com", "2026-04-12 09:11:00", "120.00", "4"],
        ["AC1042", "BUYER1@example.com", "2026-04-15 18:55:00", "65.25", "2"],
        # Same customer, 2 cart events same day
        ["AC1003", "buyer2@example.com", "2026-04-10 10:00:00", "45.00", "1"],
        ["AC1009", "buyer2@example.com", "2026-04-10 16:30:00", "52.00", "2"],
        # Gmail dot variants of same customer
        ["AC1011", "j.doe@gmail.com", "2026-04-11 12:00:00", "200.00", "5"],
        ["AC1027", "jdoe@gmail.com", "2026-04-13 14:00:00", "180.00", "4"],
        # Genuinely unique abandoners (control)
        ["AC1004", "alice@example.com", "2026-04-10 11:00:00", "30.00", "1"],
        ["AC1006", "bob@example.com", "2026-04-10 13:00:00", "75.00", "2"],
        ["AC1019", "carol@example.com", "2026-04-12 15:00:00", "90.00", "3"],
        ["AC1031", "dan@example.com", "2026-04-13 17:00:00", "55.00", "2"],
        # Same person, same cart value, same item count, different cart_id (likely a dupe submit)
        ["AC1050", "eve@example.com", "2026-04-16 08:00:00", "112.00", "3"],
        ["AC1051", "eve@example.com", "2026-04-16 08:01:00", "112.00", "3"],
        # Long gap, same email - debatable whether to dedupe (separate engagement window)
        ["AC1099", "frank@example.com", "2026-01-05 10:00:00", "40.00", "1"],
        ["AC1187", "frank@example.com", "2026-04-18 15:00:00", "60.00", "2"],
    ],
)


# ---------------------------------------------------------------------------
# UC4: Order export consolidation - Shopify + Amazon + manual
# Tests: same-customer-across-sources, source attribution preservation,
#        manual entry typo handling
# ---------------------------------------------------------------------------
write_csv(
    "uc04_orders_consolidated.csv",
    ["source_order_id", "source", "order_date", "customer_email", "customer_name", "total"],
    [
        # Same customer, three platforms - should be one customer record
        ["SHOP-1001", "shopify", "2026-04-01", "kara.miller@gmail.com", "Kara Miller", "45.99"],
        ["AMZ-A789X", "amazon", "2026-04-03", "kara.miller@gmail.com", "K Miller", "32.50"],
        ["MAN-2026-04", "manual", "2026-04-05", "kara@miller-design.com", "Kara Miller", "120.00"],
        # Manual entry typo - same customer
        ["SHOP-1023", "shopify", "2026-04-08", "tom.harris@example.com", "Tom Harris", "67.00"],
        ["MAN-2026-09", "manual", "2026-04-10", "tom.haris@example.com", "Tom Harris", "55.00"],
        # Amazon-style stripped email format
        ["SHOP-1045", "shopify", "2026-04-12", "jenny.l@example.com", "Jenny Lee", "89.00"],
        ["AMZ-B221Y", "amazon", "2026-04-15", "JENNY.L@EXAMPLE.COM", "Jenny L.", "44.50"],
        # Same order ID prefix collision (different customers - should NOT merge)
        ["SHOP-2001", "shopify", "2026-04-02", "alex@example.com", "Alex Park", "33.00"],
        ["SHOP-2002", "shopify", "2026-04-02", "amy@example.com", "Amy Park", "33.00"],
        # Genuine single-channel customers (control)
        ["SHOP-3001", "shopify", "2026-04-06", "single1@example.com", "Single Buyer One", "78.00"],
        ["AMZ-C100Z", "amazon", "2026-04-07", "single2@example.com", "Single Buyer Two", "92.00"],
        ["MAN-2026-12", "manual", "2026-04-09", "single3@example.com", "Single Buyer Three", "150.00"],
        # Email + name disagree - same email, very different name (likely typo or shared inbox)
        ["SHOP-4001", "shopify", "2026-04-11", "sales@bigcorp.com", "Mike Johnson", "200.00"],
        ["SHOP-4012", "shopify", "2026-04-14", "sales@bigcorp.com", "Sarah Wright", "175.00"],
    ],
)


# ---------------------------------------------------------------------------
# UC5: Subscriber list hygiene before Klaviyo / Mailchimp import
# Tests: per-contact pricing means every dupe costs money; this needs to be SHARP
# ---------------------------------------------------------------------------
write_csv(
    "uc05_subscriber_list.csv",
    ["email", "first_name", "source", "subscribed_date", "tags"],
    [
        # Same person across 4 lead sources
        ["sub1@example.com", "Pat", "newsletter_signup", "2025-08-15", "newsletter"],
        ["SUB1@example.com", "Pat", "facebook_lead_form", "2025-09-01", "fb_q3_2025"],
        ["sub1@EXAMPLE.com", "Patricia", "checkout_optin", "2025-10-12", "customer"],
        ["sub1@example.com ", "Pat W", "popup_form", "2026-01-08", "popup_homepage"],
        # Gmail dot variants - all one person (Mailchimp/Klaviyo will charge for each)
        ["mike.smith@gmail.com", "Mike", "import_2024", "2024-11-20", "legacy"],
        ["mikesmith@gmail.com", "Mike S", "newsletter_signup", "2025-03-15", "newsletter"],
        ["m.i.k.e.s.m.i.t.h@gmail.com", "Michael", "facebook_lead_form", "2025-07-22", "fb_q3_2025"],
        # +tag variants
        ["promos+freebie@gmail.com", "Sam", "freebie_download", "2025-12-01", "lead_magnet"],
        ["promos@gmail.com", "Sam", "newsletter_signup", "2026-02-15", "newsletter"],
        # Truly unique subscribers (control)
        ["unique1@example.com", "Alpha", "newsletter_signup", "2025-09-10", "newsletter"],
        ["unique2@example.com", "Beta", "popup_form", "2025-10-05", "popup_homepage"],
        ["unique3@example.com", "Gamma", "facebook_lead_form", "2025-11-12", "fb_q4_2025"],
        ["unique4@example.com", "Delta", "checkout_optin", "2026-01-20", "customer"],
        # Domain-only variation (different person, same name - control)
        ["jdoe@company-a.com", "Jane Doe", "newsletter_signup", "2025-08-01", "newsletter"],
        ["jdoe@company-b.com", "Jane Doe", "newsletter_signup", "2025-08-02", "newsletter"],
    ],
)


# ---------------------------------------------------------------------------
# UC6: Bank export reconciliation - overlapping date ranges
# Tests: same transaction in two exports, slight description variations,
#        date+amount+description matching
# ---------------------------------------------------------------------------
write_csv(
    "uc06_bank_export_overlap.csv",
    ["txn_date", "description", "amount", "balance_after", "export_batch"],
    [
        # Export A: covers Apr 1-15. Export B: covers Apr 10-25. Overlap = Apr 10-15.
        # Transactions in overlap should appear twice in the combined file - dedup them.
        ["2026-04-10", "ACH DEPOSIT PAYROLL", "2500.00", "5230.45", "exportA"],
        ["2026-04-10", "ACH Dep Payroll", "2500.00", "5230.45", "exportB"],
        ["2026-04-11", "POS PURCHASE STARBUCKS #4421", "-6.75", "5223.70", "exportA"],
        ["2026-04-11", "POS Purchase Starbucks 4421", "-6.75", "5223.70", "exportB"],
        ["2026-04-12", "CHECK #1042", "-450.00", "4773.70", "exportA"],
        ["2026-04-12", "CHECK 1042", "-450.00", "4773.70", "exportB"],
        ["2026-04-13", "ATM WITHDRAWAL ATM0019", "-100.00", "4673.70", "exportA"],
        ["2026-04-13", "ATM Withdrawal ATM0019", "-100.00", "4673.70", "exportB"],
        ["2026-04-15", "ONLINE TRANSFER TO SAVINGS", "-200.00", "4473.70", "exportA"],
        ["2026-04-15", "Online Transfer to Savings", "-200.00", "4473.70", "exportB"],
        # Non-overlap transactions, only in one export each
        ["2026-04-02", "ACH DEBIT UTILITY CO", "-145.00", "2730.45", "exportA"],
        ["2026-04-05", "POS PURCHASE GROCERY MART", "-87.32", "2643.13", "exportA"],
        ["2026-04-08", "DEPOSIT MOBILE CHECK", "200.00", "2843.13", "exportA"],
        ["2026-04-20", "ACH DEBIT INSURANCE CO", "-220.00", "4253.70", "exportB"],
        ["2026-04-22", "POS PURCHASE GAS STATION", "-45.10", "4208.60", "exportB"],
        ["2026-04-25", "INTEREST PAYMENT", "0.85", "4209.45", "exportB"],
        # Same date, same amount, different description (NOT a dupe - two coffees)
        ["2026-04-11", "POS PURCHASE STARBUCKS #4421", "-6.75", "5216.95", "exportA"],
        ["2026-04-11", "POS PURCHASE PEET'S #1102", "-6.75", "5210.20", "exportA"],
    ],
)


# ---------------------------------------------------------------------------
# UC7: Vendor list consolidation across QuickBooks + spreadsheet + email
# Tests: company-name fuzzy match (Inc/Corp/LLC/Ltd variants), EIN as exact key
# ---------------------------------------------------------------------------
write_csv(
    "uc07_vendor_consolidation.csv",
    ["vendor_name", "ein", "contact_email", "phone", "source"],
    [
        # Same vendor, three name renderings, same EIN
        ["Acme Corp", "12-3456789", "billing@acme.com", "(555) 100-2000", "quickbooks"],
        ["Acme Corporation", "12-3456789", "ap@acme.com", "555-100-2000", "spreadsheet"],
        ["ACME CORP.", "12-3456789", "billing@acme.com", "5551002000", "email_extract"],
        # Vendor with no EIN populated in some sources (common real-world case)
        ["Beta Solutions LLC", "98-7654321", "info@betasolutions.com", "(555) 200-3000", "quickbooks"],
        ["Beta Solutions, LLC", "", "billing@betasolutions.com", "555.200.3000", "spreadsheet"],
        ["beta solutions", "98-7654321", "", "(555) 200-3000", "email_extract"],
        # Two different vendors with VERY similar names (should NOT merge)
        ["Smith Consulting Inc", "11-1111111", "ap@smithconsulting.com", "(555) 300-4000", "quickbooks"],
        ["Smith Consulting LLC", "22-2222222", "ap@smith-consulting.com", "(555) 300-4001", "quickbooks"],
        # DBA / parent company case
        ["Gamma Industries Inc", "33-3333333", "ap@gamma.com", "(555) 400-5000", "quickbooks"],
        ["Gamma Industries Inc DBA QuickPrint", "33-3333333", "billing@quickprint.com", "(555) 400-5001", "spreadsheet"],
        # Vendor name with trailing punctuation and whitespace
        ["Delta Services, Inc.   ", "44-4444444", "ap@delta.com", "(555) 500-6000", "email_extract"],
        ["Delta Services Inc", "44-4444444", "ap@delta.com", "(555) 500-6000", "quickbooks"],
        # Single-source unique vendors (control)
        ["Unique Vendor One", "55-5555555", "u1@example.com", "(555) 600-7000", "quickbooks"],
        ["Unique Vendor Two", "66-6666666", "u2@example.com", "(555) 700-8000", "spreadsheet"],
        ["Unique Vendor Three", "77-7777777", "u3@example.com", "(555) 800-9000", "email_extract"],
    ],
)


# ---------------------------------------------------------------------------
# UC8: Customer master record cleanup - MERGE MODE test
# Tests: partial records that should be combined into one complete record.
#        This is the critical merge-mode (Tier 1 spec item 12) test case.
# ---------------------------------------------------------------------------
write_csv(
    "uc08_customer_master_merge.csv",
    ["customer_id", "name", "email", "phone", "address", "city", "state", "zip", "last_purchase"],
    [
        # Customer A: 3 partial records, each has different fields populated.
        # Merge should produce ONE complete record.
        ["CUST-A1", "Linda Park", "linda.park@example.com", "", "", "", "", "", "2025-03-15"],
        ["CUST-A2", "Linda Park", "", "(555) 123-9999", "1234 Oak St", "Portland", "OR", "97201", ""],
        ["CUST-A3", "L. Park", "linda.park@example.com", "(555) 123-9999", "1234 Oak Street", "", "", "97201", "2026-01-10"],
        # Customer B: 2 partial records
        ["CUST-B1", "James Wilson", "jwilson@example.com", "(555) 444-8888", "", "Austin", "TX", "", "2025-11-22"],
        ["CUST-B2", "James Wilson", "jwilson@example.com", "", "789 Pine Ave", "Austin", "TX", "78701", ""],
        # Customer C: 4 partial records, conflicting last_purchase dates
        # (most-recent rule should pick the latest)
        ["CUST-C1", "Anna Chen", "anna.chen@example.com", "", "", "", "", "", "2024-08-01"],
        ["CUST-C2", "Anna Chen", "anna.chen@example.com", "(555) 222-7777", "", "", "", "", "2025-02-14"],
        ["CUST-C3", "Anna Chen", "", "(555) 222-7777", "456 Elm Dr", "Seattle", "WA", "", "2025-09-30"],
        ["CUST-C4", "A Chen", "anna.chen@example.com", "(555) 222-7777", "456 Elm Drive", "Seattle", "WA", "98101", "2026-03-20"],
        # Customer D: only ONE record, fully populated (no merge needed - control)
        ["CUST-D1", "Ricardo Lopez", "rlopez@example.com", "(555) 666-1212", "999 Maple Ln", "Denver", "CO", "80202", "2026-02-05"],
        # Customer E: two records that look similar but are different people (control)
        ["CUST-E1", "John A Brown", "jabrown@example.com", "(555) 111-3333", "111 First St", "Boston", "MA", "02101", "2026-01-05"],
        ["CUST-E2", "John B Brown", "jbbrown@example.com", "(555) 111-4444", "222 Second St", "Boston", "MA", "02102", "2026-01-06"],
        # Customer F: empty email AND empty phone in some records (under-keyed - hard case)
        ["CUST-F1", "Maria Costa", "", "", "333 Bay Rd", "Miami", "FL", "33101", ""],
        ["CUST-F2", "Maria Costa", "mcosta@example.com", "(555) 777-2222", "333 Bay Rd", "Miami", "FL", "33101", "2026-04-01"],
    ],
)


# ---------------------------------------------------------------------------
# UC9: Expense report dedup - same receipt submitted twice
# Tests: date+vendor+amount triplet match, near-duplicate description handling
# ---------------------------------------------------------------------------
write_csv(
    "uc09_expense_reports.csv",
    ["expense_id", "employee", "expense_date", "vendor", "amount", "description", "submitted_at"],
    [
        # Same employee submitted same receipt twice, slight description variation
        ["EXP-001", "Tom R", "2026-03-15", "Marriott", "234.56", "Hotel client visit", "2026-03-18 09:00"],
        ["EXP-019", "Tom R", "2026-03-15", "Marriott Hotels", "234.56", "Hotel - client mtg", "2026-04-02 14:30"],
        # Two employees attended same dinner and each submitted - NOT a dedup case
        ["EXP-005", "Tom R", "2026-03-16", "Steakhouse", "187.45", "Client dinner with team", "2026-03-19 10:15"],
        ["EXP-006", "Sara K", "2026-03-16", "Steakhouse", "187.45", "Client dinner", "2026-03-19 11:00"],
        # Same employee, same vendor, same date, but two legitimate purchases (lunch + dinner)
        ["EXP-010", "Mike P", "2026-03-20", "Local Cafe", "12.50", "Lunch with vendor", "2026-03-21 09:00"],
        ["EXP-011", "Mike P", "2026-03-20", "Local Cafe", "8.75", "Coffee meeting", "2026-03-21 09:01"],
        # Triple-submission case (employee resubmitted thinking it failed)
        ["EXP-022", "Anna L", "2026-03-25", "Uber", "45.20", "Airport transfer", "2026-03-26 08:00"],
        ["EXP-024", "Anna L", "2026-03-25", "Uber", "45.20", "Airport transfer to hotel", "2026-03-26 08:15"],
        ["EXP-027", "Anna L", "2026-03-25", "UBER", "45.20", "airport->hotel", "2026-03-26 09:30"],
        # Vendor name variation (legitimate single submission)
        ["EXP-033", "Ben T", "2026-03-28", "Office Depot", "89.99", "Printer ink", "2026-03-30 13:00"],
        # Same amount, different vendor, same employee (NOT a dupe)
        ["EXP-040", "Tom R", "2026-04-01", "Hyatt", "234.56", "Hotel different city", "2026-04-03 10:00"],
        # Control rows
        ["EXP-050", "Sara K", "2026-04-05", "American Airlines", "412.00", "Flight to Chicago", "2026-04-06 09:00"],
        ["EXP-051", "Mike P", "2026-04-05", "Hertz", "189.50", "Rental car", "2026-04-06 10:00"],
        ["EXP-052", "Anna L", "2026-04-06", "Subway", "11.25", "Lunch", "2026-04-07 12:00"],
    ],
)


# ---------------------------------------------------------------------------
# UC10: Pre-analysis cleanup of client data dump - the kitchen sink
# Tests: combination of every dirty-data pattern in a single file
# ---------------------------------------------------------------------------
write_csv(
    "uc10_client_data_dump_messy.csv",
    ["id", "full_name", "email", "phone", "company", "notes"],
    [
        # Trailing whitespace on every field
        ["1", "Alice Johnson  ", " alice@ex.com ", "(555) 100-1000", "Ex Corp ", "VIP client"],
        ["2", "alice johnson", "alice@ex.com", "5551001000", "Ex Corp", ""],
        # Smart quote / unicode pollution
        ["3", "Bob O\u2019Brien", "bob.obrien@example.com", "(555) 200-2000", "OBrien LLC", "intro by Alice"],
        ["4", "Bob O'Brien", "BOB.OBRIEN@example.com", "555-200-2000", "O'Brien LLC", "Intro by Alice"],
        # Full-width / Unicode duplicate
        ["5", "Carol Wu", "carol@example.com", "(555) 300-3000", "WuCo", "follow up"],
        ["6", "Carol Wu", "carol\uff20example.com", "(555) 300-3000", "WuCo", "follow up Q2"],
        # Email + phone disagree (same email, different phone)
        ["7", "Derek Hall", "dhall@example.com", "(555) 400-4000", "Hall Industries", ""],
        ["8", "Derek Hall", "dhall@example.com", "(555) 400-4099", "Hall Inds.", ""],
        # Pure dupes
        ["9", "Eve Martin", "eve@example.com", "(555) 500-5000", "Martin Co", "lead"],
        ["10", "Eve Martin", "eve@example.com", "(555) 500-5000", "Martin Co", "lead"],
        # Genuinely unique (control)
        ["11", "Frank Ng", "fng@example.com", "(555) 600-6000", "Ng Group", ""],
        ["12", "Grace Park", "gpark@example.com", "(555) 700-7000", "Park & Co", ""],
        # Company name fuzzy variation, same person
        ["13", "Henry Lo", "hlo@example.com", "(555) 800-8000", "Lo Holdings Inc.", "renewal"],
        ["14", "Henry Lo", "hlo@example.com", "(555) 800-8000", "Lo Holdings, Inc", "renewal Q2"],
        # Embedded tab in name (data extraction artifact)
        ["15", "Ivan\tKim", "ikim@example.com", "(555) 900-9000", "Kim Solutions", ""],
        ["16", "Ivan Kim", "ikim@example.com", "(555) 900-9000", "Kim Solutions", ""],
    ],
)


# ---------------------------------------------------------------------------
# UC11: Survey response dedup - same respondent, multiple devices
# Tests: same email + similar timestamps = duplicate submission
# ---------------------------------------------------------------------------
write_csv(
    "uc11_survey_responses.csv",
    ["response_id", "submitted_at", "ip_address", "respondent_email", "q1_satisfaction", "q2_recommend", "q3_comments"],
    [
        # Same person submitted from phone then laptop within 3 minutes
        ["R0001", "2026-04-20 09:15:22", "192.168.1.50", "user1@example.com", "5", "Yes", "Great service"],
        ["R0002", "2026-04-20 09:17:45", "10.0.0.12", "user1@example.com", "5", "Yes", "Great service"],
        # Same person, slight answer variation (changed mind on second submit)
        ["R0010", "2026-04-20 10:30:00", "192.168.1.51", "user2@example.com", "4", "Yes", "Good but could improve"],
        ["R0011", "2026-04-20 10:35:12", "192.168.1.52", "user2@example.com", "5", "Yes", "Good but could improve"],
        # Email case variant
        ["R0020", "2026-04-20 11:00:00", "192.168.1.55", "User3@Example.com", "3", "Maybe", "OK"],
        ["R0021", "2026-04-20 11:02:30", "192.168.1.55", "user3@example.com", "3", "Maybe", "OK"],
        # Same household, two different people same WiFi (NOT a dupe)
        ["R0030", "2026-04-20 14:00:00", "73.55.10.100", "spouse1@example.com", "5", "Yes", "Loved it"],
        ["R0031", "2026-04-20 14:30:00", "73.55.10.100", "spouse2@example.com", "4", "Yes", "Liked it"],
        # Same email, but submitted 2 weeks apart (probably legitimate re-survey)
        ["R0040", "2026-04-05 08:00:00", "192.168.2.10", "user4@example.com", "3", "Maybe", "Mixed"],
        ["R0041", "2026-04-19 15:00:00", "192.168.2.11", "user4@example.com", "5", "Yes", "Got better"],
        # Truly unique respondents (control)
        ["R0050", "2026-04-20 08:00:00", "100.10.10.10", "unique1@example.com", "5", "Yes", "Excellent"],
        ["R0051", "2026-04-20 09:00:00", "100.10.10.11", "unique2@example.com", "4", "Yes", "Solid"],
        ["R0052", "2026-04-20 10:00:00", "100.10.10.12", "unique3@example.com", "2", "No", "Disappointing"],
        ["R0053", "2026-04-20 11:00:00", "100.10.10.13", "unique4@example.com", "5", "Yes", "Recommended"],
    ],
)


# ---------------------------------------------------------------------------
# UC12: Lead list cleanup before client handoff
# Tests: name variations, email variations, lead-source preservation
# ---------------------------------------------------------------------------
write_csv(
    "uc12_lead_list_handoff.csv",
    ["lead_id", "first_name", "last_name", "email", "phone", "company", "lead_source", "captured_date"],
    [
        # Same lead from 3 different sources
        ["L001", "Patricia", "Anders", "p.anders@bigco.com", "(555) 100-2222", "Big Co", "trade_show", "2026-02-15"],
        ["L045", "Pat", "Anders", "p.anders@bigco.com", "5551002222", "Big Co Inc", "linkedin", "2026-03-01"],
        ["L091", "Patty", "Anders", "panders@bigco.com", "(555) 100-2222", "BigCo", "webinar", "2026-03-22"],
        # Same lead, different name spellings
        ["L010", "Mohammed", "Khan", "m.khan@example.com", "(555) 200-3333", "Khan Industries", "trade_show", "2026-02-15"],
        ["L032", "Muhammad", "Khan", "mkhan@example.com", "(555) 200-3333", "Khan Industries", "cold_outreach", "2026-02-28"],
        ["L067", "Mohammad", "Khan", "m.khan@example.com", "(555) 200-3333", "Khan Industries Inc", "referral", "2026-03-10"],
        # Lead with role/title in name field (junk that should be cleaned)
        ["L020", "Sarah", "Lee, VP Marketing", "slee@target.com", "(555) 300-4444", "Target Corp", "linkedin", "2026-02-20"],
        ["L058", "Sarah", "Lee", "slee@target.com", "(555) 300-4444", "Target Corp", "webinar", "2026-03-05"],
        # Genuine separate leads at the same company (NOT dupes)
        ["L080", "John", "Adams", "jadams@enterprise.com", "(555) 400-5555", "Enterprise Inc", "trade_show", "2026-03-15"],
        ["L081", "John", "Adams II", "jadams2@enterprise.com", "(555) 400-5556", "Enterprise Inc", "trade_show", "2026-03-15"],
        # Unique leads (control)
        ["L100", "Wei", "Chen", "wchen@startup.io", "(555) 500-6666", "Startup IO", "referral", "2026-03-20"],
        ["L101", "Olivia", "Mendez", "omendez@agency.com", "(555) 600-7777", "Agency Co", "linkedin", "2026-03-21"],
        ["L102", "Raj", "Patel", "rpatel@firm.com", "(555) 700-8888", "Firm LLC", "cold_outreach", "2026-03-22"],
        ["L103", "Emma", "Wright", "ewright@consulting.com", "(555) 800-9999", "Wright Consulting", "webinar", "2026-03-23"],
    ],
)


# ---------------------------------------------------------------------------
# UC13: Email list dedup across multiple lead-source files
# Tests: pure email-based dedup with source attribution preservation
# (Survivor selection should preserve EARLIEST capture date and ALL sources)
# ---------------------------------------------------------------------------
write_csv(
    "uc13_combined_lead_sources.csv",
    ["email", "captured_date", "source_channel", "campaign", "consent"],
    [
        # Same email captured by 4 different campaigns
        ["lead1@example.com", "2025-09-15", "facebook_ad", "back_to_school_2025", "yes"],
        ["lead1@example.com", "2025-11-20", "google_ad", "black_friday_2025", "yes"],
        ["lead1@example.com", "2026-01-08", "organic_form", "homepage_popup", "yes"],
        ["lead1@example.com", "2026-03-12", "facebook_ad", "spring_2026", "yes"],
        # Email with consent disagreement across sources (one says no - keep the no)
        ["lead2@example.com", "2025-10-01", "facebook_ad", "halloween_2025", "yes"],
        ["lead2@example.com", "2025-12-15", "google_ad", "holiday_2025", "no"],
        ["lead2@example.com", "2026-02-01", "organic_form", "valentines_2026", "yes"],
        # Gmail dot variants from different campaigns - same person, charged 3x by ESP
        ["lead.three@gmail.com", "2025-08-01", "facebook_ad", "summer_2025", "yes"],
        ["leadthree@gmail.com", "2025-10-15", "google_ad", "fall_2025", "yes"],
        ["l.e.a.d.three@gmail.com", "2026-02-20", "organic_form", "winter_2026", "yes"],
        # Controls: each captured once
        ["solo1@example.com", "2025-09-10", "facebook_ad", "back_to_school_2025", "yes"],
        ["solo2@example.com", "2025-11-05", "google_ad", "black_friday_2025", "yes"],
        ["solo3@example.com", "2026-01-20", "organic_form", "homepage_popup", "yes"],
        ["solo4@example.com", "2026-03-08", "facebook_ad", "spring_2026", "yes"],
        ["solo5@example.com", "2026-04-15", "referral", "ambassador_program", "yes"],
    ],
)


# ---------------------------------------------------------------------------
# UC14: Audience reconciliation across FB + Google + organic
# Tests: cross-platform IDs (no shared key in some rows - email is the bridge)
# ---------------------------------------------------------------------------
write_csv(
    "uc14_audience_cross_platform.csv",
    ["email", "fb_id", "google_click_id", "platform", "first_seen", "last_seen"],
    [
        # Same person across all 3 platforms
        ["audience1@example.com", "fb_77881122", "", "facebook", "2026-01-05", "2026-04-20"],
        ["audience1@example.com", "", "gclid_AAA111", "google_ads", "2026-02-10", "2026-04-18"],
        ["audience1@example.com", "", "", "organic_form", "2026-03-15", "2026-03-15"],
        # Same person, FB + Google only
        ["audience2@example.com", "fb_99887766", "", "facebook", "2026-02-01", "2026-04-15"],
        ["audience2@example.com", "", "gclid_BBB222", "google_ads", "2026-02-20", "2026-04-22"],
        # Same FB ID across two emails (likely person changed primary email)
        ["old.email@example.com", "fb_55443322", "", "facebook", "2025-06-10", "2025-12-01"],
        ["new.email@example.com", "fb_55443322", "", "facebook", "2026-01-15", "2026-04-20"],
        # FB record with no email (anonymous pixel hit)
        ["", "fb_11223344", "", "facebook", "2026-03-01", "2026-04-01"],
        # Google record with no email
        ["", "", "gclid_CCC333", "google_ads", "2026-03-10", "2026-04-10"],
        # Single-platform identified users (control)
        ["fbonly@example.com", "fb_44556677", "", "facebook", "2026-03-20", "2026-04-20"],
        ["googleonly@example.com", "", "gclid_DDD444", "google_ads", "2026-03-25", "2026-04-22"],
        ["organiconly@example.com", "", "", "organic_form", "2026-04-01", "2026-04-01"],
        # Edge case: same email, FB ID and Google ID both populated in one row
        ["combined@example.com", "fb_88990011", "gclid_EEE555", "manual_merge", "2026-04-10", "2026-04-25"],
        # Phone-only record (no email, no platform IDs - common for SMS list)
        ["", "", "", "sms_list", "2026-04-05", "2026-04-05"],
    ],
)


# ---------------------------------------------------------------------------
# UC15: Suppression list combine - unsubscribes / bounces / complaints
# Tests: combine multiple suppression sources, preserve strongest reason
# (any reason to suppress => suppress; never re-mail)
# ---------------------------------------------------------------------------
write_csv(
    "uc15_suppression_combined.csv",
    ["email", "suppression_reason", "suppressed_date", "source_list"],
    [
        # Same email suppressed for 3 different reasons (any one is enough)
        ["supp1@example.com", "unsubscribe", "2025-08-15", "list_main"],
        ["supp1@example.com", "hard_bounce", "2025-09-20", "list_promo"],
        ["supp1@example.com", "spam_complaint", "2025-10-01", "list_main"],
        # Same email, same reason, multiple sources (true dedup case)
        ["supp2@example.com", "unsubscribe", "2026-01-10", "list_main"],
        ["supp2@example.com", "unsubscribe", "2026-01-10", "list_promo"],
        ["supp2@example.com", "unsubscribe", "2026-01-10", "list_newsletter"],
        # Email case variant - should still suppress
        ["Supp3@Example.com", "unsubscribe", "2025-12-05", "list_main"],
        ["supp3@example.com", "hard_bounce", "2026-02-15", "list_promo"],
        # Gmail dot variants - all the same person; ALL should suppress
        ["bounced@gmail.com", "hard_bounce", "2026-03-01", "list_main"],
        ["b.o.u.n.c.e.d@gmail.com", "hard_bounce", "2026-03-01", "list_promo"],
        # Spam complaints from one person (highest severity)
        ["complainer@example.com", "spam_complaint", "2026-02-20", "list_main"],
        # Soft bounces (different from hard - may not justify suppression depending on policy)
        ["soft@example.com", "soft_bounce_x3", "2026-03-15", "list_main"],
        # Controls: single-record suppressions
        ["solo_unsub@example.com", "unsubscribe", "2026-04-01", "list_main"],
        ["solo_bounce@example.com", "hard_bounce", "2026-04-05", "list_promo"],
        ["solo_complaint@example.com", "spam_complaint", "2026-04-10", "list_main"],
        # Edge: trailing whitespace
        ["padded@example.com   ", "unsubscribe", "2026-04-12", "list_main"],
        ["padded@example.com", "unsubscribe", "2026-04-12", "list_promo"],
    ],
)


# ===========================================================================
# EDGE CASE FILES - test Tier 1 spec items not naturally covered above
# ===========================================================================

# ---------------------------------------------------------------------------
# EC1: Encoding test - Windows-1252 (Latin-1) encoded file
# Tests: spec item 1 (auto-detect file encoding). Same data as UC1 but with
#        non-ASCII characters in cp1252 encoding. Many real exports from
#        Windows accounting tools come this way.
# ---------------------------------------------------------------------------
write_csv(
    "ec01_encoding_windows1252.csv",
    ["customer_id", "name", "email", "city"],
    [
        ["E001", "Bj\xf6rn Andersson", "bjorn@example.com", "Malm\xf6"],
        ["E002", "Bjorn Andersson", "bjorn@example.com", "Malmo"],
        ["E003", "Fran\xe7ois Dupont", "f.dupont@example.com", "Montr\xe9al"],
        ["E004", "Francois Dupont", "f.dupont@example.com", "Montreal"],
        ["E005", "Jos\xe9 Garc\xeda", "jgarcia@example.com", "M\xe9xico"],
        ["E006", "Jose Garcia", "jgarcia@example.com", "Mexico"],
        ["E007", "M\xfcller GmbH", "kontakt@muller.de", "K\xf6ln"],
        ["E008", "Muller GmbH", "kontakt@muller.de", "Koln"],
        ["E009", "Unique Person", "unique@example.com", "London"],
        ["E010", "Another Unique", "another@example.com", "Paris"],
    ],
    encoding="cp1252",
)


# ---------------------------------------------------------------------------
# EC2: Delimiter test - semicolon-separated (common in European exports)
# Tests: spec item 2 (auto-detect delimiter)
# ---------------------------------------------------------------------------
write_csv(
    "ec02_delimiter_semicolon.csv",
    ["id", "name", "email", "amount"],
    [
        ["1", "Test User", "test1@example.com", "1.234,56"],
        ["2", "Test User", "TEST1@example.com", "1.234,56"],
        ["3", "Other User", "test2@example.com", "987,00"],
        ["4", "Other User", "test2@example.com", "987,00"],
        ["5", "Unique", "unique@example.com", "500,00"],
    ],
    delimiter=";",
)


# ---------------------------------------------------------------------------
# EC3: Tab-separated edge case
# Tests: spec item 2 (auto-detect delimiter for TSV)
# ---------------------------------------------------------------------------
write_csv(
    "ec03_delimiter_tab.tsv",
    ["sku", "product", "price"],
    [
        ["A-100", "Widget", "9.99"],
        ["A-100", "Widget Standard", "9.99"],
        ["A-101", "Widget Plus", "12.99"],
        ["A-100 ", "WIDGET", "9.99"],
        ["B-200", "Gadget", "15.00"],
    ],
    delimiter="\t",
)


# ---------------------------------------------------------------------------
# EC4: UTF-8 BOM file (Excel-on-Windows default for "Save as CSV UTF-8")
# Tests: spec item 1 - UTF-8 BOM detection. Critical because csv libraries
# read the BOM as part of the first column header otherwise.
# ---------------------------------------------------------------------------
def write_csv_with_bom(name, header, rows):
    path = OUT / name
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        w = csv.writer(f)
        w.writerow(header)
        w.writerows(rows)
    print(f"wrote {name} ({len(rows)} rows, UTF-8 BOM)")


write_csv_with_bom(
    "ec04_utf8_bom.csv",
    ["customer_id", "name", "email"],
    [
        ["B001", "Anders\xe9n", "andersen@example.com"],
        ["B002", "Andersen", "andersen@example.com"],
        ["B003", "Smith", "smith@example.com"],
        ["B004", "SMITH", "Smith@Example.com"],
        ["B005", "Lee", "lee@example.com"],
    ],
)


print("\nAll test files written.")
print("Files created:")
for f in sorted(OUT.glob("*.csv")) + sorted(OUT.glob("*.tsv")):
    size = f.stat().st_size
    print(f"  {f.name}  ({size} bytes)")
