"""
Generator for quote-variant and malformed-CSV fixtures.

Produces:
  test_data/22_quote_variants.csv   - well-formed 5-column CSV; payload cells
                                      contain every common Unicode quote
                                      variant. Tests cleaner's character
                                      transformation logic.
  test_data/23_csv_malformed.csv    - intentionally malformed structure.
                                      2-column nominal header. Each row
                                      demonstrates one structural failure
                                      mode. Tests parser robustness, not
                                      transformation logic.

Both written as raw bytes (no csv.writer) so we control exact contents.
"""
from pathlib import Path

# Fixture output directory lives next to this script; create it if missing.
ROOT = Path(__file__).parent
TD = ROOT.joinpath("test_data")
TD.mkdir(exist_ok=True)


# ============================================================================
# 22_quote_variants.csv
# ============================================================================
# 5-column well-formed CSV. Every cell that contains an embedded ASCII double
# quote is properly escaped by doubling. Every payload is the actual character
# variant we want the cleaner to act on.
#
# Layout:
#   case_id | category | char_name | codepoint | payload
#
# Every row parses cleanly. The "test" is: read this file, run the cleaner,
# observe what each character became in the output. There is no single
# "expected" output because policy choices vary (some operators want curly
# folded to ASCII, some want it preserved, some want guillemets translated,
# others not). The corpus catalogs the inputs; QUOTE-CASES.md catalogs the
# expected outputs under the cleaner's documented default policy.

def csv_escape(s: str) -> str:
    """RFC-4180 field quoting: double every embedded quote, then wrap the value in quotes."""
    doubled = s.replace('"', '""')
    return f'"{doubled}"'

def make_row(case_id, category, char_name, codepoint, payload):
    """Render one 5-column fixture row as a single fully-quoted CSV line (newline-terminated)."""
    cells = (case_id, category, char_name, codepoint, payload)
    escaped = [csv_escape(cell) for cell in cells]
    return ",".join(escaped) + "\n"

quote_rows = [
    # ---- Section A: ASCII baseline (control rows; these should pass through unchanged) ----
    ("Q01", "ascii",      "QUOTATION MARK (straight double)",     "U+0022",
        'plain "double" quoted text'),
    ("Q02", "ascii",      "APOSTROPHE (straight single)",         "U+0027",
        "plain 'single' quoted text"),

    # ---- Section B: Curly / smart quotes (Word and Outlook autocorrect produce these) ----
    ("Q03", "curly",      "LEFT DOUBLE QUOTATION MARK alone",     "U+201C",
        "opens but no close: \u201Chello"),
    ("Q04", "curly",      "RIGHT DOUBLE QUOTATION MARK alone",    "U+201D",
        "closes with no open: hello\u201D"),
    ("Q05", "curly",      "Word-style quoted phrase",             "U+201C U+201D",
        "Word said \u201Chello world\u201D today"),
    ("Q06", "curly",      "LEFT SINGLE QUOTATION MARK",           "U+2018",
        "left curly single: \u2018marked"),
    ("Q07", "curly",      "RIGHT SINGLE / smart apostrophe",      "U+2019",
        "smart apostrophe: don\u2019t and won\u2019t"),
    ("Q08", "curly",      "Curly single-quoted phrase",           "U+2018 U+2019",
        "she said \u2018ok\u2019 quietly"),

    # ---- Section C: Low and reversed quotes (German, Czech, Polish typography) ----
    ("Q09", "low",        "DOUBLE LOW-9 (German opening)",        "U+201E",
        "German style: \u201Ehallo"),
    ("Q10", "low",        "SINGLE LOW-9",                         "U+201A",
        "single low: \u201Atag"),
    ("Q11", "low",        "DOUBLE HIGH-REVERSED-9",               "U+201F",
        "high reversed double: \u201Fphrase"),
    ("Q12", "low",        "SINGLE HIGH-REVERSED-9",               "U+201B",
        "high reversed single: \u201Bword"),
    ("Q13", "low",        "German-paired phrase (low-9 + curly)", "U+201E U+201C",
        "\u201Ehallo welt\u201C"),

    # ---- Section D: Guillemets (French quotes, also used as quotation in some locales) ----
    ("Q14", "guillemet",  "LEFT DOUBLE ANGLE QUOTATION",          "U+00AB",
        "french open: \u00ABbonjour"),
    ("Q15", "guillemet",  "RIGHT DOUBLE ANGLE QUOTATION",         "U+00BB",
        "french close: bonjour\u00BB"),
    ("Q16", "guillemet",  "Guillemet-paired phrase",              "U+00AB U+00BB",
        "\u00ABbonjour le monde\u00BB"),
    ("Q17", "guillemet",  "SINGLE LEFT ANGLE",                    "U+2039",
        "single open: \u2039x"),
    ("Q18", "guillemet",  "SINGLE RIGHT ANGLE",                   "U+203A",
        "single close: x\u203A"),

    # ---- Section E: CJK / fullwidth ----
    ("Q19", "fullwidth",  "FULLWIDTH QUOTATION MARK",             "U+FF02",
        "fullwidth dquote: \uFF02value\uFF02"),
    ("Q20", "fullwidth",  "FULLWIDTH APOSTROPHE",                 "U+FF07",
        "fullwidth apos: \uFF07value\uFF07"),
    ("Q21", "cjk",        "CJK CORNER BRACKETS",                  "U+300C U+300D",
        "japanese: \u300C\u3053\u3093\u306B\u3061\u306F\u300D"),

    # ---- Section F: Primes (constantly mistaken for quotes by users) ----
    ("Q22", "prime",      "PRIME (foot mark or minutes)",         "U+2032",
        "5\u2032 (5 feet) or 30\u2032 latitude"),
    ("Q23", "prime",      "DOUBLE PRIME (inch mark or seconds)",  "U+2033",
        "5\u2033 (5 inches) or 11\u2033 latitude"),
    ("Q24", "prime",      "Compound foot+inch measurement",       "U+2032 U+2033",
        "she is 5\u2032 11\u2033 tall"),

    # ---- Section G: Heavy / decorative ----
    ("Q25", "heavy",      "HEAVY SINGLE TURNED COMMA QUOTE",      "U+275B",
        "\u275Bheavy turned"),
    ("Q26", "heavy",      "HEAVY SINGLE COMMA QUOTE",             "U+275C",
        "heavy comma\u275C"),
    ("Q27", "heavy",      "HEAVY DOUBLE TURNED COMMA QUOTE",      "U+275D",
        "\u275Dheavy double turned\u275E"),

    # ---- Section H: Modifier letters (look like quotes, semantically letters) ----
    # NOTE: the payload character is U+02BB MODIFIER LETTER TURNED COMMA (the
    # Hawaiian okina). The row previously declared U+02BC MODIFIER LETTER
    # APOSTROPHE, a different codepoint; label and payload now agree.
    ("Q28", "modifier",   "MODIFIER LETTER TURNED COMMA (okina)", "U+02BB",
        "Hawai\u02BBi (apostrophe is the okina, a real letter)"),
    ("Q29", "modifier",   "MODIFIER LETTER PRIME",                "U+02B9",
        "math notation a\u02B9"),
    ("Q30", "modifier",   "MODIFIER LETTER DOUBLE PRIME",         "U+02BA",
        "math notation a\u02BA"),

    # ---- Section I: Mixing (real-world chaos) ----
    ("Q31", "mixed",      "ASCII + curly + smart apos all in one cell", "mixed",
        "ASCII \"x\" curly \u201Cy\u201D and don\u2019t forget"),
    ("Q32", "mixed",      "Curly outer with ASCII inner (Word paste pattern)", "U+201C ... \" ... \" ... U+201D",
        "\u201CHe said \"hi\" to me\u201D"),
    ("Q33", "mixed",      "Asymmetric: curly open + ASCII close",  "U+201C ... \"",
        "\u201Cmismatched\""),
    ("Q34", "mixed",      "Asymmetric: ASCII open + curly close",  "\" ... U+201D",
        "\"mismatched\u201D"),
    ("Q35", "mixed",      "Three quote styles nested in one cell", "all three",
        "\u201Couter \u2018middle \"inner\" middle\u2019 outer\u201D"),
    ("Q36", "mixed",      "Curly quotes wrapping the ENTIRE cell content (parser sees as data)", "U+201C ... U+201D",
        "\u201Cwhole-cell-wrapped\u201D"),
]

# Serialize file 22: header line plus one fully-escaped line per quote case.
header_22 = "case_id,category,char_name,codepoint,payload\n"
body_lines = [make_row(*row) for row in quote_rows]
out_22 = header_22 + "".join(body_lines)

dest_22 = TD / "22_quote_variants.csv"
dest_22.write_bytes(out_22.encode("utf-8"))
print(f"Wrote {dest_22} ({len(quote_rows)} cases)")


# ============================================================================
# 23_csv_malformed.csv
# ============================================================================
# Header is nominally 2 columns: case_id,payload
# Each row demonstrates ONE structural malformation. Most rows will NOT have
# exactly 2 fields when parsed; that's the entire point. The parser will
# raise, skip, or misalign depending on its policy.
#
# Design rules:
#  - case_id is always plain ASCII with no commas or quotes (parser-safe).
#    Even when the rest of the row is destroyed, the case_id is identifiable.
#  - Cases ordered safest-first. Cascade-destructive cases (unbalanced
#    openers that swallow subsequent rows) live below a DANGER_ZONE banner row.
#  - The file is written as raw bytes; csv.writer is NOT used because it
#    would auto-escape and undo the malformations.

# --- Bounded malformations (each affects only its own row) ---
bounded_rows = [
    # case_id, raw_line_after_caseid
    # Each entry is appended after "case_id,"

    # M01 - Unquoted cell containing a comma. Row has 4 fields, header has 2.
    ("M01", "Smith, John,30,NY"),

    # M02 - Properly quoted cell with comma. WELL-FORMED control. 4 fields though.
    ("M02", '"Smith, John",30,NY'),

    # M03 - Stray straight double-quote in unquoted cell. 4 fields, weird quoting.
    ("M03", 'John "Slim" Smith,30,NY'),

    # M04 - Stray curly quote in unquoted cell. 4 fields. ASCII parser may not
    # treat curly as a quote char, so curly stays as data; comma still splits.
    ("M04", "John \u201CSlim\u201D Smith,30,NY"),

    # M05 - Quoted cell with UNESCAPED inner quotes. Most parsers misparse this.
    # The inner ASCII " breaks the quoting state mid-cell. Bounded: the row's
    # final " before the comma re-balances the quote state on this line.
    ("M05", '"He said "hi" to me",30,NY'),

    # M06 - Quoted cell with PROPERLY escaped inner quotes. WELL-FORMED control.
    ("M06", '"He said ""hi"" to me",30,NY'),

    # M07 - Non-standard backslash-escaping. RFC 4180 doesn't recognize this;
    # the inner \" reads as backslash + literal quote. Most parsers break here.
    ("M07", '"He said \\"hi\\" to me",30,NY'),

    # M08 - Cell wrapped in single quotes (some Excel exports do this).
    # Standard CSV parsers treat the apostrophes as data, not as quoting.
    ("M08", "'John Smith',30,NY"),

    # M09 - Cell wrapped in CURLY quotes only. Parser sees curly as data.
    # No escaping happens. Comma still splits the row normally.
    ("M09", "\u201Cvalue with, comma\u201D,30,NY"),

    # M10 lives in destructive_rows below. Its field OPENS with an ASCII quote
    # that is never closed on the row, so RFC-4180 parsers read past the
    # newline hunting for the closing quote and swallow subsequent rows --
    # that is cascade-destructive, not bounded.

    # M11 - Whitespace OUTSIDE the quotes. Per RFC 4180 this is technically
    # malformed; pandas tolerates it, csv.reader strict mode rejects it.
    ("M11", '  "value"  ,30,NY'),

    # M12 - Empty quoted cell. WELL-FORMED control.
    ("M12", '"",30,NY'),

    # M13 - Whitespace-only quoted cell. Whitespace MUST be preserved
    # (it's quoted, so it's intentional).
    ("M13", '"   ",30,NY'),

    # M14 - Apostrophe inside an unquoted cell. NOT a CSV quote character.
    # Most parsers handle this fine. Worth testing as a negative control.
    ("M14", "O'Connor,30,NY"),

    # M15 - Excel force-text leading apostrophe. The apostrophe is part of
    # the cell value when read by csv.reader (Excel itself strips it on read,
    # but exports written from Excel may or may not retain it).
    ("M15", "'12345,30,NY"),

    # M16 - Triple ASCII quotes. Per RFC 4180 this parses as: open quote,
    # then escaped quote ("" -> "), then... what? Depends on parser.
    ("M16", '"""value""",30,NY'),

    # M17 - Quadruple ASCII quotes wrapping value (sometimes seen in
    # double-roundtripped Excel exports).
    ("M17", '""""value"""",30,NY'),

    # M18 - Row has MORE fields than header.
    ("M18", "extra1,extra2,extra3,extra4"),

    # M19 - Row has FEWER fields than header.
    ("M19", ""),

    # M20 - Properly quoted multi-line cell (embedded newline). WELL-FORMED.
    # This row physically spans two lines in the file.
    ("M20", '"line1\nline2",30,NY'),

    # M21 - Bare LF inside an UNQUOTED cell (rare, but seen in hand-edited
    # files). Most parsers treat the LF as the row terminator and the
    # remainder as a new row. Bounded but weird.
    ("M21", "value with\nbare LF,30,NY"),

    # M22 - Curly outer quote with ASCII inner AND a comma. The combination
    # that comes from "I copied this from Word into a CSV". Tests several
    # things at once: parser sees curly as data, ASCII inner triggers quote
    # state, comma splits.
    ("M22", "\u201CHe said \"wait, what?\" to me\u201D,30,NY"),

    # M23 - Tab character in unquoted cell. Not malformed in CSV (tab is data),
    # but test that the cleaner doesn't treat it as a column delimiter.
    ("M23", "John\tSmith,30,NY"),

    # M24 - Carriage-return only (no LF) inside unquoted cell. Some parsers
    # treat lone CR as a row terminator; some don't.
    ("M24", "value with\rbare CR,30,NY"),
]

# --- Cascade-destructive cases (each likely eats subsequent rows) ---
# Placed at the END so the cascade can only damage the destructive rows
# themselves, not the bounded test cases above.
destructive_rows = [
    # M10 - Mismatched: ASCII open + curly close. Parser opens a quoted cell
    # at the ASCII quote but never finds an ASCII close on this row, so it
    # hunts for the next " in the file and merges everything in between.
    # (Relocated from the bounded section: an unclosed opener cascades.)
    ("M10", '"opens ASCII closes curly\u201D,30,NY'),

    # M90 - Unbalanced opening quote with no close anywhere. Most parsers
    # will read until they hit another quote OR end-of-file. If there's
    # another quoted cell later, this case "borrows" its closing quote and
    # smashes everything in between.
    ("M90", '"opens but no close, payload runs on into oblivion'),

    # M91 - Opens, has internal newline, never closes. Multi-line damage.
    ("M91", '"opens with a newline inside\nbut never closes the quote'),

    # M92 - File ends mid-quoted-cell (no terminating newline either).
    # This MUST be the last row in the file. We append it without trailing
    # \n in the writer below.
    ("M92", '"final row never closes its quote'),
]

# Assemble the raw bytes for 23_csv_malformed.csv.
header_23 = "case_id,payload\n"
banner = "BANNER_DANGER_ZONE,EVERYTHING BELOW THIS LINE MAY CASCADE INTO SUBSEQUENT ROWS WHEN PARSED\n"

# Every bounded row is newline-terminated.
bounded_serialized = "".join(f"{cid},{payload}\n" for cid, payload in bounded_rows)

# Joining with "\n" puts a newline BETWEEN destructive rows but none after the
# final one: M92 specifically tests "file ends mid-quoted-cell with no
# terminator".
destructive_serialized = "\n".join(
    f"{cid},{payload}" for cid, payload in destructive_rows
)

out_23 = header_23 + bounded_serialized + banner + destructive_serialized

(TD / "23_csv_malformed.csv").write_bytes(out_23.encode("utf-8"))
print(f"Wrote {TD / '23_csv_malformed.csv'} "
      f"({len(bounded_rows)} bounded + {len(destructive_rows)} destructive cases)")
