"""
Generator for encoding / code-page test fixtures.

Produces 31 encoded fixture files (E01-E31) in test_data/encodings/ covering:
  - UTF-8 (with and without BOM)
  - UTF-16 LE/BE (with and without BOM)
  - Western single-byte: cp1252, ISO-8859-1, ISO-8859-15, Mac Roman
  - Eastern European: cp1250, ISO-8859-2
  - Cyrillic: cp1251, KOI8-R
  - CJK: Shift_JIS, GB18030, Big5, EUC-KR
  - Pathological: invalid UTF-8, truncated multibyte, lying BOM,
    mixed-encoding concatenation, ASCII-only (encoding-ambiguous)

Plus a manifest CSV (expected_detection.csv) that tells the user what
encoding each file is, what their detector should reasonably return,
and what the decoded content should match.
"""
from pathlib import Path
import csv

ROOT = Path(__file__).parent
TD = ROOT / "test_data" / "encodings"
TD.mkdir(parents=True, exist_ok=True)

# ============================================================================
# Canonical content per encoding family
# ============================================================================
# These are the UTF-8 ground-truth strings. Every encoded file in this
# corpus, when correctly decoded, must produce the corresponding string
# exactly (modulo BOM stripping).

# Western_basic: characters that exist in Latin-1, cp1252, Mac Roman, and
# all UTF variants. No euro, no smart quotes, no em-dash. Uses only chars
# that the older single-byte encodings can faithfully represent.
WESTERN_BASIC = (
    "id,name,city,note\n"
    "1,Alice,New York,plain ASCII\n"
    "2,Caf\u00e9 M\u00fcller,K\u00f6ln,Latin-1 accents\n"
    "3,Na\u00efve Fa\u00e7ade,Z\u00fcrich,more accents\n"
    "4,Espa\u00f1a,D\u00fcsseldorf,Spanish n-tilde\n"
)

# Western_extended: includes euro sign, smart quotes, em-dash. These DO NOT
# exist in ISO-8859-1 (Latin-1). They DO exist in cp1252 (in the 0x80-0x9F
# range that Latin-1 leaves undefined and most decoders map to C1 control
# characters). They exist in UTF-8/16. So this content distinguishes cp1252
# from Latin-1: a file that decodes cleanly as cp1252 but yields control or
# replacement characters as Latin-1 must really be cp1252. A sanity check
# after the constant demonstrates the asymmetry.
WESTERN_EXTENDED = (
    "id,name,note\n"
    "1,\u20ac100 product,euro sign U+20AC\n"
    "2,\u201csmart\u201d quotes,curly U+201C and U+201D\n"
    "3,caf\u00e9 \u2014 r\u00e9sum\u00e9,em-dash U+2014\n"
    "4,quote\u2019s ok,smart apostrophe U+2019\n"
)
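
# A minimal sanity check of the discriminator claim above (a sketch, not
# required for generation): the euro sign encodes in cp1252 as byte 0x80 but
# has no Latin-1 encoding at all.
assert "\u20ac".encode("cp1252") == b"\x80"
try:
    "\u20ac".encode("iso-8859-1")
except UnicodeEncodeError:
    pass  # expected: Latin-1 cannot represent the euro sign
else:
    raise AssertionError("Latin-1 unexpectedly encoded the euro sign")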

# Eastern European: Czech, Polish, Hungarian, Slovak accents that are in
# cp1250 / ISO-8859-2 but NOT in cp1252 / Latin-1. A file with these chars
# decoded as cp1252 produces mojibake.
EASTERN_EUROPEAN = (
    "id,name,city,language\n"
    "1,P\u0159\u00edli\u0161,Praha,Czech\n"
    "2,\u017b\u00f3\u0142\u0107,Warszawa,Polish\n"
    "3,T\u0171r\u0151,Budapest,Hungarian\n"
    "4,Spa\u0148ski,Bratislava,Slovak\n"
)
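
# A minimal sanity check (sketch) of the claim above: Czech r-caron encodes
# in the Eastern European codecs but not in cp1252.
for _codec in ("cp1250", "iso-8859-2"):
    assert "\u0159".encode(_codec)  # encodes without error
try:
    "\u0159".encode("cp1252")
except UnicodeEncodeError:
    pass  # expected: cp1252 has no r-caron
else:
    raise AssertionError("cp1252 unexpectedly encoded r-caron")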

# Cyrillic: Russian text. Of the encodings in this corpus, only cp1251,
# KOI8-R, and the UTF family can represent it.
CYRILLIC = (
    "id,name,city\n"
    "1,\u0418\u0432\u0430\u043d,\u041c\u043e\u0441\u043a\u0432\u0430\n"
    "2,\u0410\u043d\u043d\u0430,\u0421\u0430\u043d\u043a\u0442-\u041f\u0435\u0442\u0435\u0440\u0431\u0443\u0440\u0433\n"
    "3,\u0414\u043c\u0438\u0442\u0440\u0438\u0439,\u041d\u043e\u0432\u043e\u0441\u0438\u0431\u0438\u0440\u0441\u043a\n"
)

# Japanese: encodes in Shift_JIS (cp932), UTF.
JAPANESE = (
    "id,name,city\n"
    "1,\u7530\u4e2d\u592a\u90ce,\u6771\u4eac\n"
    "2,\u9234\u6728\u82b1\u5b50,\u5927\u962a\n"
    "3,Alice Smith,\u6a2a\u6d5c\n"
)

# Chinese simplified: encodes in GB18030, GBK, UTF.
CHINESE_SIMPLIFIED = (
    "id,name,city\n"
    "1,\u5f20\u4e09,\u5317\u4eac\n"
    "2,\u674e\u56db,\u4e0a\u6d77\n"
    "3,Alice Smith,\u6df1\u5733\n"
)

# Chinese traditional: encodes in Big5, UTF.
CHINESE_TRADITIONAL = (
    "id,name,city\n"
    "1,\u5f35\u4e09,\u53f0\u5317\n"
    "2,\u674e\u56db,\u9999\u6e2f\n"
    "3,Alice Smith,\u65b0\u7af9\n"
)

# Korean: encodes in EUC-KR (cp949), UTF.
KOREAN = (
    "id,name,city\n"
    "1,\uae40\ucca0\uc218,\uc11c\uc6b8\n"
    "2,\ubc15\uc601\ud76c,\ubd80\uc0b0\n"
    "3,Alice Smith,\uc778\ucc9c\n"
)


# ============================================================================
# Helper: write file as encoded bytes; record manifest entry.
# ============================================================================
manifest = []  # rows for expected_detection.csv

def write_encoded(filename, canonical_id, content, encoding,
                  bom_prefix=b"", expected_detection=None,
                  decode_notes=""):
    """Encode `content` with `encoding`, prepend `bom_prefix`, write to
    test_data/encodings/`filename`. Record a manifest row.

    expected_detection is a |-separated string of acceptable detector
    outputs (because some files are genuinely ambiguous - charset-normalizer
    might call ASCII-only content "utf_8" or "ascii" or "cp1252", and any
    of those is a correct answer when they all decode to the same string).
    """
    encoded = content.encode(encoding)
    data = bom_prefix + encoded
    path = TD / filename
    path.write_bytes(data)
    manifest.append({
        "filename": filename,
        "canonical_content_id": canonical_id,
        "encoding": encoding,
        "has_bom": "yes" if bom_prefix else "no",
        "byte_length": len(data),
        "expected_detection": expected_detection or encoding,
        "decode_notes": decode_notes,
    })
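

# Hedged sketch of how a downstream test harness might consume the
# expected_detection field. The helper name is hypothetical, and the sentinel
# tokens (AMBIGUOUS, UNRELIABLE, REJECT*) need harness-specific handling on
# top of this plain membership test.
def detection_matches(detected, expected_field):
    """True if a detector label appears in the |-separated accept list."""
    norm = detected.strip().lower().replace("-", "_")
    accepted = {e.strip().lower().replace("-", "_")
                for e in expected_field.split("|")}
    return norm in accepted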


def write_raw(filename, raw_bytes, canonical_id, encoding,
              has_bom, expected_detection, decode_notes):
    """Write arbitrary raw bytes (used for pathological cases)."""
    path = TD / filename
    path.write_bytes(raw_bytes)
    manifest.append({
        "filename": filename,
        "canonical_content_id": canonical_id,
        "encoding": encoding,
        "has_bom": has_bom,
        "byte_length": len(raw_bytes),
        "expected_detection": expected_detection,
        "decode_notes": decode_notes,
    })


# ============================================================================
# Group A: Western_basic (ASCII + Latin-1 character set only)
# ============================================================================

# E01 - UTF-8, no BOM. The modern default.
write_encoded("E01_western_basic_utf8.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "utf-8",
              expected_detection="utf_8|utf-8",
              decode_notes="UTF-8 no BOM. Modern default.")

# E02 - UTF-8 with BOM. Excel's "CSV UTF-8" export default.
write_encoded("E02_western_basic_utf8bom.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "utf-8",
              bom_prefix=b"\xef\xbb\xbf",
              expected_detection="utf_8|utf_8_sig|utf-8|utf-8-sig",
              decode_notes="UTF-8 with BOM. Excel CSV UTF-8 export. BOM must be stripped on read.")

# E03 - cp1252. Excel default "CSV" on US/UK/Western Windows.
write_encoded("E03_western_basic_cp1252.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "cp1252",
              expected_detection="cp1252|windows-1252|iso-8859-1|latin-1",
              decode_notes=("Western single-byte. For this content (no euro, no smart "
                            "quotes, no em-dash), cp1252 and Latin-1 produce IDENTICAL "
                            "decoded bytes. Detector cannot distinguish - any of "
                            "cp1252/Latin-1/Latin-9 is a correct answer."))

# E04 - ISO-8859-1 (Latin-1). Older Western standard.
write_encoded("E04_western_basic_latin1.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "iso-8859-1",
              expected_detection="iso-8859-1|latin-1|cp1252|latin_1",
              decode_notes=("Latin-1. Identical bytes to cp1252 for this content. "
                            "Detector ambiguity is expected and acceptable."))

# E05 - ISO-8859-15 (Latin-9). Latin-1 plus euro and a few others.
write_encoded("E05_western_basic_latin9.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "iso-8859-15",
              expected_detection="iso-8859-15|latin-9|iso-8859-1|cp1252",
              decode_notes=("Latin-9. For this content with no euro sign, decodes "
                            "identically to Latin-1. Detector may pick any."))

# E06 - Mac Roman. Older Mac CSVs.
write_encoded("E06_western_basic_macroman.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "mac-roman",
              expected_detection="mac-roman|macroman",
              decode_notes=("Mac Roman. Different byte values for the accented "
                            "chars vs cp1252/Latin-1, so this one is distinguishable."))

# E07 - UTF-16 LE with BOM. Windows "Unicode Text" export from Excel.
write_encoded("E07_western_basic_utf16le.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "utf-16-le",
              bom_prefix=b"\xff\xfe",
              expected_detection="utf-16|utf-16-le|utf_16|utf_16_le",
              decode_notes="UTF-16 LE with BOM. Excel 'Unicode Text' export.")

# E08 - UTF-16 BE with BOM. Rarer but spec'd.
write_encoded("E08_western_basic_utf16be.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "utf-16-be",
              bom_prefix=b"\xfe\xff",
              expected_detection="utf-16|utf-16-be|utf_16|utf_16_be",
              decode_notes="UTF-16 BE with BOM. Less common but valid.")

# E09 - UTF-16 LE without BOM. Effectively undetectable without heuristics.
write_encoded("E09_western_basic_utf16le_nobom.csv", "WESTERN_BASIC",
              WESTERN_BASIC, "utf-16-le",
              expected_detection="utf-16|utf-16-le|UNRELIABLE",
              decode_notes=("UTF-16 LE without BOM. Detection is heuristic and "
                            "unreliable; bytes look like 'every other byte is null' "
                            "for ASCII-heavy content, which charset-normalizer may "
                            "or may not catch. If detector returns wrong encoding "
                            "here, that is the buyer's responsibility to manually "
                            "specify - flag in error message."))


# ============================================================================
# Group B: Western_extended (needs cp1252 or UTF; Latin-1 cannot represent it)
# ============================================================================

# E10 - UTF-8.
write_encoded("E10_western_extended_utf8.csv", "WESTERN_EXTENDED",
              WESTERN_EXTENDED, "utf-8",
              expected_detection="utf_8|utf-8",
              decode_notes="UTF-8. Has euro, smart quotes, em-dash.")

# E11 - cp1252. The interesting case: this content uses cp1252's
# 0x80-0x9F range (where euro=0x80, smart quotes=0x91-0x94, em-dash=0x97).
# Latin-1 leaves that range undefined (most decoders map it to C1 control
# characters), so this file cannot be misread as Latin-1 without producing
# control characters or replacement chars.
write_encoded("E11_western_extended_cp1252.csv", "WESTERN_EXTENDED",
              WESTERN_EXTENDED, "cp1252",
              expected_detection="cp1252|windows-1252",
              decode_notes=("cp1252. Content uses 0x80-0x9F range (euro, smart "
                            "quotes, em-dash). Decoding as Latin-1 produces "
                            "control characters or replacement chars - this file "
                            "is the cleanest cp1252-vs-Latin-1 discriminator."))

# E12 - UTF-16 LE with BOM.
write_encoded("E12_western_extended_utf16le.csv", "WESTERN_EXTENDED",
              WESTERN_EXTENDED, "utf-16-le",
              bom_prefix=b"\xff\xfe",
              expected_detection="utf-16|utf-16-le",
              decode_notes="UTF-16 LE with BOM. Same content as E10/E11.")


# ============================================================================
# Group C: Eastern European
# ============================================================================

# E13 - UTF-8.
write_encoded("E13_eastern_european_utf8.csv", "EASTERN_EUROPEAN",
              EASTERN_EUROPEAN, "utf-8",
              expected_detection="utf_8|utf-8",
              decode_notes="UTF-8 baseline for Czech/Polish/Hungarian/Slovak content.")

# E14 - cp1250. Polish/Czech/Hungarian Windows.
write_encoded("E14_eastern_european_cp1250.csv", "EASTERN_EUROPEAN",
              EASTERN_EUROPEAN, "cp1250",
              expected_detection="cp1250|windows-1250",
              decode_notes=("cp1250. Decoding as cp1252 produces mojibake "
                            "(Polish slash-l would become U+0142 vs ascii letter, etc.). "
                            "Real distinguishing test."))

# E15 - ISO-8859-2 (Latin-2).
write_encoded("E15_eastern_european_iso88592.csv", "EASTERN_EUROPEAN",
              EASTERN_EUROPEAN, "iso-8859-2",
              expected_detection="iso-8859-2|latin-2|iso8859_2",
              decode_notes=("ISO-8859-2 / Latin-2. Different byte assignments "
                            "than cp1250 for the same characters."))


# ============================================================================
# Group D: Cyrillic
# ============================================================================

# E16 - UTF-8.
write_encoded("E16_cyrillic_utf8.csv", "CYRILLIC",
              CYRILLIC, "utf-8",
              expected_detection="utf_8|utf-8",
              decode_notes="UTF-8 baseline for Russian content.")

# E17 - cp1251. Russian Windows.
write_encoded("E17_cyrillic_cp1251.csv", "CYRILLIC",
              CYRILLIC, "cp1251",
              expected_detection="cp1251|windows-1251",
              decode_notes="cp1251. The dominant Russian Windows encoding.")

# E18 - KOI8-R. Older Russian Linux/Unix.
write_encoded("E18_cyrillic_koi8r.csv", "CYRILLIC",
              CYRILLIC, "koi8-r",
              expected_detection="koi8-r|koi8_r",
              decode_notes=("KOI8-R. Older Unix Russian encoding. Distinct byte "
                            "patterns from cp1251."))


# ============================================================================
# Group E: CJK
# ============================================================================

# E19 / E20 - Japanese
write_encoded("E19_japanese_utf8.csv", "JAPANESE",
              JAPANESE, "utf-8",
              expected_detection="utf_8|utf-8",
              decode_notes="UTF-8 baseline for Japanese content.")

write_encoded("E20_japanese_shiftjis.csv", "JAPANESE",
              JAPANESE, "shift_jis",
              expected_detection="shift_jis|shift-jis|cp932|sjis",
              decode_notes=("Shift_JIS. Excel on Japanese Windows defaults to this. "
                            "cp932 is Microsoft's extended variant; either name is "
                            "acceptable."))

# E21 / E22 - Chinese simplified
write_encoded("E21_chinese_simplified_utf8.csv", "CHINESE_SIMPLIFIED",
              CHINESE_SIMPLIFIED, "utf-8",
              expected_detection="utf_8|utf-8",
              decode_notes="UTF-8 baseline for simplified Chinese.")

write_encoded("E22_chinese_simplified_gb18030.csv", "CHINESE_SIMPLIFIED",
              CHINESE_SIMPLIFIED, "gb18030",
              expected_detection="gb18030|gbk|gb2312",
              decode_notes=("GB18030. Mainland China default. GB18030 supersets "
                            "GBK supersets GB2312; for this content any is acceptable."))

# E23 / E24 - Chinese traditional
write_encoded("E23_chinese_traditional_utf8.csv", "CHINESE_TRADITIONAL",
              CHINESE_TRADITIONAL, "utf-8",
              expected_detection="utf_8|utf-8",
              decode_notes="UTF-8 baseline for traditional Chinese.")

write_encoded("E24_chinese_traditional_big5.csv", "CHINESE_TRADITIONAL",
              CHINESE_TRADITIONAL, "big5",
              expected_detection="big5|big5_hkscs|cp950",
              decode_notes=("Big5. Taiwan and Hong Kong default. cp950 is "
                            "Microsoft's variant."))

# E25 / E26 - Korean
write_encoded("E25_korean_utf8.csv", "KOREAN",
              KOREAN, "utf-8",
              expected_detection="utf_8|utf-8",
              decode_notes="UTF-8 baseline for Korean.")

write_encoded("E26_korean_euckr.csv", "KOREAN",
              KOREAN, "euc-kr",
              expected_detection="euc-kr|euc_kr|cp949",
              decode_notes="EUC-KR. Korean Windows default. cp949 is the MS variant.")


# ============================================================================
# Group F: Pathological cases
# ============================================================================

# E27 - ASCII only. Encoding-ambiguous: detector might say UTF-8, ASCII,
# cp1252, Latin-1 - all decode to identical content.
ascii_content = (
    "id,name,city\n"
    "1,Alice,New York\n"
    "2,Bob,Chicago\n"
    "3,Carol,San Francisco\n"
)
write_raw("E27_pathological_ascii_only.csv",
          ascii_content.encode("ascii"),
          canonical_id="ASCII_ONLY",
          encoding="ascii",
          has_bom="no",
          expected_detection="ascii|utf_8|utf-8|cp1252|iso-8859-1|AMBIGUOUS",
          decode_notes=("Pure ASCII. Multiple encodings produce identical bytes "
                        "for this content. Any of ASCII, UTF-8, cp1252, Latin-1 "
                        "is a correct detection answer because all four decode to "
                        "the same string. Detector confidence should be high; "
                        "specific label is interchangeable."))

# E28 - Invalid UTF-8 bytes mid-file. UTF-8 byte 0xC3 followed by 0x28 is
# NOT a valid UTF-8 sequence (0x28 is not a valid continuation byte).
# A strict UTF-8 decoder will raise UnicodeDecodeError.
invalid_utf8 = (
    b"id,name,city\n"
    b"1,Alice,New York\n"
    b"2,B\xc3\x28b,Chicago\n"   # invalid UTF-8 in middle of word
    b"3,Carol,San Francisco\n"
)
write_raw("E28_pathological_invalid_utf8.csv",
          invalid_utf8,
          canonical_id="INVALID_UTF8",
          encoding="invalid-utf8",
          has_bom="no",
          expected_detection="cp1252|iso-8859-1|REJECT_UTF8",
          decode_notes=("File starts as if UTF-8 but contains an invalid byte "
                        "sequence (0xC3 0x28). A strict UTF-8 decoder errors. "
                        "Detector should reject UTF-8 and fall back to a "
                        "single-byte encoding; cp1252 will produce mojibake "
                        "but parse without error. Cleaner should warn the user "
                        "that encoding detection was uncertain."))

# E29 - Truncated UTF-8 multibyte sequence at end of file. 0xE4 starts a
# 3-byte UTF-8 sequence (some CJK chars) but the file ends before the
# 2nd and 3rd continuation bytes arrive.
truncated_utf8 = (
    "id,name,city\n"
    "1,Alice,New York\n"
    "2,Bob,Chicago\n"
).encode("utf-8") + b"3,\xe4"  # truncated mid-character

write_raw("E29_pathological_truncated_utf8.csv",
          truncated_utf8,
          canonical_id="TRUNCATED_UTF8",
          encoding="invalid-utf8-truncated",
          has_bom="no",
          expected_detection="utf_8_with_errors|cp1252|REJECT",
          decode_notes=("Valid UTF-8 throughout, but the last byte (0xE4) starts "
                        "a 3-byte sequence that's never completed. Strict UTF-8 "
                        "decoder errors at EOF. errors='replace' produces \\ufffd. "
                        "Real-world cause: file was truncated by a transfer "
                        "interruption or a buggy export. Cleaner should treat as "
                        "corrupt-input error, not silent data loss."))

# E30 - "Lying BOM": file starts with UTF-8 BOM (EF BB BF) but the body is
# actually cp1252-encoded bytes (using chars in 0x80-0x9F range that aren't
# valid UTF-8 continuation bytes). Sometimes happens when a wrapper script
# slaps a BOM on the front of an arbitrary file thinking it'll help.
lying_bom = (
    b"\xef\xbb\xbf"   # UTF-8 BOM, lying
    + WESTERN_EXTENDED.encode("cp1252")  # actual content is cp1252
)
write_raw("E30_pathological_lying_bom.csv",
          lying_bom,
          canonical_id="WESTERN_EXTENDED",
          encoding="cp1252-with-utf8-bom",
          has_bom="yes (lying)",
          expected_detection="utf_8_FAILS|cp1252|AMBIGUOUS",
          decode_notes=("File has UTF-8 BOM but body is cp1252. UTF-8 decoder "
                        "will see 0x80 (euro in cp1252) as an invalid UTF-8 "
                        "continuation byte and error out. Better detectors "
                        "recover by ignoring the BOM and trying cp1252. "
                        "Cleaner should warn 'BOM suggested UTF-8 but content "
                        "decoded as cp1252' so the user knows their file is "
                        "lying about itself."))

# E31 - Mixed-encoding concatenation. cp1252 lines followed by UTF-8 lines.
# Real-world cause: someone used `cat file1.csv file2.csv > merged.csv` on
# files exported from different sources. The result is undecodable as a
# single encoding.
mixed_concat = (
    "id,name,city\n".encode("cp1252")
    + "1,M\u00fcller,K\u00f6ln\n".encode("cp1252")  # cp1252 bytes for ü, ö
    + "2,M\u00fcller,K\u00f6ln\n".encode("utf-8")   # UTF-8 bytes for same chars
    + "3,Alice,New York\n".encode("utf-8")
)
write_raw("E31_pathological_mixed_concat.csv",
          mixed_concat,
          canonical_id="MIXED_CONCAT",
          encoding="cp1252+utf8-concatenated",
          has_bom="no",
          expected_detection="LOW_CONFIDENCE|cp1252|utf_8|REJECT",
          decode_notes=("First half cp1252, second half UTF-8. No single "
                        "encoding decodes both halves correctly. UTF-8 decoder "
                        "errors on row 1. cp1252 decoder produces mojibake on "
                        "rows 2-3. charset-normalizer detection confidence "
                        "should be low. Right behavior for the cleaner: refuse "
                        "to process and tell the user the file contains mixed "
                        "encodings."))


# ============================================================================
# Write the manifest
# ============================================================================
manifest_path = TD / "expected_detection.csv"
with manifest_path.open("w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(
        f,
        fieldnames=["filename", "canonical_content_id", "encoding",
                    "has_bom", "byte_length", "expected_detection",
                    "decode_notes"],
    )
    writer.writeheader()
    for row in manifest:
        writer.writerow(row)

print(f"Wrote {len(manifest)} encoded test files in {TD}")
print(f"Manifest: {manifest_path}")

# Also write canonical content references as separate UTF-8 reference files
# so the user can do a full decoded-content diff.
ref_dir = TD / "reference"
ref_dir.mkdir(exist_ok=True)
for cid, content in [
    ("WESTERN_BASIC", WESTERN_BASIC),
    ("WESTERN_EXTENDED", WESTERN_EXTENDED),
    ("EASTERN_EUROPEAN", EASTERN_EUROPEAN),
    ("CYRILLIC", CYRILLIC),
    ("JAPANESE", JAPANESE),
    ("CHINESE_SIMPLIFIED", CHINESE_SIMPLIFIED),
    ("CHINESE_TRADITIONAL", CHINESE_TRADITIONAL),
    ("KOREAN", KOREAN),
    ("ASCII_ONLY", ascii_content),
]:
    (ref_dir / f"{cid}.utf8.txt").write_bytes(content.encode("utf-8"))
print(f"Reference content: {ref_dir}")
