#!/usr/bin/env python3
"""scripts/prepare_tinystories.py — pack TinyStories text into uint8 .bin shards.

Reads ``data/tinystories/TinyStories-train.txt`` and ``TinyStories-valid.txt``,
encodes them with the byte tokenizer (no BPE), and writes flat uint8 arrays
to ``train.bin`` / ``valid.bin`` next to the input. Reports token counts.

The trainer memmaps these files, so for a ~2 GB train shard we never load
the whole thing into RAM.
"""
from __future__ import annotations

import argparse
import time
from pathlib import Path

import numpy as np


def pack_text_file(in_path: Path, out_path: Path, chunk_bytes: int = 64 * 1024 * 1024) -> int:
    """Copy the raw bytes of ``in_path`` into ``out_path`` as a flat uint8 array.

    The input is streamed in pieces of at most ``chunk_bytes`` bytes, so the
    whole file is never held in memory at once. A progress line is printed
    after each piece has been written.

    Args:
        in_path: File to read; opened in binary mode.
        out_path: File to create (or overwrite) with the packed bytes.
        chunk_bytes: Maximum number of bytes read per iteration. Must be
            positive.

    Returns:
        The total number of bytes written to ``out_path``.

    Raises:
        ValueError: If ``chunk_bytes`` is not positive, or if ``in_path`` and
            ``out_path`` resolve to the same file.
    """
    # read(0) returns b"" immediately, which would silently produce an empty
    # output; a negative size reads the whole file at once and defeats the
    # point of streaming. Reject both before touching any file.
    if chunk_bytes <= 0:
        raise ValueError(f"chunk_bytes must be positive, got {chunk_bytes}")
    # Opening the output with "wb" truncates it, so packing a file onto
    # itself would destroy the input before a single byte is read.
    if in_path.resolve() == out_path.resolve():
        raise ValueError(f"in_path and out_path are the same file: {in_path}")

    n = 0
    # perf_counter is unaffected by wall-clock adjustments during the run.
    t0 = time.perf_counter()
    with in_path.open("rb") as fin, out_path.open("wb") as fout:
        # The two-argument form of iter() stops at the first empty read (EOF).
        for chunk in iter(lambda: fin.read(chunk_bytes), b""):
            arr = np.frombuffer(chunk, dtype=np.uint8)
            arr.tofile(fout)
            n += arr.size
            mb = n / (1024 * 1024)
            elapsed = time.perf_counter() - t0
            print(f"  {mb:>8.1f} MiB packed  ({elapsed:.1f}s)")
    return n


def main() -> None:
    """Pack the TinyStories train and validation text files into ``.bin`` files.

    Parses ``--data-dir`` (default ``data/tinystories``), then for each known
    input file in that directory writes the matching ``.bin`` file alongside
    it and prints the resulting byte count.

    Raises:
        SystemExit: If an expected input file does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", type=Path, default=Path("data/tinystories"))
    data_dir = parser.parse_args().data_dir

    # Input text file name -> output file name, processed in this order.
    outputs_by_input = {
        "TinyStories-train.txt": "train.bin",
        "TinyStories-valid.txt": "valid.bin",
    }
    for text_name, bin_name in outputs_by_input.items():
        text_path = data_dir / text_name
        bin_path = data_dir / bin_name
        if not text_path.exists():
            raise SystemExit(f"missing input: {text_path}")
        print(f"packing {text_path} -> {bin_path}")
        total = pack_text_file(text_path, bin_path)
        print(f"  done. {total:,} bytes / tokens")


# Run the packing step only when executed as a script, not when imported.
if __name__ == "__main__":
    main()