#!/usr/bin/env python3 """scripts/prepare_tinystories.py — pack TinyStories text into uint8 .bin shards. Reads ``data/tinystories/TinyStories-train.txt`` and ``TinyStories-valid.txt``, encodes them with the byte tokenizer (no BPE), and writes flat uint8 arrays to ``train.bin`` / ``valid.bin`` next to the input. Reports token counts. The trainer memmaps these files, so for a ~2 GB train shard we never load the whole thing into RAM. """ from __future__ import annotations import argparse import time from pathlib import Path import numpy as np def pack_text_file(in_path: Path, out_path: Path, chunk_bytes: int = 64 * 1024 * 1024) -> int: n = 0 t0 = time.time() with in_path.open("rb") as fin, out_path.open("wb") as fout: while True: chunk = fin.read(chunk_bytes) if not chunk: break arr = np.frombuffer(chunk, dtype=np.uint8) arr.tofile(fout) n += arr.size mb = n / (1024 * 1024) elapsed = time.time() - t0 print(f" {mb:>8.1f} MiB packed ({elapsed:.1f}s)") return n def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--data-dir", type=Path, default=Path("data/tinystories")) args = ap.parse_args() pairs = [ ("TinyStories-train.txt", "train.bin"), ("TinyStories-valid.txt", "valid.bin"), ] for src, dst in pairs: in_path = args.data_dir / src out_path = args.data_dir / dst if not in_path.exists(): raise SystemExit(f"missing input: {in_path}") print(f"packing {in_path} -> {out_path}") n = pack_text_file(in_path, out_path) print(f" done. {n:,} bytes / tokens") if __name__ == "__main__": main()