Tilelli-llm / scripts /prepare_tinystories.py
TilelliLab's picture
Mirror small files (code, paper, results)
f86dc09 verified
Raw
History Blame Contribute Delete
1.77 kB
#!/usr/bin/env python3
"""scripts/prepare_tinystories.py — pack TinyStories text into uint8 .bin shards.
Reads ``data/tinystories/TinyStories-train.txt`` and ``TinyStories-valid.txt``,
encodes them with the byte tokenizer (no BPE), and writes flat uint8 arrays
to ``train.bin`` / ``valid.bin`` next to the input. Reports token counts.
The trainer memmaps these files, so for a ~2 GB train shard we never load
the whole thing into RAM.
"""
from __future__ import annotations
import argparse
import time
from pathlib import Path
import numpy as np
def pack_text_file(in_path: Path, out_path: Path, chunk_bytes: int = 64 * 1024 * 1024) -> int:
    """Copy the raw bytes of ``in_path`` into ``out_path`` as a flat uint8 array.

    The input is streamed in pieces of at most ``chunk_bytes`` bytes, so the
    whole file is never held in memory at once. A progress line with the
    running size and elapsed time is printed after each piece.

    Args:
        in_path: File to read; opened in binary mode.
        out_path: File to create (or overwrite) with the packed bytes.
        chunk_bytes: Maximum number of bytes read per iteration. Must be
            positive.

    Returns:
        Total number of bytes written, i.e. one uint8 value per input byte.

    Raises:
        ValueError: If ``chunk_bytes`` is not positive, or if ``out_path``
            refers to the same file as ``in_path``.
        OSError: If a file cannot be opened, read, or written.
    """
    if chunk_bytes <= 0:
        # read(0) returns b"" immediately, which would silently leave an empty
        # output file; a negative size would read the entire input in one go.
        raise ValueError(f"chunk_bytes must be positive, got {chunk_bytes}")
    if out_path.exists() and in_path.samefile(out_path):
        # Opening the output with "wb" truncates it, which would destroy the
        # input before a single byte has been read.
        raise ValueError(f"input and output are the same file: {in_path}")

    n = 0
    # perf_counter() is monotonic, so wall-clock adjustments cannot skew the
    # reported elapsed time.
    t0 = time.perf_counter()
    with in_path.open("rb") as fin, out_path.open("wb") as fout:
        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
        for chunk in iter(lambda: fin.read(chunk_bytes), b""):
            # Zero-copy uint8 view over the bytes just read.
            arr = np.frombuffer(chunk, dtype=np.uint8)
            arr.tofile(fout)
            n += arr.size
            mb = n / (1024 * 1024)
            elapsed = time.perf_counter() - t0
            print(f" {mb:>8.1f} MiB packed ({elapsed:.1f}s)")
    return n
def main() -> None:
    """Pack the TinyStories train/valid text files into ``.bin`` files.

    Parses ``--data-dir`` (default ``data/tinystories``), then packs
    ``TinyStories-train.txt`` into ``train.bin`` and ``TinyStories-valid.txt``
    into ``valid.bin`` inside that directory, printing the byte count of each.

    Raises:
        SystemExit: If any expected input file is missing. All inputs are
            checked before packing starts.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--data-dir",
        type=Path,
        default=Path("data/tinystories"),
        help="directory holding the TinyStories .txt files; .bin outputs are written there too",
    )
    args = ap.parse_args()
    pairs = [
        ("TinyStories-train.txt", "train.bin"),
        ("TinyStories-valid.txt", "valid.bin"),
    ]
    jobs = [(args.data_dir / src, args.data_dir / dst) for src, dst in pairs]
    # Check every input before doing any work, so a missing validation file is
    # reported immediately instead of after the large train file is packed.
    for in_path, _ in jobs:
        if not in_path.exists():
            raise SystemExit(f"missing input: {in_path}")
    for in_path, out_path in jobs:
        print(f"packing {in_path} -> {out_path}")
        n = pack_text_file(in_path, out_path)
        print(f" done. {n:,} bytes / tokens")
# Run the packer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()