| import os |
| import numpy as np |
| from transformers import AutoTokenizer |
| from tqdm import tqdm |
|
|
def process_data(
    input_file_path: str = "data/raw/merged_text/corpus.txt",
    tokenizer_path: str = "Tokenizer/BPE",
    output_dir: str = "data/bin",
    val_split_ratio: float = 0.1,
):
    """Tokenize a line-based text corpus and write train/val uint16 token binaries.

    Each non-empty line is encoded with the BPE tokenizer and terminated with
    the tokenizer's EOS id; the concatenated id stream is split into a train
    portion and a validation tail, then dumped with ``numpy.ndarray.tofile``
    as raw uint16 arrays (``train.bin`` / ``val.bin``).

    Args:
        input_file_path: UTF-8 text corpus, one document/segment per line.
        tokenizer_path: Directory loadable by ``AutoTokenizer.from_pretrained``.
        output_dir: Destination directory for the binary files (created if missing).
        val_split_ratio: Fraction of total tokens reserved for validation.

    Raises:
        ValueError: if the tokenizer has no EOS token, the corpus yields no
            tokens, or any token id does not fit in uint16.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"Loading tokenizer from {tokenizer_path}...")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    eos_id = tokenizer.eos_token_id
    if eos_id is None:
        # Without an EOS id, documents would be concatenated with no boundary
        # marker (the original code would append None and crash much later).
        raise ValueError("Tokenizer has no eos_token_id; cannot delimit documents.")
    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"EOS ID: {eos_id}")

    print(f"Reading {input_file_path}...")
    with open(input_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    print(f"Total lines: {len(lines):,}")

    print("Tokenizing...")
    all_tokens = []
    for line in tqdm(lines):
        text = line.strip()
        if not text:
            continue  # skip blank lines so they don't emit bare EOS tokens
        tokens = tokenizer.encode(text)
        tokens.append(eos_id)
        all_tokens.extend(tokens)

    token_count = len(all_tokens)
    if token_count == 0:
        raise ValueError(f"No tokens produced from {input_file_path}; is the corpus empty?")
    print(f"Total tokens: {token_count:,}")

    # uint16 keeps the binaries small but silently wraps ids >= 65536;
    # fail loudly instead of writing corrupt data.
    max_id = max(all_tokens)
    if max_id > np.iinfo(np.uint16).max:
        raise ValueError(f"Token id {max_id} does not fit in uint16.")
    ids = np.array(all_tokens, dtype=np.uint16)

    # Split by explicit index rather than ids[:-val_count]: when val_count
    # is 0 (tiny corpus or ratio), ids[:-0] would be an EMPTY train set.
    val_count = int(token_count * val_split_ratio)
    split_idx = token_count - val_count
    train_ids = ids[:split_idx]
    val_ids = ids[split_idx:]

    print(f"Train tokens: {len(train_ids):,}")
    print(f"Val tokens: {len(val_ids):,}")

    train_ids.tofile(os.path.join(output_dir, "train.bin"))
    val_ids.tofile(os.path.join(output_dir, "val.bin"))
    print(f"✅ Saved binary files to {output_dir}/")
|
|
# Entry point: run the full corpus -> tokenized-binary preprocessing pipeline
# when this file is executed as a script (no effect on import).
if __name__ == "__main__":
    process_data()
|
|