| import json |
| import re |
| from transformers import AutoTokenizer |
|
|
| |
# --- Configuration -----------------------------------------------------------
# JSONL training set to validate; one JSON object per line.
dataset_path = "all_dataset_train.jsonl"
# Fine-tuned checkpoint whose tokenizer defines how samples are counted.
model_path = "/root/autodl-tmp/output_7B_FULL_cotSFT/v8-20250720-210226/checkpoint-58"
# Fields every record must contain as non-empty strings.
required_fields = ["input", "output"]
# Samples whose combined input+output exceed this many tokens are flagged.
max_token_length = 8192
|
|
| |
# Load the tokenizer from the checkpoint itself so token counts here match
# exactly what training/inference will see for this model.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
|
| |
# Pre-compiled at module level so the per-record scan in the main loop does
# not repeat the regex-cache lookup on every line.
_CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")


def has_control_chars(text):
    r"""Return True if *text* contains disallowed ASCII control characters.

    Tab (\t), newline (\n) and carriage return (\r) are ordinary whitespace
    in multi-line SFT text and are deliberately NOT flagged; the previous
    pattern ([\x00-\x1F\x7F]) warned on essentially every multi-line sample.
    """
    return bool(_CONTROL_CHAR_RE.search(text))
|
|
| |
# Validate every record: JSON well-formedness, required string fields,
# control characters, and tokenized length against the budget.
print("Checking dataset...\n")
with open(dataset_path, "r", encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        # Skip blank lines (e.g. a trailing newline at EOF) instead of
        # reporting them as JSON decode errors.
        if not line.strip():
            continue

        try:
            data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"[Line {idx}] ❌ JSON decode error: {e}")
            continue

        # A record must be a JSON object; `field in data` over a list or
        # string would silently do the wrong thing.
        if not isinstance(data, dict):
            print(f"[Line {idx}] ❌ Record is not a JSON object")
            continue

        for field in required_fields:
            if field not in data:
                print(f"[Line {idx}] ❌ Missing required field: '{field}'")
            elif not isinstance(data[field], str):
                # A non-string value would crash .strip() below (and the
                # concatenation further down) with an uncaught exception.
                print(f"[Line {idx}] ❌ Field '{field}' is not a string")
            elif not data[field].strip():
                print(f"[Line {idx}] ❌ Field '{field}' is empty")

        input_text = data.get("input", "")
        output_text = data.get("output", "")
        # Only run the text-level checks when both fields really are strings;
        # the type problem was already reported above.
        if isinstance(input_text, str) and isinstance(output_text, str):
            combined = input_text + output_text
            if has_control_chars(combined):
                print(f"[Line {idx}] ⚠️ Contains control characters")

            try:
                # len() of the plain input_ids list is equivalent to
                # tokens["input_ids"].shape[1] without materializing a
                # torch tensor per record.
                token_len = len(tokenizer(combined)["input_ids"])
                if token_len > max_token_length:
                    print(f"[Line {idx}] ⚠️ Too many tokens: {token_len} > {max_token_length}")
            except Exception as e:
                print(f"[Line {idx}] ❌ Tokenization error: {e}")

print("\n✅ Dataset check complete.")
|
|