| | """ |
| | Generate training datasets for ALL frameworks automatically. |
| | |
| | This script auto-discovers all chunk files and processes them, |
| | generating separate datasets for each framework PLUS a combined dataset. |
| | |
| | Usage: |
| | python scripts/generate_all_frameworks.py |
| | |
| | Output Structure: |
| | data/processed/training_crewai/ |
| | - positive_pairs.json |
| | - triplets.json |
| | data/processed/training_langgraph/ |
| | - positive_pairs.json |
| | - triplets.json |
| | data/processed/training_combined/ |
| | - positive_pairs.json (ALL frameworks merged) |
| | - triplets.json (ALL frameworks merged) |
| | """ |
| |
|
| | import sys |
| | import json |
| | from pathlib import Path |
| | from typing import List, Tuple |
| | from dataclasses import asdict |
| |
|
| | |
| | PROJECT_ROOT = Path(__file__).parent.parent |
| | sys.path.insert(0, str(PROJECT_ROOT)) |
| |
|
| | from src.task_3_data_engineering.export.pairs_triplets_generator import ( |
| | generate_pairs_and_triplets, |
| | PositivePair, |
| | Triplet |
| | ) |
| |
|
| |
|
def discover_all_chunk_files() -> List[Tuple[Path, str]]:
    """
    Locate every chunks JSONL file in the workspace.

    Two locations are scanned: a fixed set of locally-saved chunk files
    under data/processed/chunks/, and any per-repo `*_chunks.jsonl`
    exports under data/processed/repos/.

    Returns:
        List of (chunk_path, framework_name) tuples.
    """
    discovered: List[Tuple[Path, str]] = []

    # Fixed, locally-saved chunk locations are checked first.
    candidate_paths = (
        PROJECT_ROOT / "data" / "processed" / "chunks" / "Local_saved_files" / "chunks.jsonl",
        PROJECT_ROOT / "data" / "processed" / "chunks" / "sample_code" / "chunks.jsonl",
    )
    for candidate in candidate_paths:
        if not candidate.exists():
            continue
        # Map known directories to framework names; fall back to the
        # containing directory's name for anything unrecognized.
        path_text = str(candidate)
        if "Local_saved_files" in path_text:
            framework_name = "crewai"
        elif "sample_code" in path_text:
            framework_name = "sample"
        else:
            framework_name = candidate.parent.name
        discovered.append((candidate, framework_name))

    # Then pick up any per-repo chunk exports.
    repos_root = PROJECT_ROOT / "data" / "processed" / "repos"
    if repos_root.exists():
        for entry in repos_root.iterdir():
            if not entry.is_dir():
                continue
            for chunk_file in entry.glob("*_chunks.jsonl"):
                # Framework name is the first token of the stem, e.g.
                # "langgraph_v1_chunks.jsonl" -> "langgraph".
                framework_name = chunk_file.stem.replace("_chunks", "").split("_")[0]
                discovered.append((chunk_file, framework_name))

    return discovered
| |
|
| |
|
def merge_datasets(all_pairs: "List[List[PositivePair]]",
                   all_triplets: "List[List[Triplet]]",
                   output_dir: Path) -> "Tuple[int, int]":
    """
    Merge per-framework datasets into combined JSON + JSONL files.

    Writes four files into ``output_dir``: positive_pairs.json,
    positive_pairs.jsonl, triplets.json, and triplets.jsonl.

    Args:
        all_pairs: One list of PositivePair objects per framework.
        all_triplets: One list of Triplet objects per framework.
        output_dir: Directory for the combined files (created if missing).

    Returns:
        Tuple of (combined pair count, combined triplet count).
        NOTE: the original annotation said ``-> None`` but the function
        has always returned this tuple (and the caller unpacks it).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Flatten the per-framework lists into single combined lists.
    combined_pairs = [pair for pairs in all_pairs for pair in pairs]
    combined_triplets = [triplet for triplets in all_triplets for triplet in triplets]

    def _write_json(path: Path, records) -> None:
        # Pretty-printed JSON array, for human inspection.
        with open(path, "w", encoding="utf-8") as f:
            json.dump([asdict(r) for r in records], f, indent=2, ensure_ascii=False)

    def _write_jsonl(path: Path, records) -> None:
        # One JSON object per line, for streaming consumers.
        with open(path, "w", encoding="utf-8") as f:
            for r in records:
                f.write(json.dumps(asdict(r), ensure_ascii=False) + "\n")

    pairs_json_path = output_dir / "positive_pairs.json"
    _write_json(pairs_json_path, combined_pairs)
    print(f"✅ Combined positive pairs (JSON): {pairs_json_path}")

    pairs_jsonl_path = output_dir / "positive_pairs.jsonl"
    _write_jsonl(pairs_jsonl_path, combined_pairs)
    print(f"✅ Combined positive pairs (JSONL): {pairs_jsonl_path}")

    triplets_json_path = output_dir / "triplets.json"
    _write_json(triplets_json_path, combined_triplets)
    print(f"✅ Combined triplets (JSON): {triplets_json_path}")

    triplets_jsonl_path = output_dir / "triplets.jsonl"
    _write_jsonl(triplets_jsonl_path, combined_triplets)
    print(f"✅ Combined triplets (JSONL): {triplets_jsonl_path}")

    return len(combined_pairs), len(combined_triplets)
| |
|
| |
|
def main():
    """
    Generate datasets for every discovered framework, then a combined one.

    Steps:
      1. Discover all chunk files in the workspace.
      2. Generate pairs/triplets per framework into training_<framework>/.
      3. Merge everything into training_combined/.
      4. Print a summary of successes, failures, and file counts.
    """
    # NOTE(review): the emoji in the banners below were mojibake in the
    # original source (several were split mid-character across lines,
    # which is a syntax error); plausible glyphs restored — confirm the
    # intended ones against version control.
    print("=" * 80)
    print("🚀 MULTI-FRAMEWORK TRAINING DATA GENERATOR")
    print("=" * 80)

    print("\n🔍 Discovering chunk files...")
    chunk_files = discover_all_chunk_files()

    # Bail out early with guidance when there is nothing to process.
    if not chunk_files:
        print("❌ No chunk files found!")
        print("\nPlease ensure chunks exist in:")
        print("  - data/processed/chunks/Local_saved_files/")
        print("  - data/processed/repos/*/")
        return

    print(f"✅ Found {len(chunk_files)} chunk file(s):\n")
    for path, framework in chunk_files:
        print(f"  📦 {framework}: {path.name}")

    print("\n" + "=" * 80)
    print("📊 PROCESSING INDIVIDUAL FRAMEWORKS")
    print("=" * 80 + "\n")

    results = []
    all_pairs = []
    all_triplets = []

    for i, (chunks_path, framework) in enumerate(chunk_files, 1):
        print(f"\n[{i}/{len(chunk_files)}] Processing {framework.upper()}...")
        print("-" * 60)

        output_dir = PROJECT_ROOT / "data" / "processed" / f"training_{framework}"

        try:
            pairs, triplets = generate_pairs_and_triplets(
                chunks_path=chunks_path,
                output_dir=output_dir,
                num_pairs=100,
                num_triplets=100,
                variance=5,
                export_format="both"
            )

            # Collect per-framework outputs for the combined merge below.
            all_pairs.append(pairs)
            all_triplets.append(triplets)

            results.append({
                "framework": framework,
                "status": "✅ SUCCESS",
                "pairs": len(pairs),
                "variations": sum(len(p.variations) for p in pairs),
                "triplets": len(triplets),
                "output": output_dir
            })

        except Exception as e:
            # Record the failure but keep processing remaining frameworks.
            results.append({
                "framework": framework,
                "status": f"❌ FAILED: {str(e)}",
                "output": output_dir
            })

    print("\n" + "=" * 80)
    print("🔄 CREATING COMBINED DATASET (ALL FRAMEWORKS)")
    print("=" * 80 + "\n")

    combined_dir = PROJECT_ROOT / "data" / "processed" / "training_combined"
    total_pairs, total_triplets = merge_datasets(all_pairs, all_triplets, combined_dir)

    print("\n" + "=" * 80)
    print("📈 FINAL SUMMARY")
    print("=" * 80 + "\n")

    print("INDIVIDUAL FRAMEWORK DATASETS:")
    print("-" * 40)
    for result in results:
        print(f"\n📦 {result['framework'].upper()}")
        print(f"   Status: {result['status']}")
        # Count keys only exist for successful runs.
        if "pairs" in result:
            print(f"   - positive_pairs.json: {result['pairs']} docs ({result['variations']} variations)")
            print(f"   - triplets.json: {result['triplets']} docs")
        print(f"   📁 {result['output']}")

    print("\n\nCOMBINED DATASET (ALL FRAMEWORKS):")
    print("-" * 40)
    print(f"📁 {combined_dir}")
    print(f"   - positive_pairs.json: {total_pairs} docs")
    print(f"   - triplets.json: {total_triplets} docs")

    # Each successful framework yields 4 files (JSON + JSONL for pairs and
    # triplets); the combined dataset adds 4 more.
    successful = sum(1 for r in results if "SUCCESS" in r["status"])
    total_files = (successful * 4) + 4

    print(f"\n\n🎉 TOTAL FILES GENERATED: {total_files}")
    print(f"   - {successful} frameworks × 4 files = {successful * 4} files")
    print(f"   - Combined dataset = 4 files")
    print("=" * 80)
| |
|
| |
|
| | if __name__ == "__main__": |
| | main() |
| |
|