| | import json |
| | from pathlib import Path |
| | from typing import Iterator, Dict |
| |
|
| | |
| | |
| | |
# Input files, grouped by pool: (pool name, number of chunk files).
# Chunk files are numbered from 01 with two digits.
_pool_chunks = (
    ("multiple_choice", 4),
    ("numerical", 3),
    ("regression", 1),
)
files = [
    f"pool_{pool}_chunk_{idx:02d}.json"
    for pool, n_chunks in _pool_chunks
    for idx in range(1, n_chunks + 1)
]

# Destination of the merged JSON array.
out_path = Path("merged_train.json")
| |
|
| | |
| | |
| | |
def iter_records(path: Path) -> Iterator[Dict]:
    """Yield the records stored in *path*.

    The file may use any of three layouts:

    - a single JSON array: each element is yielded in order;
    - a single JSON object: that object is yielded once;
    - JSONL: one JSON value per non-blank line.

    A file that is empty or contains only whitespace yields nothing.

    Raises:
        ValueError: if the file is one JSON value that is neither an array
            nor an object, or if a JSONL line is not valid JSON (the
            message names the 1-based line number and the file).
    """
    # "utf-8-sig" reads plain UTF-8 too, but also drops a leading BOM, which
    # json.loads would otherwise reject.
    text = path.read_text(encoding="utf-8-sig")

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Not a single JSON document: treat the file as JSONL.
        # Split on "\n" only. str.splitlines() would also break on U+2028,
        # U+2029 and U+0085, which may appear unescaped inside JSON strings.
        for i, line in enumerate(text.split("\n"), 1):
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except json.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {i} in {path}: {e}") from e
        return

    if isinstance(data, list):
        yield from data
    elif isinstance(data, dict):
        yield data
    else:
        raise ValueError(f"Unsupported top-level JSON type in {path}")
| |
|
| | |
| | |
| | |
# Make sure the destination directory exists before opening the output.
out_path.parent.mkdir(parents=True, exist_ok=True)

# Stream every record from every input file into one JSON array, writing
# each record on its own line without holding the merged data in memory.
count = 0
with out_path.open("w", encoding="utf-8") as sink:
    sink.write("[\n")
    for name in files:
        for record in iter_records(Path(name)):
            # Every record after the first is preceded by a separator.
            if count:
                sink.write(",\n")
            sink.write(json.dumps(record, ensure_ascii=False))
            count += 1
    sink.write("\n]")

print(f"✓ Wrote {count} records to {out_path.resolve()}")
| |
|