Spaces:

ruanchaves
/

napolab

Running

File size: 6,101 Bytes

#!/usr/bin/env python3
"""

Script to extract model metadata and benchmark scores from JSON result files
in a repository folder and save them as a CSV file for import into the benchmark.

"""

import pandas as pd
import json
import os
import sys
import argparse
from pathlib import Path

def is_valid_json_file(file_path):
    """
    Check whether a path is a readable JSON file whose top-level value is a dict.

    Args:
        file_path (str or Path): Path to the candidate JSON file.

    Returns:
        bool: True if the file can be opened, decoded as UTF-8, parsed as
        JSON, and the parsed value is a dict; False in every other case
        (missing file, directory, unreadable file, bad encoding, invalid
        JSON, or a non-dict top-level value).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError covers FileNotFoundError as well as IsADirectoryError and
        # PermissionError (e.g. a directory that happens to be named
        # "something.json"). ValueError is the common base class of
        # json.JSONDecodeError and UnicodeDecodeError.
        return False
    return isinstance(data, dict)

def find_json_files(repo_path):
    """
    Recursively find all valid JSON files in the repository folder.

    A file counts as valid when ``is_valid_json_file`` accepts it. Progress
    and error messages are printed to stdout.

    Args:
        repo_path (str or Path): Path to the repository folder.

    Returns:
        list: Paths (``pathlib.Path``) of the valid JSON files, sorted so the
        result does not depend on filesystem enumeration order. An empty
        list is returned when ``repo_path`` does not exist or is not a
        directory.
    """
    repo_path = Path(repo_path)

    if not repo_path.exists():
        print(f"Error: Repository path '{repo_path}' does not exist.")
        return []

    if not repo_path.is_dir():
        print(f"Error: Repository path '{repo_path}' is not a directory.")
        return []

    print(f"Scanning repository: {repo_path}")

    json_files = []
    # rglob yields entries in filesystem order; sorting keeps the output
    # (and therefore the CSV row order) reproducible between runs.
    for file_path in sorted(repo_path.rglob("*.json")):
        # The glob pattern also matches directories named "*.json"; only
        # regular files can be opened and parsed.
        if not file_path.is_file():
            continue
        if is_valid_json_file(file_path):
            json_files.append(file_path)
            print(f"Found valid JSON file: {file_path}")

    print(f"Total valid JSON files found: {len(json_files)}")
    return json_files

def extract_data_from_json(json_file_path):
    """
    Build one leaderboard row from a single JSON result file.

    The file must contain the top-level keys ``config_general`` and
    ``results``. Model fields are read from ``config_general`` and the
    scores from ``results['all_grouped']``; anything absent falls back to a
    default ('' for the name, False for the private flag, 0 for the
    parameter count, 0.0 for each score).

    Args:
        json_file_path (Path): Path to the JSON file.

    Returns:
        dict or None: The row as a dict, or None when a required top-level
        key is missing or any error occurs while reading/processing the
        file (the error is printed).
    """
    score_keys = ('assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive')

    try:
        with open(json_file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Both sections are mandatory; bail out quietly when one is absent.
        if not ('config_general' in payload and 'results' in payload):
            return None

        general = payload['config_general']
        outcome = payload['results']

        # Model metadata first, so the column order matches the CSV layout.
        record = {'json_file': str(json_file_path)}
        record['model_name'] = general.get('model_name', '')
        record['model_private'] = general.get('model_private', False)
        record['model_num_parameters'] = general.get('model_num_parameters', 0)

        # Then one column per benchmark score.
        grouped_scores = outcome.get('all_grouped', {})
        for key in score_keys:
            record[key] = grouped_scores.get(key, 0.0)

        return record

    except Exception as err:
        print(f"Error processing {json_file_path}: {err}")
        return None

def extract_portuguese_leaderboard(repo_path, output_file='portuguese_leaderboard.csv'):
    """
    Extract data from JSON files in the repository folder and save as CSV.

    Every valid JSON file found under ``repo_path`` is turned into one row
    (see ``extract_data_from_json``); the rows are written to ``output_file``
    and a short preview plus summary statistics are printed to stdout.
    Nothing is written when no file yields a row.

    Args:
        repo_path (str): Path to the repository folder.
        output_file (str): Destination CSV path. Defaults to
            'portuguese_leaderboard.csv' in the current working directory.

    Returns:
        None
    """
    print("Scanning repository for JSON files...")

    # Find all JSON files
    json_files = find_json_files(repo_path)

    if not json_files:
        print("No valid JSON files found in the repository.")
        return

    # Process each JSON file, keeping only those that produced a row
    data = []
    total_files = len(json_files)
    for position, json_file in enumerate(json_files, start=1):
        print(f"Processing file {position}/{total_files}: {json_file.name}")

        row_data = extract_data_from_json(json_file)
        if row_data:
            data.append(row_data)

        # Print progress every 10 files
        if position % 10 == 0:
            print(f"  Processed {position} files...")

    if not data:
        print("No valid data extracted from JSON files.")
        return

    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False)

    print(f"\nSuccessfully extracted {len(df)} models to {output_file}")

    # Show first few entries as preview
    print("\nFirst 5 entries:")
    print(df.head().to_string(index=False))

    # Show some statistics
    print("\nStatistics:")
    print(f"Total models: {len(df)}")

    # Count via an explicit bool conversion instead of `~column`: if any file
    # stores a null/non-bool flag the column has object dtype, where `~`
    # either raises or yields integers such as -2. A missing flag (None)
    # counts as public.
    private_count = int(df['model_private'].map(bool).sum())
    print(f"Private models: {private_count}")
    print(f"Public models: {len(df) - private_count}")

    # Average scores; non-numeric or null scores are treated as missing so a
    # single malformed file cannot break the summary.
    score_columns = ['assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
    numeric_scores = df[score_columns].apply(pd.to_numeric, errors='coerce')
    print("\nAverage scores:")
    print(numeric_scores.mean().round(2))

    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    print("\nDataFrame info:")
    df.info()

def main():
    """Parse the command line, run the extraction and report the outcome.

    Takes one positional argument, ``repo_path``. Any exception raised by the
    extraction is printed and the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description='Extract Portuguese LLM Leaderboard data from JSON files'
    )
    cli.add_argument(
        'repo_path',
        help='Path to the repository folder containing JSON files',
    )
    options = cli.parse_args()

    # Banner followed by a 50-character rule.
    for banner_line in ("Portuguese LLM Leaderboard Data Extractor", "=" * 50):
        print(banner_line)

    try:
        extract_portuguese_leaderboard(options.repo_path)
        print("\nExtraction completed successfully!")
    except Exception as failure:
        print(f"Error during extraction: {failure}")
        sys.exit(1)

# Run the command-line entry point only when this file is executed as a
# script; importing it as a module has no side effects.
if __name__ == "__main__":
    main()