# Source: Hugging Face Space "mariagrandury/language-gap-in-hf-hub" (status: Running)
# File size: 4,062 bytes; revision a938b8a

import os
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from huggingface_hub import HfApi

# Fill color used for each language's area in the plot
LANGUAGE_COLORS = dict(english="orange", spanish="blue")


def fetch_models(cache_file="models_cache.pkl"):
    """Fetch English-only and Spanish-only models from the Hugging Face Hub.

    Results are cached on disk as a pickle. A cache file modified less than
    24 hours ago is returned as-is instead of querying the Hub again.

    Args:
        cache_file: Path of the pickle file used as the cache.

    Returns:
        A dict with the keys "english" and "spanish". Each value is the list
        of models whose only ``language:*`` tag is ``language:en`` or
        ``language:es`` respectively.
    """
    max_cache_age = 24 * 3600  # 24 hours in seconds

    # Check if cached data exists and is less than 24 hours old
    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < max_cache_age:
            print("Loading models from cache...")
            try:
                # NOTE: unpickling can execute arbitrary code; cache_file must
                # only ever point at a file written by this function.
                with open(cache_file, "rb") as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError, AttributeError, ImportError):
                # A truncated or incompatible cache should not abort the run;
                # fall through and rebuild it from the Hub.
                print("Cache is unreadable, fetching fresh data...")
        else:
            print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching models from Hugging Face Hub...")

    hf_api = HfApi()
    wanted_tags = {"english": "language:en", "spanish": "language:es"}
    filtered_models = {lang: [] for lang in wanted_tags}

    # Stream the listing once instead of materializing every model on the Hub
    # and scanning the full list once per language.
    for model in hf_api.list_models(full=True):
        # `tags` may be missing; treat that as "no language tags".
        language_tags = {
            tag for tag in (model.tags or []) if tag.startswith("language:")
        }
        # Keep a model only when its single language tag is the wanted one.
        for lang, wanted in wanted_tags.items():
            if language_tags == {wanted}:
                filtered_models[lang].append(model)

    # Cache the filtered models. Write to a temporary file and swap it in so
    # an interrupted dump never leaves a fresh-looking but corrupt cache.
    print("Saving models to cache...")
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, "wb") as f:
        pickle.dump(filtered_models, f)
    os.replace(tmp_file, cache_file)

    return filtered_models


def create_stack_area_plot(models, output_dir):
    """Create a stacked area plot of cumulative English and Spanish models.

    Models are counted per calendar month of their ``created_at`` date, the
    monthly counts are accumulated over time, and the result is saved as
    ``models_stack_area_en_es.png`` inside ``output_dir``.

    Args:
        models: Dict with the keys "english" and "spanish", each mapping to
            an iterable of objects exposing a ``created_at`` datetime.
        output_dir: Directory the PNG is written to.

    Returns:
        None. When neither language has any model, a message is printed and
        no plot is created.
    """
    languages = ["english", "spanish"]

    # Extract creation dates once per language
    dates_by_language = {
        lang: [model.created_at.date() for model in models[lang]]
        for lang in languages
    }
    all_dates = [d for lang in languages for d in dates_by_language[lang]]

    if not all_dates:
        print("No models found for any language. Skipping plot creation.")
        return

    # Create a common monthly range for all languages. The start is snapped
    # back to the first day of its month: with freq="MS" pandas would
    # otherwise roll a mid-month start forward to the next month, and the
    # reindex below would silently drop the first month's models.
    first_month = min(all_dates).replace(day=1)
    date_range = pd.date_range(start=first_month, end=max(all_dates), freq="MS")

    # Create separate cumulative monthly counts for each language
    dfs = {}
    for lang in languages:
        df = pd.DataFrame({"Date": dates_by_language[lang]})
        df["Count"] = 1
        df["Date"] = pd.to_datetime(df["Date"])
        # Bucket by month start, then align to the common range (0 if absent)
        df_grouped = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
        df_grouped = df_grouped.reindex(date_range, fill_value=0)
        dfs[lang] = df_grouped.cumsum()

    # Plot stacked area for English and Spanish
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [dfs[lang]["Count"].values for lang in languages],
        labels=[lang.capitalize() for lang in languages],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )

    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Models", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "models_stack_area_en_es.png"))
    plt.close()


def main():
    """Fetch the models, print per-language counts, and create the plot.

    The plot is written to the "plots" directory, which is created if it
    does not exist yet.
    """
    plots_dir = "plots"
    os.makedirs(plots_dir, exist_ok=True)

    # Load the per-language model lists
    print("Fetching models from Hugging Face Hub...")
    models_by_language = fetch_models()

    # Report how many models were kept for each language
    print("\nModel counts:")
    for language in models_by_language:
        total = len(models_by_language[language])
        print(f"{language.capitalize()}: {total}")

    # Render the stacked area chart
    print("\nCreating stack area plot...")
    create_stack_area_plot(models_by_language, plots_dir)

    print(f"Plot has been saved to the '{plots_dir}' directory")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()