# Source: language-gap-in-hf-hub / hub_models_by_language.py
# Last commit (mariagrandury): "specify resource type in plot names" (a938b8a)
import os
import pickle
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd
from huggingface_hub import HfApi
# Colour used for each language's series in the plots, keyed by language name.
LANGUAGE_COLORS = dict(english="orange", spanish="blue")
def fetch_models(cache_file="models_cache.pkl"):
    """Fetch English-only and Spanish-only models from the Hugging Face Hub.

    A model counts as "English" (resp. "Spanish") when ``language:en``
    (resp. ``language:es``) is the *only* ``language:`` tag it carries.

    Results are cached on disk with pickle and reused while the cache file
    is less than 24 hours old.

    Args:
        cache_file: Path of the pickle file used as cache.

    Returns:
        A dict with the keys ``"english"`` and ``"spanish"``, each mapping to
        the list of matching model objects (or whatever the cache holds when
        a fresh cache file is found).
    """
    # Check if cached data exists and is less than 24 hours old
    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < 24 * 3600:  # 24 hours in seconds
            print("Loading models from cache...")
            try:
                # NOTE: pickle executes arbitrary code on load; only point
                # cache_file at files this script wrote itself.
                with open(cache_file, "rb") as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError):
                # A truncated/corrupt cache should not block every run for
                # the next 24 hours: fall through and rebuild it.
                print("Cache is unreadable, fetching fresh data...")
        else:
            print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching models from Hugging Face Hub...")

    # Map the single language tag we accept to the bucket it belongs to.
    bucket_by_tag = {"language:en": "english", "language:es": "spanish"}
    filtered_models = {bucket: [] for bucket in bucket_by_tag.values()}

    # Stream the listing once instead of materialising every model on the
    # Hub in memory and scanning that list once per language.
    for model in HfApi().list_models(full=True):
        # ``tags`` may be missing (None) on some entries; treat as untagged.
        language_tags = {
            tag for tag in (model.tags or ()) if tag.startswith("language:")
        }
        if len(language_tags) != 1:
            continue  # untagged or multilingual
        bucket = bucket_by_tag.get(next(iter(language_tags)))
        if bucket is not None:
            filtered_models[bucket].append(model)

    # Cache the filtered models. Write to a sibling temp file and rename so
    # an interrupted run never leaves a half-written cache behind.
    print("Saving models to cache...")
    tmp_file = f"{cache_file}.tmp"
    with open(tmp_file, "wb") as f:
        pickle.dump(filtered_models, f)
    os.replace(tmp_file, cache_file)

    return filtered_models
def create_stack_area_plot(models, output_dir):
    """Create a stacked area plot of cumulative English and Spanish models.

    Models are counted per calendar month of their ``created_at`` date, the
    monthly counts are accumulated over time, and both series are stacked in
    one figure saved as ``models_stack_area_en_es.png`` inside *output_dir*.

    Args:
        models: Mapping with the keys ``"english"`` and ``"spanish"``; each
            value is an iterable of objects exposing a ``created_at``
            datetime attribute. Entries whose ``created_at`` is ``None`` are
            ignored.
        output_dir: Directory the PNG file is written to.

    Returns:
        None. When no model has a creation date, a message is printed and no
        file is written.
    """
    languages = ["english", "spanish"]

    # Collect creation dates per language, skipping entries without one.
    dates_by_lang = {
        lang: [m.created_at.date() for m in models[lang] if m.created_at is not None]
        for lang in languages
    }
    all_dates = [d for lang in languages for d in dates_by_lang[lang]]

    if not all_dates:
        print("No models found for any language. Skipping plot creation.")
        return

    # Common monthly axis for all languages. The start is snapped to the
    # first day of the earliest month: with freq="MS", date_range begins at
    # the first month-start ON OR AFTER `start`, so passing the raw minimum
    # date would drop the earliest month's bin when reindexing below.
    first_month = min(all_dates).replace(day=1)
    date_range = pd.date_range(start=first_month, end=max(all_dates), freq="MS")

    # Cumulative monthly counts per language, aligned on the common axis.
    dfs = {}
    for lang in languages:
        df = pd.DataFrame({"Date": dates_by_lang[lang]})
        df["Count"] = 1
        df["Date"] = pd.to_datetime(df["Date"])

        # Bin by month start, then fill months without models with 0.
        df_grouped = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
        df_grouped = df_grouped.reindex(date_range, fill_value=0)
        dfs[lang] = df_grouped.cumsum()

    # Plot stacked area for English and Spanish
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [dfs[lang]["Count"].values for lang in languages],
        labels=[lang.capitalize() for lang in languages],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )
    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Models", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "models_stack_area_en_es.png"))
    plt.close()
def main():
    """Run the pipeline: fetch models, report per-language counts, draw the plot."""
    plots_dir = "plots"
    # exist_ok keeps repeated runs from failing on an existing directory
    os.makedirs(plots_dir, exist_ok=True)

    print("Fetching models from Hugging Face Hub...")
    models_by_language = fetch_models()

    # One summary line per language
    print("\nModel counts:")
    for language in models_by_language:
        total = len(models_by_language[language])
        print(f"{language.capitalize()}: {total}")

    print("\nCreating stack area plot...")
    create_stack_area_plot(models_by_language, plots_dir)
    print(f"Plot has been saved to the '{plots_dir}' directory")
# Script entry point: only run the pipeline when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()