|
import os |
|
import pickle |
|
from datetime import datetime |
|
|
|
import matplotlib.pyplot as plt |
|
import pandas as pd |
|
from huggingface_hub import HfApi |
|
|
|
|
|
# Fill colour for each language's band in the stacked area plot.
LANGUAGE_COLORS = dict(
    english="orange",
    spanish="blue",
)
|
|
|
|
|
def _has_only_language(model, code):
    """Return True when ``language:<code>`` is the only language tag on *model*.

    A model with no language tag at all, or with any additional
    ``language:*`` tag, is rejected.
    """
    wanted = f"language:{code}"
    # Treat missing/None tags as "no tags" so a single incomplete record
    # cannot abort the whole scan.
    language_tags = [t for t in (model.tags or ()) if t.startswith("language:")]
    return bool(language_tags) and all(t == wanted for t in language_tags)


def fetch_models(cache_file="models_cache.pkl"):
    """Fetch English-only and Spanish-only models from the Hub, with caching.

    A cache file younger than 24 hours (judged by its modification time) is
    loaded and returned as-is. Otherwise every model on the Hub is listed,
    filtered, written to ``cache_file`` and returned.

    Args:
        cache_file: Path of the pickle file used as the cache.

    Returns:
        A dict with the keys ``"english"`` and ``"spanish"``, each mapping to
        the list of models whose only language tag is ``language:en`` /
        ``language:es`` respectively.
    """
    max_cache_age = 24 * 3600  # seconds

    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < max_cache_age:
            print("Loading models from cache...")
            try:
                with open(cache_file, "rb") as f:
                    # NOTE: unpickling can execute arbitrary code. Only point
                    # cache_file at a file this script wrote itself.
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError):
                # A truncated/corrupt cache (e.g. an interrupted earlier run)
                # should not be fatal; rebuild it below.
                print("Cache is unreadable, fetching fresh data...")
        else:
            print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching models from Hugging Face Hub...")

    hf_api = HfApi()
    language_codes = (("english", "en"), ("spanish", "es"))
    filtered_models = {lang: [] for lang, _ in language_codes}

    # Stream the listing in a single pass instead of materialising every
    # model on the Hub in memory and scanning the list once per language.
    for model in hf_api.list_models(full=True):
        for lang, code in language_codes:
            if _has_only_language(model, code):
                filtered_models[lang].append(model)

    print("Saving models to cache...")
    with open(cache_file, "wb") as f:
        pickle.dump(filtered_models, f)

    return filtered_models
|
|
|
|
|
def create_stack_area_plot(models, output_dir):
    """Save a stacked area plot of cumulative English and Spanish model counts.

    Models are bucketed by the calendar month of their ``created_at`` date,
    the monthly counts are accumulated per language, and the two cumulative
    series are stacked on a shared monthly axis. The image is written to
    ``<output_dir>/models_stack_area_en_es.png``.

    Args:
        models: Dict with the keys ``"english"`` and ``"spanish"``, each a
            list of objects exposing a ``created_at`` datetime.
        output_dir: Existing directory the PNG is written into.

    Returns:
        None. When no model has a creation date, a message is printed and no
        file is written.
    """
    languages = ["english", "spanish"]

    # Extract the creation dates once per language; records without a
    # creation date cannot be placed on the time axis and are skipped.
    dates_by_lang = {
        lang: [m.created_at.date() for m in models[lang] if m.created_at is not None]
        for lang in languages
    }
    all_dates = [d for lang in languages for d in dates_by_lang[lang]]

    if not all_dates:
        print("No models found for any language. Skipping plot creation.")
        return

    # The monthly grouper below labels each bucket with the first day of its
    # month, so the axis has to start on the first of the earliest month.
    # Starting at the raw minimum date would make date_range skip ahead to the
    # next month start and the reindex would silently drop the earliest month.
    first_month = pd.Timestamp(min(all_dates)).replace(day=1)
    last_date = pd.Timestamp(max(all_dates))
    date_range = pd.date_range(start=first_month, end=last_date, freq="MS")

    dfs = {}
    for lang in languages:
        df = pd.DataFrame({"Date": dates_by_lang[lang]})
        df["Count"] = 1
        df["Date"] = pd.to_datetime(df["Date"])

        # Monthly totals, aligned to the shared axis (months without models
        # count as 0), then accumulated over time.
        df_grouped = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
        df_grouped = df_grouped.reindex(date_range, fill_value=0)
        dfs[lang] = df_grouped.cumsum()

    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [dfs[lang]["Count"].values for lang in languages],
        labels=[lang.capitalize() for lang in languages],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )

    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Models", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "models_stack_area_en_es.png"))
    plt.close()
|
|
|
|
|
def main():
    """Fetch the model lists, print per-language counts, and render the plot."""
    output_dir = "plots"
    # Make sure the destination folder exists before anything is written to it.
    os.makedirs(output_dir, exist_ok=True)

    print("Fetching models from Hugging Face Hub...")
    models = fetch_models()

    print("\nModel counts:")
    for lang in models:
        models_list = models[lang]
        print(f"{lang.capitalize()}: {len(models_list)}")

    print("\nCreating stack area plot...")
    create_stack_area_plot(models, output_dir)

    print(f"Plot has been saved to the '{output_dir}' directory")
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|