Spaces:

mariagrandury
/

language-gap-in-hf-hub

Running

App Files Files Community

language-gap-in-hf-hub / hub_models_by_language.py

mariagrandury

specify resource type in plot names

a938b8a 18 days ago

raw

history blame contribute delete

4.06 kB

	import os
	import pickle
	from datetime import datetime

	import matplotlib.pyplot as plt
	import pandas as pd
	from huggingface_hub import HfApi

	# Fill color used for each language's band in the plots.
	# Keys are the language names used as keys of the models dict.
	LANGUAGE_COLORS = {
	    "english": "orange",
	    "spanish": "blue",
	}


	def fetch_models(cache_file="models_cache.pkl"):
	    """Fetch English-only and Spanish-only models from the Hugging Face Hub, with caching.

	    A model counts for a language only when its single ``language:*`` tag is
	    that language (``language:en`` or ``language:es``); multilingual models
	    are excluded from both lists.

	    Args:
	        cache_file: Path of the pickle file used as a cache. A cache file
	            modified less than 24 hours ago is returned as-is; otherwise the
	            Hub is queried and the cache is rewritten.

	    Returns:
	        A dict with the keys ``"english"`` and ``"spanish"``, each mapping to
	        a list of model objects as yielded by ``HfApi.list_models(full=True)``.
	    """
	    max_cache_age = 24 * 3600  # 24 hours in seconds

	    if os.path.exists(cache_file):
	        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
	        if cache_age < max_cache_age:
	            print("Loading models from cache...")
	            try:
	                # SECURITY: pickle.load can execute arbitrary code stored in
	                # the file. This is acceptable only because the cache is
	                # produced by this script; never point cache_file at
	                # untrusted data.
	                with open(cache_file, "rb") as f:
	                    return pickle.load(f)
	            except (
	                pickle.UnpicklingError,
	                EOFError,
	                AttributeError,
	                ImportError,
	                IndexError,
	            ) as exc:
	                # A truncated or incompatible cache must not block the run
	                # for the next 24 hours: fall through and rebuild it.
	                print(f"Cache could not be read ({exc}), fetching fresh data...")
	        else:
	            print("Cache is older than 24 hours, fetching fresh data...")
	    else:
	        print("No cache found, fetching models from Hugging Face Hub...")

	    hf_api = HfApi()
	    # Materialize once: the listing is scanned once per language below.
	    all_models = list(hf_api.list_models(full=True))

	    filtered_models = {
	        "english": [m for m in all_models if _is_monolingual(m, "en")],
	        "spanish": [m for m in all_models if _is_monolingual(m, "es")],
	    }

	    # Write to a temporary file and swap it in, so an interrupted run never
	    # leaves a half-written cache behind.
	    print("Saving models to cache...")
	    tmp_file = f"{cache_file}.tmp"
	    with open(tmp_file, "wb") as f:
	        pickle.dump(filtered_models, f)
	    os.replace(tmp_file, cache_file)

	    return filtered_models


	def _is_monolingual(model, lang_code):
	    """Return True if ``language:<lang_code>`` is the model's only language tag."""
	    # NOTE(review): ``tags`` is presumably optional on Hub model objects; a
	    # missing tag list is treated as "no language tags" - confirm against
	    # the installed huggingface_hub version.
	    language_tags = {
	        tag for tag in (model.tags or ()) if tag.startswith("language:")
	    }
	    return language_tags == {f"language:{lang_code}"}


	def create_stack_area_plot(models, output_dir):
	    """Create a stacked area plot of cumulative English and Spanish model counts.

	    Models are bucketed by the month of their ``created_at`` date, the monthly
	    counts are accumulated over time, and the two series are stacked. The
	    figure is saved as ``models_stack_area_en_es.png`` inside ``output_dir``.

	    Args:
	        models: Dict with the keys ``"english"`` and ``"spanish"``, each a
	            list of objects exposing a ``created_at`` datetime.
	        output_dir: Directory the PNG is written to.

	    Returns:
	        None. If no model has a creation date, a message is printed and no
	        file is written.
	    """
	    languages = ["english", "spanish"]

	    # Models without a creation timestamp cannot be placed on the time axis,
	    # so they are left out instead of raising AttributeError.
	    dates_by_lang = {
	        lang: [m.created_at.date() for m in models[lang] if m.created_at is not None]
	        for lang in languages
	    }
	    all_dates = [day for lang in languages for day in dates_by_lang[lang]]

	    if not all_dates:
	        print("No models found for any language. Skipping plot creation.")
	        return

	    # Common monthly axis for all languages. The start is snapped to the first
	    # day of its month: with freq="MS" date_range begins at the first month
	    # start on or after `start`, whereas the monthly grouping below labels
	    # each bin by its month start. Without snapping, the earliest month would
	    # be dropped by reindex and its models lost from the cumulative totals.
	    first_month = min(all_dates).replace(day=1)
	    date_range = pd.date_range(start=first_month, end=max(all_dates), freq="MS")

	    # Cumulative monthly counts per language, aligned to the common axis
	    cumulative = {}
	    for lang in languages:
	        df = pd.DataFrame({"Date": dates_by_lang[lang]})
	        df["Count"] = 1
	        df["Date"] = pd.to_datetime(df["Date"])
	        monthly = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
	        # Months without new models get an explicit 0 before accumulating
	        monthly = monthly.reindex(date_range, fill_value=0)
	        cumulative[lang] = monthly.cumsum()

	    plt.figure(figsize=(10, 6))
	    try:
	        plt.stackplot(
	            date_range,
	            [cumulative[lang]["Count"].values for lang in languages],
	            # Labels are derived from the language list so they stay in sync
	            labels=[lang.capitalize() for lang in languages],
	            colors=[LANGUAGE_COLORS[lang] for lang in languages],
	        )

	        plt.xlabel("Date", fontsize=10)
	        plt.ylabel("Cumulative Number of Models", fontsize=10)
	        plt.xticks(rotation=45, fontsize=10)
	        plt.legend(loc="upper left")
	        plt.tight_layout()
	        plt.savefig(f"{output_dir}/models_stack_area_en_es.png")
	    finally:
	        # Release the figure even if drawing or saving fails
	        plt.close()


	def main():
	    """Run the script: fetch the models, report counts per language, draw the plot."""
	    plots_dir = "plots"
	    # exist_ok: an already existing directory is not an error
	    os.makedirs(plots_dir, exist_ok=True)

	    print("Fetching models from Hugging Face Hub...")
	    models_by_lang = fetch_models()

	    # One summary line per language
	    print("\nModel counts:")
	    for language in models_by_lang:
	        print(f"{language.capitalize()}: {len(models_by_lang[language])}")

	    print("\nCreating stack area plot...")
	    create_stack_area_plot(models_by_lang, plots_dir)

	    print(f"Plot has been saved to the '{plots_dir}' directory")


	# Run only when executed as a script, not when imported
	if __name__ == "__main__":
	    main()