Spaces:
Sleeping
Sleeping
""" | |
Run this LOCALLY to build assets/ from your real dataset.
1) Provide a CSV or Parquet file with at least a 'tagline' column (a 'description' column is optional).
2) Adjust INPUT_PATH below.
3) Run: python prepare_assets.py
Then commit assets/ into your Space repo (or upload it to a Dataset repo).
""" | |
import os, json, numpy as np, pandas as pd | |
from sentence_transformers import SentenceTransformer | |
import faiss | |
from logic.cleaning import clean_dataframe | |
# ---- CHANGE THIS ----
# Raw dataset to build from (CSV or Parquet), e.g., an export from your notebook.
INPUT_PATH = "/mnt/data/hf-slogan-space/data/raw_slogans.csv"
# Directory that receives every generated asset file.
ASSETS_DIR = "assets"
# Sentence-transformers model used to embed the texts.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# True: normalized embeddings + inner-product index (cosine); set False if you prefer L2.
NORMALIZE = True
def main():
    """Build the search assets in ``ASSETS_DIR`` from the dataset at ``INPUT_PATH``.

    Steps, in order:
      1. Load the raw CSV/Parquet file.
      2. Clean it with ``clean_dataframe`` and save ``slogans_clean.parquet``.
      3. Pick the text to embed: ``description`` (falling back to ``tagline``
         where it is missing) when that column exists, else ``tagline``.
      4. Encode the texts with ``MODEL_NAME`` and save ``embeddings.npy``.
      5. Build a flat FAISS index (inner product when ``NORMALIZE`` is true,
         L2 otherwise) and save ``faiss.index``.
      6. Write ``meta.json`` describing the build and print a summary.

    Raises:
        ValueError: if ``INPUT_PATH`` is neither CSV nor Parquet, if the
            cleaned data has no ``tagline`` column, or if it has no rows.
    """
    os.makedirs(ASSETS_DIR, exist_ok=True)

    # Load (extension match is case-insensitive so "DATA.CSV" is accepted too)
    lowered_path = INPUT_PATH.lower()
    if lowered_path.endswith(".csv"):
        df = pd.read_csv(INPUT_PATH)
    elif lowered_path.endswith(".parquet"):
        df = pd.read_parquet(INPUT_PATH)
    else:
        raise ValueError("Use CSV or Parquet for INPUT_PATH")

    # Clean using your real rules
    df_clean = clean_dataframe(df)

    # Fail early, before any asset is written, instead of hitting a KeyError
    # or an index error halfway through the build.
    if "tagline" not in df_clean.columns:
        raise ValueError("Cleaned data must contain a 'tagline' column")
    if df_clean.empty:
        raise ValueError("Cleaned data has no rows; nothing to index")

    df_clean.to_parquet(os.path.join(ASSETS_DIR, "slogans_clean.parquet"), index=False)

    # Choose text field
    if "description" in df_clean.columns:
        texts = df_clean["description"].fillna(df_clean["tagline"]).astype(str).tolist()
        text_col, fallback_col = "description", "tagline"
    else:
        texts = df_clean["tagline"].astype(str).tolist()
        text_col, fallback_col = "tagline", "tagline"

    # Encode
    encoder = SentenceTransformer(MODEL_NAME)
    emb = encoder.encode(
        texts,
        batch_size=64,
        convert_to_numpy=True,
        normalize_embeddings=NORMALIZE,
    )
    # FAISS only accepts C-contiguous float32 matrices.
    emb = np.ascontiguousarray(emb, dtype=np.float32)

    # Save embeddings numpy (optional; not required at runtime)
    np.save(os.path.join(ASSETS_DIR, "embeddings.npy"), emb)

    # Build FAISS index
    dim = emb.shape[1]
    if NORMALIZE:
        index = faiss.IndexFlatIP(dim)  # cosine if normalized
    else:
        index = faiss.IndexFlatL2(dim)
    index.add(emb)
    faiss.write_index(index, os.path.join(ASSETS_DIR, "faiss.index"))

    meta = {
        "model_name": MODEL_NAME,
        "dim": int(dim),
        "normalized": NORMALIZE,
        "metric": "ip" if NORMALIZE else "l2",
        "row_count": int(len(df_clean)),
        "text_col": text_col,
        "fallback_col": fallback_col,
    }
    # Explicit encoding keeps the file identical across platforms.
    with open(os.path.join(ASSETS_DIR, "meta.json"), "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)

    print("✅ Assets built in", ASSETS_DIR)
    print(meta)
# Run the asset build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()