import os, json, numpy as np, pandas as pd
import gradio as gr
import faiss
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from logic.cleaning import clean_dataframe
from logic.search import SloganSearcher
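
# ---- Configuration ----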
ASSETS_DIR = "assets"
DATA_PATH = "data/slogan.csv"
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
NORMALIZE = True
GEN_MODEL_NAME = "google/flan-t5-base"
NUM_GEN_CANDIDATES = 6
MAX_NEW_TOKENS = 24
TEMPERATURE = 0.9
TOP_P = 0.95
NOVELTY_SIM_THRESHOLD = 0.80
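
# Paths of the cached artifacts produced by _build_assets()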
META_PATH = os.path.join(ASSETS_DIR, "meta.json")
PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
INDEX_PATH = os.path.join(ASSETS_DIR, "faiss.index")
EMB_PATH = os.path.join(ASSETS_DIR, "embeddings.npy")
def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)
def _build_assets():
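    """Clean the CSV, embed its texts, and persist the parquet, FAISS index, and metadata."""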
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
    os.makedirs(ASSETS_DIR, exist_ok=True)
    _log(f"Loading dataset: {DATA_PATH}")
    df = pd.read_csv(DATA_PATH)
    _log(f"Rows before cleaning: {len(df)}")
    df = clean_dataframe(df)
    _log(f"Rows after cleaning: {len(df)}")
    # Embed descriptions when available, falling back to taglines row by row.
    if "description" in df.columns and df["description"].notna().any():
        texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
        text_col, fallback_col = "description", "tagline"
    else:
        texts = df["tagline"].astype(str).tolist()
        text_col, fallback_col = "tagline", "tagline"
    _log(f"Encoding with {MODEL_NAME} (normalize={NORMALIZE}) …")
    encoder = SentenceTransformer(MODEL_NAME)
    emb = encoder.encode(texts, batch_size=64, convert_to_numpy=True, normalize_embeddings=NORMALIZE)
    dim = emb.shape[1]
    # With normalized embeddings, inner product equals cosine similarity.
    index = faiss.IndexFlatIP(dim) if NORMALIZE else faiss.IndexFlatL2(dim)
    index.add(emb)
    _log("Persisting assets …")
    df.to_parquet(PARQUET_PATH, index=False)
    faiss.write_index(index, INDEX_PATH)
    np.save(EMB_PATH, emb)
    meta = {
        "model_name": MODEL_NAME,
        "dim": int(dim),
        "normalized": NORMALIZE,
        "metric": "ip" if NORMALIZE else "l2",
        "row_count": int(len(df)),
        "text_col": text_col,
        "fallback_col": fallback_col,
    }
    with open(META_PATH, "w") as f:
        json.dump(meta, f, indent=2)
    _log("Assets built successfully.")
def _ensure_assets():
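    """Build assets on first run, or rebuild them if any file is missing or unreadable."""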
    need = False
    for p in (META_PATH, PARQUET_PATH, INDEX_PATH):
        if not os.path.exists(p):
            _log(f"Missing asset: {p}")
            need = True
    if need:
        _log("Building assets from scratch …")
        _build_assets()
        return
    try:
        pd.read_parquet(PARQUET_PATH)
    except Exception as e:
        _log(f"Parquet read failed ({e}); rebuilding assets.")
        _build_assets()
_ensure_assets()
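
# ---- Load the searcher, encoder, and generator once at startup ----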
searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)
with open(META_PATH) as f:
    meta = json.load(f)
_encoder = SentenceTransformer(meta["model_name"])
_gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
_gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
# ---- Prompt template (adjust the wording to taste) ----
def _prompt_for(description: str) -> str:
    return (
        "You are a professional slogan writer. "
        "Write ONE original, catchy startup slogan under 8 words, Title Case, no punctuation. "
        "Do not copy examples. Description:\n"
        f"{description}\nSlogan:"
    )
def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
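    """Sample n candidate slogans from the seq2seq model for the given description."""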
    prompt = _prompt_for(description)
    # Encode the prompt once; num_return_sequences handles the sampling fan-out.
    # (Batching the prompt n times *and* requesting n sequences would yield n*n outputs.)
    inputs = _gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = _gen_model.generate(
        **inputs,
        do_sample=True,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        num_return_sequences=n,
        max_new_tokens=MAX_NEW_TOKENS,
        eos_token_id=_gen_tokenizer.eos_token_id,
    )
    texts = _gen_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return [t.replace("Slogan:", "").strip().strip('"') for t in texts if t.strip()]
def _pick_most_novel(candidates, retrieved_texts):
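    """Return the candidate least similar to the retrieved slogans, preferring those below the threshold."""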
    if not candidates:
        return None
    if not retrieved_texts:
        return candidates[0]
    # Novelty = 1 - max cosine similarity to any retrieved slogan
    # (embeddings are normalized, so the dot product is cosine similarity).
    R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True)
    best, best_novelty = None, -1.0
    fallback, fallback_novelty = None, -1.0
    for c in candidates:
        c_emb = _encoder.encode([c], convert_to_numpy=True, normalize_embeddings=True)
        max_sim = float(np.max(np.dot(R, c_emb[0])))
        novelty = 1.0 - max_sim
        if novelty > fallback_novelty:
            fallback, fallback_novelty = c, novelty
        if max_sim < NOVELTY_SIM_THRESHOLD and novelty > best_novelty:
            best, best_novelty = c, novelty
    # If every candidate is too close to an existing slogan, return the most novel one anyway.
    return best if best is not None else fallback
def run_pipeline(user_description: str):
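    """Retrieve the top-3 similar slogans, then add one novel generated suggestion."""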
    if not user_description or not user_description.strip():
        return "Please enter a description."
    retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
    retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
    gens = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
    generated = _pick_most_novel(gens, retrieved_texts) or (gens[0] if gens else "—")
    lines = ["### 🔎 Top 3 similar slogans"]
    if retrieved_texts:
        for i, s in enumerate(retrieved_texts, 1):
            lines.append(f"{i}. {s}")
    else:
        lines.append("_No similar slogans found._")
    lines.append("\n### ✨ AI-generated suggestion")
    lines.append(generated)
    return "\n".join(lines)
with gr.Blocks(title="Slogan Finder") as demo:
    gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
    query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")
    btn = gr.Button("Get slogans", variant="primary")
    out = gr.Markdown()
    btn.click(run_pipeline, inputs=[query], outputs=out)
demo.queue(max_size=64).launch()