import os, json, numpy as np, pandas as pd
import gradio as gr
import faiss

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from logic.cleaning import clean_dataframe
from logic.search import SloganSearcher

ASSETS_DIR = "assets"
DATA_PATH  = "data/slogan.csv"

# Retrieval: sentence-embedding model; normalized embeddings make inner product = cosine similarity
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
NORMALIZE  = True

# Generation: FLAN-T5 sampling settings
GEN_MODEL_NAME = "google/flan-t5-base"
NUM_GEN_CANDIDATES = 6        # slogan candidates sampled per request
MAX_NEW_TOKENS = 24           # slogans are short; cap generated length
TEMPERATURE = 0.9
TOP_P = 0.95
NOVELTY_SIM_THRESHOLD = 0.80  # candidates at/above this similarity to retrieved slogans are fallback-only

META_PATH    = os.path.join(ASSETS_DIR, "meta.json")
PARQUET_PATH = os.path.join(ASSETS_DIR, "slogans_clean.parquet")
INDEX_PATH   = os.path.join(ASSETS_DIR, "faiss.index")
EMB_PATH     = os.path.join(ASSETS_DIR, "embeddings.npy")

def _log(m): print(f"[SLOGAN-SPACE] {m}", flush=True)
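
# ---- Asset building: load CSV, clean, embed, index with FAISS, persist ----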

def _build_assets():
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f"Dataset not found at {DATA_PATH} (CSV with columns: 'tagline', 'description').")
    os.makedirs(ASSETS_DIR, exist_ok=True)

    _log(f"Loading dataset: {DATA_PATH}")
    df = pd.read_csv(DATA_PATH)

    _log(f"Rows before cleaning: {len(df)}")
    df = clean_dataframe(df)
    _log(f"Rows after cleaning: {len(df)}")

    if "description" in df.columns and df["description"].notna().any():
        texts = df["description"].fillna(df["tagline"]).astype(str).tolist()
        text_col, fallback_col = "description", "tagline"
    else:
        texts = df["tagline"].astype(str).tolist()
        text_col, fallback_col = "tagline", "tagline"

    _log(f"Encoding with {MODEL_NAME} (normalize={NORMALIZE}) …")
    encoder = SentenceTransformer(MODEL_NAME)
    emb = encoder.encode(texts, batch_size=64, convert_to_numpy=True, normalize_embeddings=NORMALIZE)

    dim = emb.shape[1]
    index = faiss.IndexFlatIP(dim) if NORMALIZE else faiss.IndexFlatL2(dim)
    index.add(emb)

    _log("Persisting assets …")
    df.to_parquet(PARQUET_PATH, index=False)
    faiss.write_index(index, INDEX_PATH)
    np.save(EMB_PATH, emb)

    meta = {
        "model_name": MODEL_NAME,
        "dim": int(dim),
        "normalized": NORMALIZE,
        "metric": "ip" if NORMALIZE else "l2",
        "row_count": int(len(df)),
        "text_col": text_col,
        "fallback_col": fallback_col,
    }
    with open(META_PATH, "w") as f:
        json.dump(meta, f, indent=2)
    _log("Assets built successfully.")

def _ensure_assets():
    need = False
    for p in (META_PATH, PARQUET_PATH, INDEX_PATH):
        if not os.path.exists(p):
            _log(f"Missing asset: {p}")
            need = True
    if need:
        _log("Building assets from scratch …")
        _build_assets()
        return
    try:
        pd.read_parquet(PARQUET_PATH)
    except Exception as e:
        _log(f"Parquet read failed ({e}); rebuilding assets.")
        _build_assets()

_ensure_assets()
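
# ---- Runtime components: retriever, novelty-scoring encoder, generator ----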

searcher = SloganSearcher(assets_dir=ASSETS_DIR, use_rerank=False)

with open(META_PATH) as f:
    meta = json.load(f)
_encoder = SentenceTransformer(meta["model_name"])

_gen_tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
_gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)

# ---- Prompt (adjust the wording to taste) ----
def _prompt_for(description: str) -> str:
    return (
        "You are a professional slogan writer. "
        "Write ONE original, catchy startup slogan under 8 words, Title Case, no punctuation. "
        "Do not copy examples. Description:\n"
        f"{description}\nSlogan:"
    )
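
# ---- Candidate generation: sample n slogans from FLAN-T5 ----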

def _generate_candidates(description: str, n: int = NUM_GEN_CANDIDATES):
    prompt = _prompt_for(description)
    # Encode the prompt once; num_return_sequences already fans out to n samples.
    # (Batching [prompt] * n on top of that would yield n * n sequences.)
    inputs = _gen_tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = _gen_model.generate(
        **inputs,
        do_sample=True,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        num_return_sequences=n,
        max_new_tokens=MAX_NEW_TOKENS,
        eos_token_id=_gen_tokenizer.eos_token_id,
    )
    texts = _gen_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    # Strip any echoed "Slogan:" prefix and surrounding quotes.
    return [t.replace("Slogan:", "").strip().strip('"') for t in texts if t.strip()]
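
# ---- Novelty selection: prefer candidates dissimilar to the retrieved slogans ----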

def _pick_most_novel(candidates, retrieved_texts):
    """Pick the generated candidate least similar to the retrieved slogans.

    Candidates whose max cosine similarity to any retrieved slogan stays below
    NOVELTY_SIM_THRESHOLD are preferred; if none qualify, fall back to the most
    novel candidate overall.
    """
    if not candidates:
        return None
    if not retrieved_texts:
        return candidates[0]
    # Batch-encode once; embeddings are normalized, so dot product = cosine similarity.
    R = _encoder.encode(retrieved_texts, convert_to_numpy=True, normalize_embeddings=True)
    C = _encoder.encode(candidates, convert_to_numpy=True, normalize_embeddings=True)
    max_sims = (C @ R.T).max(axis=1)  # per candidate: similarity to its closest retrieved slogan
    passing = [i for i, s in enumerate(max_sims) if s < NOVELTY_SIM_THRESHOLD]
    pool = passing if passing else range(len(candidates))
    best = max(pool, key=lambda i: 1.0 - float(max_sims[i]))
    return candidates[best]
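
# ---- End-to-end pipeline: retrieve top-3 similar slogans, then generate a novel one ----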

def run_pipeline(user_description: str):
    if not user_description or not user_description.strip():
        return "Please enter a description."
    retrieved_df = searcher.search(user_description, top_k=3, rerank_top_n=10)
    retrieved_texts = retrieved_df["display"].tolist() if not retrieved_df.empty else []
    gens = _generate_candidates(user_description, NUM_GEN_CANDIDATES)
    generated = _pick_most_novel(gens, retrieved_texts) or (gens[0] if gens else "—")
    lines = []
    lines.append("### 🔎 Top 3 similar slogans")
    if retrieved_texts:
        for i, s in enumerate(retrieved_texts, 1):
            lines.append(f"{i}. {s}")
    else:
        lines.append("_No similar slogans found._")
    lines.append("\n### ✨ AI-generated suggestion")
    lines.append(generated)
    return "\n".join(lines)

with gr.Blocks(title="Slogan Finder") as demo:
    gr.Markdown("# 🔎 Slogan Finder\nDescribe your product/company; get 3 similar slogans + 1 AI-generated suggestion.")
    query = gr.Textbox(label="Describe your product/company", placeholder="AI-powered patient financial navigation platform...")
    btn = gr.Button("Get slogans", variant="primary")
    out = gr.Markdown()
    btn.click(run_pipeline, inputs=[query], outputs=out)

demo.queue(max_size=64).launch()
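
# A minimal local run, assuming these dependencies (the Space's actual
# requirements.txt may differ):
#   pip install gradio faiss-cpu sentence-transformers transformers pandas pyarrow numpy
#   python app.py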