# Hosting-page residue, kept as comments so the file parses as Python:
# evannh's picture
# Create app.py
# 08aea01 verified
import gradio as gr
import pandas as pd
import nltk
from nltk.corpus import stopwords
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
# Load the stopwords
def _load_english_stopwords():
    """Return NLTK's English stopword list.

    If the first lookup raises ``LookupError``, the ``stopwords`` corpus is
    downloaded and the lookup is retried once.
    """
    try:
        return stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        return stopwords.words('english')


# Custom stopwords appended to the NLTK list
custom_stopwords = ["made", "sure"]
stop_words = _load_english_stopwords()
stop_words += custom_stopwords
# Custom BERTopic pipeline
def generate_topics(file):
    """Fit a BERTopic model on an uploaded CSV and return its document map.

    The CSV's ``text`` column is embedded once with ``all-MiniLM-L6-v2``;
    the same embeddings are reused for fitting and for the visualization.

    Args:
        file: The upload handed over by ``gr.File``: either a filesystem
            path or an object exposing that path as ``.name``.

    Returns:
        The figure produced by ``BERTopic.visualize_documents``.

    Raises:
        gr.Error: If nothing was uploaded, the CSV has no ``text`` column,
            or that column contains no usable rows.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file with a 'text' column.")

    # Depending on the Gradio version/configuration the File component
    # yields a plain path string or a wrapper object carrying `.name`.
    csv_path = getattr(file, "name", file)
    docs = pd.read_csv(csv_path)
    if 'text' not in docs.columns:
        raise gr.Error("The uploaded CSV must contain a 'text' column.")

    # Drop missing cells and force str so the encoder and the vectorizer
    # never receive NaN/float values.
    texts = docs['text'].dropna().astype(str).tolist()
    if not texts:
        raise gr.Error("The 'text' column does not contain any documents.")

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    umap_model = UMAP(
        n_neighbors=20,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=60,
        min_samples=1,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )
    vectorizer_model = CountVectorizer(
        stop_words=stop_words,
        min_df=1,
        ngram_range=(1, 3),
    )
    ctfidf_model = ClassTfidfTransformer()
    representation_model = MaximalMarginalRelevance(diversity=0.7)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        verbose=True,
    )

    # Pass the precomputed embeddings so the corpus is not encoded again.
    topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)

    # reduce_outliers only *returns* the new assignments; they take effect
    # once written back with update_topics. Skip the step when there is
    # nothing to reassign or no real topic to reassign to.
    if -1 in topics and len(set(topics)) > 1:
        new_topics = topic_model.reduce_outliers(texts, topics)
        # update_topics falls back to default models for any it is not
        # given, so hand the custom ones back explicitly.
        topic_model.update_topics(
            texts,
            topics=new_topics,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_model,
        )

    return topic_model.visualize_documents(texts, embeddings=embeddings)
# Gradio interface: one CSV upload in, one plot out.
_csv_upload = gr.File(label="Upload bbc-text.csv")
_topic_plot = gr.Plot(label="Topic Map")

demo = gr.Interface(
    fn=generate_topics,
    inputs=_csv_upload,
    outputs=_topic_plot,
    title="Topic Modeling avec BERTopic",
    description="Téléversez un fichier CSV avec une colonne 'text' pour générer une visualisation thématique interactive.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()