# Page-capture residue ("Spaces: Sleeping"); not part of the program.
import gradio as gr | |
import pandas as pd | |
import nltk | |
from nltk.corpus import stopwords | |
from umap import UMAP | |
from hdbscan import HDBSCAN | |
from sentence_transformers import SentenceTransformer | |
from sklearn.feature_extraction.text import CountVectorizer | |
from bertopic import BERTopic | |
from bertopic.representation import MaximalMarginalRelevance | |
from bertopic.vectorizers import ClassTfidfTransformer | |
# Stop-word setup: NLTK's English list plus a few project-specific terms.
def _english_stopwords():
    """Return NLTK's English stop-word list, downloading the corpus if absent."""
    try:
        return stopwords.words('english')
    except LookupError:
        # Corpus is not installed locally yet: fetch it, then read it again.
        nltk.download('stopwords')
        return stopwords.words('english')


# Custom stop words appended to the NLTK list.
custom_stopwords = ["made", "sure"]
stop_words = _english_stopwords() + custom_stopwords
# Custom BERTopic pipeline
def generate_topics(file):
    """Fit a BERTopic model on an uploaded CSV and return its document map.

    The CSV must provide a ``text`` column. Rows whose ``text`` cell is
    missing are ignored; every remaining value is treated as one document.

    Args:
        file: The uploaded file. Either a filesystem path or an object that
            exposes the path through a ``name`` attribute.

    Returns:
        The figure returned by ``BERTopic.visualize_documents``.

    Raises:
        ValueError: If no file was provided, the CSV has no ``text`` column,
            or the column contains no documents.
    """
    if file is None:
        raise ValueError("No file uploaded: provide a CSV with a 'text' column.")

    # The upload may arrive as a plain path or as a wrapper carrying the path
    # in ``.name``; accept both rather than assuming one shape.
    csv_path = getattr(file, "name", file)
    frame = pd.read_csv(csv_path)
    if "text" not in frame.columns:
        raise ValueError("The uploaded CSV must contain a 'text' column.")

    # Drop missing cells and coerce to str so the encoder and the vectorizer
    # only ever see real strings.
    texts = frame["text"].dropna().astype(str).tolist()
    if not texts:
        raise ValueError("The 'text' column does not contain any documents.")

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    # Encode once; the same matrix is reused for fitting and for plotting.
    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    umap_model = UMAP(
        n_neighbors=20,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=60,
        min_samples=1,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )
    vectorizer_model = CountVectorizer(
        stop_words=stop_words,
        min_df=1,
        ngram_range=(1, 3),
    )
    ctfidf_model = ClassTfidfTransformer()
    representation_model = MaximalMarginalRelevance(diversity=0.7)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        verbose=True,
    )
    topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)

    # Topic -1 is the outlier bucket. reduce_outliers only *returns* the new
    # assignment, so it has to be written back with update_topics to have any
    # effect on the model and on the plot. Skip the step entirely when there
    # is nothing to reassign.
    if -1 in topics:
        new_topics = topic_model.reduce_outliers(texts, topics)
        # Hand the custom models back explicitly: per the BERTopic docs,
        # update_topics otherwise rebuilds representations with defaults,
        # which would lose the stop-word list and the MMR setting.
        topic_model.update_topics(
            texts,
            topics=new_topics,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_model,
        )

    return topic_model.visualize_documents(texts, embeddings=embeddings)
# Gradio interface: a single CSV upload wired to generate_topics, shown as a plot.
csv_upload = gr.File(label="Upload bbc-text.csv")
topic_map = gr.Plot(label="Topic Map")

demo = gr.Interface(
    fn=generate_topics,
    inputs=csv_upload,
    outputs=topic_map,
    title="Topic Modeling avec BERTopic",
    description="Téléversez un fichier CSV avec une colonne 'text' pour générer une visualisation thématique interactive.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()