"""Gradio app that builds an interactive BERTopic document map from a CSV.

The uploaded CSV must contain a ``text`` column. Each row is embedded with a
sentence-transformer model, clustered with UMAP + HDBSCAN, and rendered as a
Plotly figure through ``BERTopic.visualize_documents``.
"""

from functools import lru_cache

import gradio as gr
import nltk
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Load the English stopwords, downloading the corpus if it is not installed.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')

# Custom stopwords added on top of the NLTK list.
custom_stopwords = ["made", "sure"]
stop_words.extend(custom_stopwords)


@lru_cache(maxsize=1)
def _load_embedding_model():
    """Return the sentence-transformer model, loading it only once per process."""
    return SentenceTransformer("all-MiniLM-L6-v2")


def _read_texts(file):
    """Return the non-empty ``text`` column of the uploaded CSV as a list of str.

    Args:
        file: Value delivered by ``gr.File``: either a path string or an
            object exposing the path through a ``name`` attribute.

    Raises:
        gr.Error: If nothing was uploaded, the ``text`` column is missing, or
            the column holds no usable rows.
    """
    if file is None:
        raise gr.Error("Please upload a CSV file.")

    # Accept both a tempfile-like wrapper (has ``.name``) and a plain path.
    csv_path = getattr(file, "name", file)
    frame = pd.read_csv(csv_path)

    if "text" not in frame.columns:
        raise gr.Error("The CSV file must contain a 'text' column.")

    # Missing cells are read as NaN floats, which the encoder cannot handle.
    texts = frame["text"].dropna().astype(str).tolist()
    if not texts:
        raise gr.Error("The 'text' column does not contain any documents.")
    return texts


# Custom BERTopic pipeline
def generate_topics(file):
    """Fit a BERTopic model on the uploaded CSV and return its document map.

    Args:
        file: Upload coming from the ``gr.File`` input; see ``_read_texts``.

    Returns:
        The figure produced by ``BERTopic.visualize_documents``.

    Raises:
        gr.Error: If the upload is missing or has no usable ``text`` column.
    """
    texts = _read_texts(file)

    embedding_model = _load_embedding_model()
    # Encode once and reuse the vectors for both fitting and plotting.
    embeddings = embedding_model.encode(texts, show_progress_bar=True)

    umap_model = UMAP(
        n_neighbors=20,
        n_components=5,
        min_dist=0.0,
        metric='cosine',
        random_state=42,
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=60,
        min_samples=1,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )
    vectorizer_model = CountVectorizer(
        stop_words=stop_words,
        min_df=1,
        ngram_range=(1, 3),
    )
    ctfidf_model = ClassTfidfTransformer()
    representation_model = MaximalMarginalRelevance(diversity=0.7)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_model,
        verbose=True,
    )
    topics, _ = topic_model.fit_transform(texts, embeddings=embeddings)

    # Reassign outliers only when there are outliers AND at least one real
    # topic to move them into.
    has_outliers = any(topic == -1 for topic in topics)
    has_real_topics = any(topic != -1 for topic in topics)
    if has_outliers and has_real_topics:
        new_topics = topic_model.reduce_outliers(texts, topics)
        # reduce_outliers only returns the new assignment; update_topics is
        # what applies it to the model. The custom sub-models are passed again
        # so the topic representations are rebuilt with the same settings.
        topic_model.update_topics(
            texts,
            topics=new_topics,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_model,
        )

    return topic_model.visualize_documents(texts, embeddings=embeddings)


# Gradio interface
demo = gr.Interface(
    fn=generate_topics,
    inputs=gr.File(label="Upload bbc-text.csv"),
    outputs=gr.Plot(label="Topic Map"),
    title="Topic Modeling avec BERTopic",
    description=(
        "Téléversez un fichier CSV avec une colonne 'text' pour générer "
        "une visualisation thématique interactive."
    ),
)

if __name__ == "__main__":
    demo.launch()