import os
import tempfile

import pandas as pd
import gradio as gr
from pydub import AudioSegment
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline as DiarizationPipeline

# Initialize the models
whisper_model = WhisperModel("large-v2", device="cpu", compute_type="int8")

token = os.getenv("HF_TOKEN")  # Add this variable to your Space secrets if needed
diari_pipeline = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token
)

# Processing pipeline:
# .mp3
#   ↓ (converted to .wav)
# .wav
#   ↓
# faster-whisper → segments (text + timestamps)
#   ↓
# pyannote-audio → diarization (segments + speaker X)
#   ↓
# Merging the two → transcript enriched with speaker + timestamp


def convert_mp3_to_wav(mp3_path):
    """Convert the input MP3 to the mono 16 kHz WAV expected by the models."""
    wav_path = tempfile.mktemp(suffix=".wav")
    audio = AudioSegment.from_file(mp3_path, format="mp3")
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio.export(wav_path, format="wav")
    return wav_path


def transcribe_and_diarize(audio_file):
    wav_path = convert_mp3_to_wav(audio_file)

    # Transcription
    segments, _ = whisper_model.transcribe(wav_path, language="fr", beam_size=5)

    # Diarization
    diarization = diari_pipeline(wav_path)
    speakers = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        speakers.append({
            "start": turn.start,
            "end": turn.end,
            "speaker": speaker
        })

    # Merge transcription + speakers
    final_output = []
    for seg in segments:
        seg_start = seg.start
        seg_end = seg.end
        text = seg.text.strip()

        # Assign the speaker whose turn contains the start of the segment
        speaker = "Inconnu"
        for s in speakers:
            if s["start"] <= seg_start <= s["end"]:
                speaker = s["speaker"]
                break

        final_output.append({
            "start": seg_start,
            "end": seg_end,
            "speaker": speaker,
            "text": text
        })

    df = pd.DataFrame(final_output)

    txt_lines = [
        f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']} : {row['text']}"
        for _, row in df.iterrows()
    ]
    txt_output = "\n".join(txt_lines)

    txt_path = tempfile.mktemp(suffix=".txt")
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(txt_output)

    csv_path = tempfile.mktemp(suffix=".csv")
    df.to_csv(csv_path, index=False)

    return txt_output, csv_path, txt_path


# Gradio interface
gr.Interface(
    fn=transcribe_and_diarize,
    inputs=gr.Audio(type="filepath", label="Fichier audio MP3"),
    outputs=[
        gr.Textbox(label="Transcription avec locuteurs"),
        gr.File(label="Télécharger le CSV"),
        gr.File(label="Télécharger le TXT")
    ],
    title="Transcription + Diarisation (FR)",
    description="Charge un fichier MP3. Transcription FR + séparation des locuteurs + export CSV/TXT."
).launch()
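

# --- Illustrative sketch (not wired into the app above) ---
# The merge step assigns a speaker by checking which diarization turn contains
# the *start* of a Whisper segment; a segment that spans two turns can be
# mislabelled. A common, more robust alternative is to pick the speaker with
# the largest temporal overlap. The helper below is a minimal sketch of that
# idea: it assumes the same `speakers` list-of-dicts shape built in
# transcribe_and_diarize, and is not called anywhere in this script.
def assign_speaker_by_overlap(seg_start, seg_end, speakers, default="Inconnu"):
    best_speaker = default
    best_overlap = 0.0
    for s in speakers:
        # Overlap between the segment [seg_start, seg_end] and the turn [s["start"], s["end"]]
        overlap = min(seg_end, s["end"]) - max(seg_start, s["start"])
        if overlap > best_overlap:
            best_overlap = overlap
            best_speaker = s["speaker"]
    return best_speaker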