import os
import tempfile

import pandas as pd
import gradio as gr
from pydub import AudioSegment
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline as DiarizationPipeline
# Model initialisation
whisper_model = WhisperModel("large-v2", device="cpu", compute_type="int8")

# pyannote/speaker-diarization-3.1 is a gated model: HF_TOKEN must belong to an
# account that has accepted its conditions on the Hub.
token = os.getenv("HF_TOKEN")  # Add this variable to your Space secrets if needed
diari_pipeline = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=token
)
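
# Hypothetical alternative (not part of the original app): on a GPU Space,
# faster-whisper runs considerably faster with CUDA and fp16 weights:
# whisper_model = WhisperModel("large-v2", device="cuda", compute_type="float16")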
# Processing pipeline:
# .mp3
#   ↓ (converted to .wav)
# .wav
#   ↓
# faster-whisper → segments (text + timestamps)
#   ↓
# pyannote.audio → diarization (segments + speaker X)
#   ↓
# Merging the two → transcription enriched with speaker + timestamp
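# For reference, each merged line in the final TXT export has the shape below
# (timestamps and text are illustrative; pyannote names speakers SPEAKER_00,
# SPEAKER_01, ...):
# [0.00s - 3.50s] SPEAKER_00 : Bonjour à tous.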
def convert_mp3_to_wav(mp3_path):
    # NamedTemporaryFile(delete=False) replaces the deprecated, race-prone tempfile.mktemp()
    wav_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    # No format= argument: pydub/ffmpeg auto-detect, so non-MP3 uploads also decode
    audio = AudioSegment.from_file(mp3_path)
    # Mono, 16 kHz: the sample format both models expect
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio.export(wav_path, format="wav")
    return wav_path
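
# pydub shells out to ffmpeg for decoding; on Hugging Face Spaces that system
# dependency is usually declared by listing "ffmpeg" in a packages.txt file at
# the repo root. Quick local sanity check ("meeting.mp3" is a hypothetical file):
# wav = convert_mp3_to_wav("meeting.mp3")  # -> /tmp/....wav, mono, 16 kHz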
def transcribe_and_diarize(audio_file):
    wav_path = convert_mp3_to_wav(audio_file)

    # Transcription (`segments` is a lazy generator, consumed in the merge loop below)
    segments, _ = whisper_model.transcribe(wav_path, language="fr", beam_size=5)

    # Diarization: collect each speaker turn with its time span
    diarization = diari_pipeline(wav_path)
    speakers = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        speakers.append({
            "start": turn.start,
            "end": turn.end,
            "speaker": speaker
        })
    # Merge transcription + speakers: each segment gets the speaker whose turn
    # contains its start time (an overlap-based variant is sketched after this function)
    final_output = []
    for seg in segments:
        seg_start = seg.start
        seg_end = seg.end
        text = seg.text.strip()
        speaker = "Inconnu"
        for s in speakers:
            if s["start"] <= seg_start <= s["end"]:
                speaker = s["speaker"]
                break
        final_output.append({
            "start": seg_start,
            "end": seg_end,
            "speaker": speaker,
            "text": text
        })
    # Exports: one human-readable TXT line per segment, plus the same table as CSV
    df = pd.DataFrame(final_output)
    txt_lines = [
        f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']} : {row['text']}"
        for _, row in df.iterrows()
    ]
    txt_output = "\n".join(txt_lines)

    txt_path = tempfile.NamedTemporaryFile(suffix=".txt", delete=False).name
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(txt_output)

    csv_path = tempfile.NamedTemporaryFile(suffix=".csv", delete=False).name
    df.to_csv(csv_path, index=False)

    return txt_output, csv_path, txt_path
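
# Possible refinement, not wired into the app above: matching on the start time
# alone can mislabel a segment that straddles a turn boundary. This sketch (the
# helper name `best_speaker` is ours, not from the original) picks the speaker
# whose diarization turn overlaps the Whisper segment the longest.
def best_speaker(seg_start, seg_end, speakers):
    best, best_overlap = "Inconnu", 0.0
    for s in speakers:
        # Length of the intersection between the segment and the speaker turn
        overlap = min(seg_end, s["end"]) - max(seg_start, s["start"])
        if overlap > best_overlap:
            best, best_overlap = s["speaker"], overlap
    return best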
# Gradio interface
gr.Interface(
    fn=transcribe_and_diarize,
    inputs=gr.Audio(type="filepath", label="Fichier audio MP3"),
    outputs=[
        gr.Textbox(label="Transcription avec locuteurs"),
        gr.File(label="Télécharger le CSV"),
        gr.File(label="Télécharger le TXT")
    ],
    title="Transcription + Diarisation (FR)",
    description="Charge un fichier MP3. Transcription FR + séparation des locuteurs + export CSV/TXT."
).launch()
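
# Sketch, assuming a recent Gradio version: a CPU Space can take minutes per
# file, and Gradio's request queue avoids HTTP timeouts on long-running jobs.
# Chaining .queue() before .launch() enables it:
# gr.Interface(...).queue().launch()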