# evannh's picture
# Update app.py
# 0688b58 verified
import os
import tempfile
import pandas as pd
import gradio as gr
from pydub import AudioSegment
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline as DiarizationPipeline
# Model initialisation (done once at import time, as usual for a Space)
whisper_model = WhisperModel("large-v2", device="cpu", compute_type="int8")
# Add HF_TOKEN to your Space secrets if the gated pyannote model requires it
hf_token = os.environ.get("HF_TOKEN")
diari_pipeline = DiarizationPipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1", use_auth_token=hf_token
)
# Processing pipeline:
#   .mp3
#     ↓ (converted to .wav)
#   .wav
#     ↓
#   faster-whisper → segments (text + timestamps)
#     ↓
#   pyannote-audio → diarization (segments + speaker X)
#     ↓
#   Merge of the two → enriched transcript with speaker + timestamp
def convert_mp3_to_wav(mp3_path):
    """Convert an MP3 file to a mono 16 kHz WAV file.

    Args:
        mp3_path: Path to the input MP3 file.

    Returns:
        Path to a newly created temporary WAV file. The caller is
        responsible for deleting it when done.
    """
    # tempfile.mkstemp is used instead of the deprecated, race-prone
    # tempfile.mktemp: the file is created atomically, then handed to pydub.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # pydub reopens the path itself; keep only the name
    audio = AudioSegment.from_file(mp3_path, format="mp3")
    # Mono, 16 kHz: the input format Whisper models are trained on
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio.export(wav_path, format="wav")
    return wav_path
def transcribe_and_diarize(audio_file):
    """Transcribe a French MP3 and label each segment with its speaker.

    Args:
        audio_file: Path to the uploaded MP3 file (Gradio ``filepath``).

    Returns:
        Tuple of (full transcript text, path to CSV export, path to TXT export).
    """
    wav_path = convert_mp3_to_wav(audio_file)
    try:
        # Transcription
        segments, _ = whisper_model.transcribe(wav_path, language="fr", beam_size=5)
        # faster-whisper yields segments lazily while reading the audio file,
        # so force full decoding before the temp WAV is removed below.
        segments = list(segments)
        # Diarization
        diarization = diari_pipeline(wav_path)
    finally:
        # The original code leaked one temp WAV per request; clean it up.
        os.remove(wav_path)

    speakers = [
        {"start": turn.start, "end": turn.end, "speaker": speaker}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]

    # Merge transcription + speakers: a segment is attributed to the first
    # speaker turn containing its start time, "Inconnu" (unknown) otherwise.
    final_output = []
    for seg in segments:
        speaker = next(
            (s["speaker"] for s in speakers if s["start"] <= seg.start <= s["end"]),
            "Inconnu",
        )
        final_output.append(
            {
                "start": seg.start,
                "end": seg.end,
                "speaker": speaker,
                "text": seg.text.strip(),
            }
        )

    df = pd.DataFrame(final_output)
    txt_lines = [
        f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']} : {row['text']}"
        for _, row in df.iterrows()
    ]
    txt_output = "\n".join(txt_lines)

    # mkstemp instead of the deprecated, race-prone tempfile.mktemp
    fd, txt_path = tempfile.mkstemp(suffix=".txt")
    os.close(fd)
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(txt_output)

    fd, csv_path = tempfile.mkstemp(suffix=".csv")
    os.close(fd)
    df.to_csv(csv_path, index=False)

    return txt_output, csv_path, txt_path
# Gradio UI: one MP3 input, the merged transcript plus two downloadable exports
demo = gr.Interface(
    fn=transcribe_and_diarize,
    inputs=gr.Audio(type="filepath", label="Fichier audio MP3"),
    outputs=[
        gr.Textbox(label="Transcription avec locuteurs"),
        gr.File(label="Télécharger le CSV"),
        gr.File(label="Télécharger le TXT"),
    ],
    title="Transcription + Diarisation (FR)",
    description="Charge un fichier MP3. Transcription FR + séparation des locuteurs + export CSV/TXT.",
)
demo.launch()