File size: 4,962 Bytes
9552602
5dd29f5
 
b41b158
9552602
 
 
 
 
2109747
 
 
 
 
 
 
9552602
b41b158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9552602
b41b158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9552602
 
2109747
9552602
 
 
 
 
 
 
 
b41b158
9552602
2109747
 
9552602
2109747
9552602
 
2109747
9552602
 
 
 
 
 
 
 
 
2109747
 
 
 
0467024
 
 
2109747
 
 
 
0467024
2109747
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43bdbc0
c0be2e6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import gradio as gr
from speechbrain.inference import EncoderClassifier
import torch
import requests
import subprocess
import os
import uuid
import yt_dlp

model = None  # Shared classifier instance; stays None until get_model() is first called


def get_model():
    """Return the shared accent classifier, creating it on first use.

    The classifier is built with ``EncoderClassifier.from_hparams`` the first
    time this function runs and is stored in the module-level ``model``
    variable; every later call returns that same object.
    """
    global model
    if model is not None:
        return model
    model = EncoderClassifier.from_hparams("Jzuluaga/accent-id-commonaccent_ecapa")
    return model

def extract_id_from_url(url):
    """Return the last path segment of *url*, ignoring query and fragment.

    Args:
        url: A link such as ``https://www.loom.com/share/<id>?sid=...``.

    Returns:
        The final path segment with any ``?query`` and ``#fragment`` removed.
        Trailing slashes are ignored, so ``.../share/<id>/`` also yields the id.
    """
    # Strip fragment and query *before* splitting on "/", so a slash inside
    # them (e.g. "?next=/a/b") cannot be mistaken for a path separator.
    path = url.split("#", 1)[0].split("?", 1)[0]
    return path.rstrip("/").rsplit("/", 1)[-1]

def fetch_loom_download_url(id):
    """Ask Loom's transcoded-url endpoint for a direct download URL.

    Args:
        id: Loom video id taken from the share link. The parameter name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        The ``"url"`` field of the JSON response, or ``None`` when the request
        fails, the status code is not 200, or the body has no usable ``"url"``.
        Every failure is printed rather than raised.
    """
    endpoint = f"https://www.loom.com/api/campaigns/sessions/{id}/transcoded-url"
    try:
        # A timeout keeps a stalled API call from blocking the caller forever.
        response = requests.post(url=endpoint, timeout=30)
    except requests.exceptions.RequestException as e:
        print("Error while retrieving response: ", e)
        return None

    if response.status_code != 200:
        print("Error while retrieving response: ", response.status_code)
        return None

    try:
        return response.json()["url"]
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or not an object carrying a "url" key.
        print("Error while retrieving response: ", e)
        return None



def download_loom_video(url, filename):
    """Stream the video at *url* into the file *filename*.

    The request is sent with a ``Mozilla/5.0`` User-Agent header and the body
    is written in 8192-byte chunks.

    Returns:
        *filename* on success, or ``None`` when requests raises a
        ``RequestException`` (the error is printed).
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}

    try:
        with requests.get(url, headers=request_headers, stream=True) as resp:
            resp.raise_for_status()

            with open(filename, "wb") as out_file:
                # Falsy (empty) chunks are skipped.
                for block in filter(None, resp.iter_content(chunk_size=8192)):
                    out_file.write(block)
    except requests.exceptions.RequestException as err:
        print(f"Failed to download Loom video: {err}")
        return None

    print(f"Downloaded video to {filename}")
    return filename

def download_direct_mp4(url, filename):
    """Stream a directly linked video file from *url* into *filename*.

    Returns:
        *filename* on success, or ``None`` on any failure (the error is
        printed).
    """
    try:
        # The context manager closes the streamed response even if writing
        # fails part-way; the timeout stops a stalled server from hanging us.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        return filename
    except Exception as e:
        # Kept broad on purpose: every failure (network or file system) is
        # reported as None rather than raised.
        print(f"Error downloading direct mp4: {e}")
        return None
        

def download_video_from_url(url):
    """Download the media behind *url* to a local file and return its path.

    Three cases are handled, in this order:

    * URLs containing ``loom.com``: the video id is extracted, a direct
      download URL is requested from Loom, and the file is saved as
      ``LoomVideo_<id>.mp4``.
    * URLs whose path ends in ``.mp4`` (case-insensitive, query string and
      fragment ignored): downloaded directly to ``video_<uuid>.mp4``.
    * Anything else: handed to yt_dlp, writing to ``video_<uuid>.mp4``.

    Returns:
        The local file path, or ``None`` when a Loom or direct download fails.

    Exceptions raised by yt_dlp in the fallback branch are not caught here.
    """
    if "loom.com" in url:
        video_id = extract_id_from_url(url)
        print(video_id)
        direct_url = fetch_loom_download_url(video_id)
        print(direct_url)
        if not direct_url:
            # No download URL was obtained, so there is nothing to fetch.
            return None
        filename = f"LoomVideo_{video_id}.mp4"
        success = download_loom_video(direct_url, filename)
        print(success)
        return filename if success else None

    # Look at the path only, so "clip.mp4?token=..." or "CLIP.MP4" still
    # counts as a direct mp4 link.
    url_path = url.split("#", 1)[0].split("?", 1)[0]
    if url_path.lower().endswith(".mp4"):
        filename = f"video_{uuid.uuid4()}.mp4"
        return download_direct_mp4(url, filename)

    # fallback to yt_dlp for youtube, vimeo, etc.
    out_path = f"video_{uuid.uuid4()}.mp4"
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': out_path,
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return out_path



def extract_audio(video_file):
    """Convert *video_file* to a mono, 16 kHz, 16-bit PCM WAV using ffmpeg.

    Args:
        video_file: Path of the input media file passed to ``ffmpeg -i``.

    Returns:
        Path of the newly written ``audio_<uuid>.wav`` file.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. Without this
            check a failed conversion would hand back a path to a file that
            was never written.
    """
    audio_path = f"audio_{uuid.uuid4()}.wav"
    cmd = [
        # "-y" goes before the inputs/outputs rather than trailing the output.
        "ffmpeg", "-y", "-i", video_file, "-vn",
        "-acodec", "pcm_s16le", "-ac", "1", "-ar", "16000",
        audio_path,
    ]
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if completed.returncode != 0:
        # Keep only the tail of stderr: that is where ffmpeg reports the error.
        stderr_tail = completed.stderr.decode("utf-8", errors="replace")[-500:]
        raise RuntimeError(
            f"ffmpeg failed (exit {completed.returncode}) for {video_file!r}: {stderr_tail}"
        )
    return audio_path


def classify_accent(input_file_or_url):
    """Classify the accent of the speaker in a media file or URL.

    Args:
        input_file_or_url: A string starting with ``"http"`` (downloaded
            first), a local file path, or an object with a ``.name``
            attribute holding the file path.

    Returns:
        A 3-tuple of strings: the top label, its confidence formatted as a
        percentage, and the top three ``label: percent`` lines joined by
        newlines. If a URL could not be downloaded, returns
        ``("Download failed", "", "")`` instead.
    """
    model = get_model()

    # Check if it's a URL
    downloaded = isinstance(input_file_or_url, str) and input_file_or_url.startswith("http")
    if downloaded:
        video_path = download_video_from_url(input_file_or_url)
        if video_path is None:
            # Nothing to feed to ffmpeg; report it in the same 3-field shape.
            return "Download failed", "", ""
    else:
        video_path = input_file_or_url.name if hasattr(input_file_or_url, "name") else input_file_or_url

    audio_path = None
    try:
        audio_path = extract_audio(video_path)
        out_probs, top_prob, top_idx, label = model.classify_file(audio_path)
    finally:
        # Remove the files this call created so they do not pile up on disk.
        # A file that was not downloaded here is left alone.
        temp_files = [audio_path]
        if downloaded:
            temp_files.append(video_path)
        for path in temp_files:
            if path is None:
                continue
            try:
                os.remove(path)
            except OSError:
                pass  # Already gone or never written; nothing to clean up.

    # Compute the top-3 once and reuse it for both labels and scores.
    top3 = torch.topk(out_probs, 3)
    top_labels = model.hparams.label_encoder.decode_ndim(top3.indices.squeeze())
    confidences = top3.values.squeeze().tolist()
    result = "\n".join(f"{name}: {p*100:.2f}%" for name, p in zip(top_labels, confidences))

    return label[0], f"{top_prob.item()*100:.2f}%", result

# Gradio UI: inputs (file upload or URL) on the left, prediction fields on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Accent Identifier")
    gr.Markdown(
        "Upload a video or audio file, or paste a link (e.g. direct .mp4 URL or Loom video) to identify the speaker's accent."
    )

    with gr.Row():
        with gr.Column():
            input_file = gr.File(label="Upload video/audio file", file_types=[".mp4", ".wav", ".mp3"])
            url_input = gr.Textbox(label="...or paste a direct mp4 URL/loom link")
            submit_btn = gr.Button("Classify Accent")

        with gr.Column():
            label_output = gr.Textbox(label="Top Prediction")
            confidence_output = gr.Textbox(label="Confidence")
            top3_output = gr.Textbox(label="Top 3 Predictions")

    def handle_inputs(file, url):
        """Route the button click to classify_accent; a URL wins over a file.

        Returns the three strings shown in the output boxes, or
        ``("No input", "", "")`` when neither input was provided.
        """
        # Pasted links often carry stray whitespace; a leading space would
        # defeat the "http" prefix check in classify_accent and the text
        # would be treated as a local file path.
        url = (url or "").strip()
        if url:
            return classify_accent(url)
        if file:
            return classify_accent(file)
        return "No input", "", ""

    submit_btn.click(
        handle_inputs,
        inputs=[input_file, url_input],
        outputs=[label_output, confidence_output, top3_output],
    )

# Launch the Gradio app only when this file is executed as a script.
# NOTE(review): share=True presumably asks Gradio for a public share link —
# confirm that exposing the app publicly is intended.
if __name__ == "__main__":
    demo.launch(share=True)