|
|
import os |
|
|
import io |
|
|
import tempfile |
|
|
import datetime |
|
|
import textwrap |
|
|
|
|
|
import numpy as np |
|
|
import torch |
|
|
import librosa |
|
|
import gradio as gr |
|
|
|
|
|
from reportlab.pdfgen import canvas |
|
|
from reportlab.lib.pagesizes import letter |
|
|
from reportlab.lib.utils import ImageReader |
|
|
from reportlab.lib import colors |
|
|
from reportlab.pdfbase import pdfmetrics |
|
|
from reportlab.pdfbase.ttfonts import TTFont |
|
|
|
|
|
from transformers import ( |
|
|
WhisperProcessor, |
|
|
AutoModelForSpeechSeq2Seq, |
|
|
AutoFeatureExtractor, |
|
|
AutoModel, |
|
|
) |
|
|
from transformers import pipeline as hf_pipeline |
|
|
|
|
|
|
|
|
try: |
|
|
import scipy.signal as _sg |
|
|
from scipy.signal import windows as _win |
|
|
|
|
|
if not hasattr(_sg, "hann"): |
|
|
_sg.hann = _win.hann |
|
|
except Exception: |
|
|
_sg = None |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pdfmetrics.registerFont(TTFont("PlayfairBold", "PlayfairDisplay-Bold.ttf")) |
|
|
pdfmetrics.registerFont(TTFont("Geneva", "Geneva.ttf")) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ACCENT = colors.HexColor("#8b5cf6") |
|
|
PRIMARY = colors.HexColor("#3b0c3f") |
|
|
LIGHT_GRAY = colors.HexColor("#e6e6e6") |
|
|
GOLD = colors.HexColor("#f4c542") |
|
|
WHITE = colors.white |
|
|
BLACK = colors.black |
|
|
|
|
|
ENGINE_URL = "https://www.tourdefierce.vip/ai-music-detector" |
|
|
LOGO_FILE = "logo.jpg" |
|
|
|
|
|
ASR_MODEL = "openai/whisper-small" |
|
|
CLF_MODEL = "microsoft/wavlm-base-plus-sv" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
processor = WhisperProcessor.from_pretrained(ASR_MODEL) |
|
|
asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(ASR_MODEL) |
|
|
asr_pipe = hf_pipeline( |
|
|
"automatic-speech-recognition", |
|
|
model=asr_model, |
|
|
tokenizer=processor.tokenizer, |
|
|
feature_extractor=processor.feature_extractor, |
|
|
) |
|
|
|
|
|
clf_processor = AutoFeatureExtractor.from_pretrained(CLF_MODEL) |
|
|
clf_model = AutoModel.from_pretrained(CLF_MODEL) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def compute_autotune_index(y, sr): |
|
|
"""Heuristic autotune index: low pitch variance -> more 'quantized' -> higher score.""" |
|
|
f0, voiced, _ = librosa.pyin( |
|
|
y, |
|
|
sr=sr, |
|
|
fmin=librosa.note_to_hz("C2"), |
|
|
fmax=librosa.note_to_hz("C6"), |
|
|
) |
|
|
|
|
|
if f0 is None: |
|
|
return 0.0 |
|
|
|
|
|
f0 = f0[voiced > 0.5] |
|
|
|
|
|
if len(f0) < 10: |
|
|
return 0.0 |
|
|
|
|
|
log_f0 = np.log(f0) |
|
|
std = np.std(log_f0) |
|
|
|
|
|
|
|
|
max_std = 0.25 |
|
|
score = 1 - np.clip(std / max_std, 0, 1) |
|
|
return float(score * 100.0) |
|
|
|
|
|
|
|
|
def extract_embeddings(y, sr): |
|
|
inp = clf_processor(y, sampling_rate=sr, return_tensors="pt") |
|
|
with torch.no_grad(): |
|
|
out = clf_model(**inp).last_hidden_state.mean(dim=1).squeeze() |
|
|
return out.cpu().numpy() |
|
|
|
|
|
|
|
|
def calculate_ai_probability(emb, y, sr, autotune_idx): |
|
|
""" |
|
|
Heuristic AI probability in [0, 1]. |
|
|
|
|
|
Uses: |
|
|
- Embedding norm |
|
|
- Dynamic range |
|
|
- Autotune index |
|
|
""" |
|
|
|
|
|
norm = np.linalg.norm(emb) |
|
|
norm_min, norm_max = 40, 140 |
|
|
norm_scaled = np.clip((norm - norm_min) / (norm_max - norm_min), 0, 1) |
|
|
|
|
|
|
|
|
S = np.abs(librosa.stft(y)) |
|
|
rms = librosa.feature.rms(S=S)[0] |
|
|
dyn_range = np.percentile(rms, 95) - np.percentile(rms, 5) |
|
|
dyn_scaled = 1.0 - np.clip((dyn_range - 0.02) / 0.1, 0, 1) |
|
|
|
|
|
|
|
|
at_scaled = autotune_idx / 100.0 |
|
|
|
|
|
|
|
|
raw = 0.4 * norm_scaled + 0.3 * dyn_scaled + 0.3 * at_scaled |
|
|
|
|
|
|
|
|
ai_prob = float(np.clip(raw * 0.95 + 0.05, 0.05, 0.99)) |
|
|
return ai_prob |
|
|
|
|
|
|
|
|
def detect_key(y, sr): |
|
|
chroma = librosa.feature.chroma_cqt(y=y, sr=sr) |
|
|
chroma_mean = chroma.mean(axis=1) |
|
|
key_index = int(np.argmax(chroma_mean)) |
|
|
|
|
|
KEYS = ["C", "C#", "D", "Eb", "E", "F", "F#", "G", "Ab", "A", "Bb", "B"] |
|
|
root = KEYS[key_index] |
|
|
|
|
|
maj_energy = chroma_mean[(key_index + 4) % 12] + chroma_mean[(key_index + 7) % 12] |
|
|
min_energy = chroma_mean[(key_index + 3) % 12] + chroma_mean[(key_index + 7) % 12] |
|
|
|
|
|
return f"{root} major" if maj_energy >= min_energy else f"{root} minor" |
|
|
|
|
|
|
|
|
def detect_bpm(y, sr): |
|
|
onset_env = librosa.onset.onset_strength(y=y, sr=sr) |
|
|
tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr) |
|
|
if tempo is None or len(tempo) == 0: |
|
|
return 0.0 |
|
|
return float(tempo[0]) |
|
|
|
|
|
|
|
|
def estimate_voice_type(y, sr): |
|
|
"""Very rough tessitura-based suggestion.""" |
|
|
f0, voiced, _ = librosa.pyin( |
|
|
y, |
|
|
sr=sr, |
|
|
fmin=librosa.note_to_hz("C2"), |
|
|
fmax=librosa.note_to_hz("C6"), |
|
|
) |
|
|
|
|
|
if f0 is None or np.sum(voiced) < 5: |
|
|
return "Unable to estimate voice type from this clip." |
|
|
|
|
|
f0 = f0[voiced > 0.5] |
|
|
median_hz = np.median(f0) |
|
|
median_note = librosa.hz_to_note(median_hz) |
|
|
|
|
|
|
|
|
if median_hz < librosa.note_to_hz("G3"): |
|
|
base = "lower voice (baritone / alto range)" |
|
|
elif median_hz < librosa.note_to_hz("C4"): |
|
|
base = "mid voice (baritenor / mezzo range)" |
|
|
else: |
|
|
base = "high voice (tenor or soprano range)" |
|
|
|
|
|
return f"Given the tessitura, this song is best suited for a {base}." |
|
|
|
|
|
|
|
|
def compute_production_polish(y, sr): |
|
|
"""0-100: how polished / produced the track sounds.""" |
|
|
S = np.abs(librosa.stft(y)) |
|
|
rms = librosa.feature.rms(S=S)[0] |
|
|
|
|
|
dyn_range = np.percentile(rms, 95) - np.percentile(rms, 5) |
|
|
dyn_score = 1.0 - np.clip((dyn_range - 0.015) / 0.12, 0, 1) |
|
|
|
|
|
flatness = np.mean(librosa.feature.spectral_flatness(S=S)) |
|
|
flat_score = np.clip((flatness - 0.1) / 0.4, 0, 1) |
|
|
|
|
|
polish = 0.6 * dyn_score + 0.4 * flat_score |
|
|
return float(polish * 100.0) |
|
|
|
|
|
|
|
|
def compute_shade_score(ai_percent, autotune_idx, polish_idx): |
|
|
""" |
|
|
Shade Meter 0–100: |
|
|
- 60% AI likelihood |
|
|
- 25% autotune index |
|
|
- 15% production polish |
|
|
""" |
|
|
shade = 0.6 * ai_percent + 0.25 * autotune_idx + 0.15 * polish_idx |
|
|
return float(np.clip(shade, 0, 100)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def wrap_paragraph(text, width=90): |
|
|
lines = [] |
|
|
for para in text.splitlines(): |
|
|
if not para.strip(): |
|
|
lines.append("") |
|
|
continue |
|
|
lines.extend(textwrap.wrap(para, width=width)) |
|
|
return lines |
|
|
|
|
|
|
|
|
def build_scientific_analysis(ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, polish_idx): |
|
|
lines = [] |
|
|
lines.append("Overview") |
|
|
lines.append( |
|
|
f"This clip was analyzed using a hybrid signal-processing and deep-learning stack. " |
|
|
f"Based on embedding statistics, dynamic range, spectral behavior, and pitch stability, " |
|
|
f"the system estimates a {ai_pct:.1f}% probability that the source material is AI-generated, " |
|
|
f"and a {human_pct:.1f}% probability that it is primarily human-performed." |
|
|
) |
|
|
lines.append("") |
|
|
lines.append("Pitch & Autotune") |
|
|
lines.append( |
|
|
f"Fundamental frequency tracking suggests an autotune index of {autotune_idx:.1f}/100. " |
|
|
f"Lower scores indicate more organic pitch variance, while higher scores indicate quantized or " |
|
|
f"grid-snapped intonation." |
|
|
) |
|
|
lines.append("") |
|
|
lines.append("Rhythm & Tempo") |
|
|
lines.append( |
|
|
f"Tempo estimation places this performance at approximately {bpm:.1f} beats per minute. " |
|
|
f"The detected tempo is derived from onset strength peaks and may vary slightly with different sections " |
|
|
f"of the recording." |
|
|
) |
|
|
lines.append("") |
|
|
lines.append("Timbre & Production") |
|
|
lines.append( |
|
|
f"Timbre and dynamics analysis yields a production polish score of {polish_idx:.1f}/100. " |
|
|
f"Higher scores correspond to compressed, consistently loud, and spectrally uniform material, " |
|
|
f"often associated with heavily produced or synthetic audio." |
|
|
) |
|
|
lines.append("") |
|
|
lines.append("Musical Context") |
|
|
lines.append( |
|
|
f"Harmonic analysis indicates that the material centers around {key_sig}. " |
|
|
f"This key estimate is based on chroma energy distribution over the length of the clip." |
|
|
) |
|
|
return "\n".join(lines) |
|
|
|
|
|
|
|
|
def build_clapback(ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, voice_text): |
|
|
tone_lines = [] |
|
|
tone_lines.append("CLAPBACK SUMMARY") |
|
|
tone_lines.append("") |
|
|
if ai_pct >= 75: |
|
|
tone_lines.append( |
|
|
f"This track is giving **full robot fantasy** with an AI likelihood of {ai_pct:.1f}%. " |
|
|
f"If there was a human involved, they were probably just pressing 'render.'" |
|
|
) |
|
|
elif ai_pct >= 40: |
|
|
tone_lines.append( |
|
|
f"This performance lives in the uncanny valley with an AI likelihood of {ai_pct:.1f}%. " |
|
|
f"Some human in there, but the machines are definitely helping." |
|
|
) |
|
|
else: |
|
|
tone_lines.append( |
|
|
f"With only {ai_pct:.1f}% AI likelihood, this one is serving mostly human realness. " |
|
|
f"Congrats: your soul is still in the mix." |
|
|
) |
|
|
|
|
|
tone_lines.append("") |
|
|
if autotune_idx >= 70: |
|
|
tone_lines.append( |
|
|
f"Autotune index {autotune_idx:.1f}/100: every note is so locked to the grid it should pay rent there." |
|
|
) |
|
|
elif autotune_idx >= 35: |
|
|
tone_lines.append( |
|
|
f"Autotune index {autotune_idx:.1f}/100: tasteful correction, but we definitely hear the safety net." |
|
|
) |
|
|
else: |
|
|
tone_lines.append( |
|
|
f"Autotune index {autotune_idx:.1f}/100: pitch is flying mostly solo — brave, messy, and very human." |
|
|
) |
|
|
|
|
|
tone_lines.append("") |
|
|
tone_lines.append( |
|
|
f"Shade Meter score: {shade:.1f}/100. " |
|
|
f"Zero would mean unplugged, unprocessed, angel-on-a-stool vibes. " |
|
|
f"You're sitting at {shade:.1f}, which means there's at least a mild breeze of manufactured perfection " |
|
|
f"blowing through this mix." |
|
|
) |
|
|
|
|
|
tone_lines.append("") |
|
|
tone_lines.append( |
|
|
f"Musically, the track hangs out around {key_sig} at about {bpm:.1f} BPM, so if you’re clapping back on TikTok, " |
|
|
f"now you know what tempo to drag them in." |
|
|
) |
|
|
|
|
|
tone_lines.append("") |
|
|
tone_lines.append(f"Voice-tessitura take: {voice_text}") |
|
|
|
|
|
return "\n".join(tone_lines) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def scale_color(val, invert=False): |
|
|
""" |
|
|
For score boxes: |
|
|
- green: good |
|
|
- gold: medium |
|
|
- red: high risk |
|
|
""" |
|
|
if invert: |
|
|
|
|
|
if val <= 25: |
|
|
return colors.green |
|
|
if val <= 75: |
|
|
return GOLD |
|
|
return colors.red |
|
|
else: |
|
|
if val >= 75: |
|
|
return colors.green |
|
|
if val >= 25: |
|
|
return GOLD |
|
|
return colors.red |
|
|
|
|
|
|
|
|
def make_pdf( |
|
|
ai_score, |
|
|
human_score, |
|
|
atune, |
|
|
shade, |
|
|
key_sig, |
|
|
bpm, |
|
|
transcript, |
|
|
scientific_text, |
|
|
clapback_text, |
|
|
clip_title, |
|
|
polish_idx, |
|
|
): |
|
|
buffer = io.BytesIO() |
|
|
c = canvas.Canvas(buffer, pagesize=letter) |
|
|
W, H = letter |
|
|
|
|
|
|
|
|
c.setFillColor(WHITE) |
|
|
c.rect(0, 0, W, H, fill=1) |
|
|
|
|
|
|
|
|
try: |
|
|
c.drawImage(LOGO_FILE, 40, H - 120, width=90, height=90) |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
|
|
|
c.setFillColor(PRIMARY) |
|
|
c.setFont("PlayfairBold", 32) |
|
|
c.drawString(150, H - 60, "Tour de Fierce") |
|
|
|
|
|
c.setFillColor(ACCENT) |
|
|
c.setFont("Geneva", 14) |
|
|
c.drawString(150, H - 82, "Audio Clapback Report™") |
|
|
|
|
|
|
|
|
c.setFillColor(BLACK) |
|
|
c.setFont("Geneva", 10) |
|
|
c.drawString(150, H - 98, f"Generated: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M')}") |
|
|
c.setFont("Geneva", 12) |
|
|
c.drawString(40, H - 145, f"Clip analyzed: {clip_title}") |
|
|
|
|
|
|
|
|
try: |
|
|
import qrcode |
|
|
|
|
|
qr = qrcode.make(ENGINE_URL) |
|
|
buf = io.BytesIO() |
|
|
qr.save(buf, format="PNG") |
|
|
buf.seek(0) |
|
|
c.drawImage(ImageReader(buf), W - 120, H - 140, width=80, height=80) |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
|
|
|
c.setStrokeColor(LIGHT_GRAY) |
|
|
c.line(40, H - 165, W - 40, H - 165) |
|
|
|
|
|
|
|
|
c.setFillColor(scale_color(ai_score, invert=True)) |
|
|
c.rect(40, H - 260, 150, 80, fill=1) |
|
|
c.setFillColor(WHITE) |
|
|
c.setFont("Geneva", 11) |
|
|
c.drawString(55, H - 195, "AI Likelihood") |
|
|
c.setFont("PlayfairBold", 26) |
|
|
c.drawString(55, H - 220, f"{ai_score:.1f}%") |
|
|
|
|
|
c.setFillColor(scale_color(human_score)) |
|
|
c.rect(210, H - 260, 150, 80, fill=1) |
|
|
c.setFillColor(WHITE) |
|
|
c.setFont("Geneva", 11) |
|
|
c.drawString(225, H - 195, "Human Likelihood") |
|
|
c.setFont("PlayfairBold", 26) |
|
|
c.drawString(225, H - 220, f"{human_score:.1f}%") |
|
|
|
|
|
c.setFillColor(scale_color(atune, invert=True)) |
|
|
c.rect(380, H - 260, 150, 80, fill=1) |
|
|
c.setFillColor(WHITE) |
|
|
c.setFont("Geneva", 11) |
|
|
c.drawString(395, H - 195, "Autotune Index") |
|
|
c.setFont("PlayfairBold", 26) |
|
|
c.drawString(395, H - 220, f"{atune:.1f}/100") |
|
|
|
|
|
|
|
|
c.setFillColor(BLACK) |
|
|
c.setFont("Geneva", 12) |
|
|
c.drawString(40, H - 295, "Shade Meter") |
|
|
|
|
|
|
|
|
bar_y = H - 310 |
|
|
bar_height = 14 |
|
|
bar_width = 490 |
|
|
|
|
|
c.setFillColor(LIGHT_GRAY) |
|
|
c.roundRect(40, bar_y, bar_width, bar_height, 7, fill=1) |
|
|
|
|
|
|
|
|
c.setFillColor(ACCENT) |
|
|
fill_w = (shade / 100.0) * bar_width |
|
|
c.roundRect(40, bar_y, fill_w, bar_height, 7, fill=1) |
|
|
|
|
|
c.setFillColor(BLACK) |
|
|
c.setFont("Geneva", 10) |
|
|
c.drawString(540, bar_y + 1, f"{shade:.1f}/100") |
|
|
|
|
|
|
|
|
shade_blurb = ( |
|
|
"The Shade Meter provides a comprehensive analysis of the uploaded file, representing exactly " |
|
|
"how much shade you are entitled to direct toward the source of the clip. The ideal score is 0%, " |
|
|
"indicating real, acoustic instruments and un-pitch-corrected vocals. Moderate scores may reflect " |
|
|
"MIDI instruments or noticeably processed vocals. A 100 is the ultimate shade parade, with 100% " |
|
|
"confidence that the clip was generated by an AI system." |
|
|
) |
|
|
c.setFont("Geneva", 9) |
|
|
ytxt = H - 330 |
|
|
for line in wrap_paragraph(shade_blurb, width=95): |
|
|
c.drawString(40, ytxt, line) |
|
|
ytxt -= 11 |
|
|
|
|
|
|
|
|
ytxt -= 5 |
|
|
c.setFont("PlayfairBold", 18) |
|
|
c.setFillColor(PRIMARY) |
|
|
c.drawString(40, ytxt, "Musicality Analysis") |
|
|
ytxt -= 18 |
|
|
|
|
|
c.setFont("Geneva", 11) |
|
|
c.setFillColor(BLACK) |
|
|
c.drawString(40, ytxt, f"Key Signature: {key_sig}") |
|
|
ytxt -= 14 |
|
|
c.drawString(40, ytxt, f"Tempo (BPM): {bpm:.1f}") |
|
|
ytxt -= 20 |
|
|
|
|
|
|
|
|
c.setFont("PlayfairBold", 18) |
|
|
c.setFillColor(PRIMARY) |
|
|
c.drawString(40, ytxt, "Technical Forensic Analysis") |
|
|
ytxt -= 18 |
|
|
|
|
|
c.setFont("Geneva", 10) |
|
|
c.setFillColor(BLACK) |
|
|
for line in wrap_paragraph(scientific_text, width=95): |
|
|
if ytxt < 60: |
|
|
c.showPage() |
|
|
W2, H2 = letter |
|
|
c.setFont("Geneva", 10) |
|
|
ytxt = H2 - 60 |
|
|
c.drawString(40, ytxt, line) |
|
|
ytxt -= 11 |
|
|
|
|
|
|
|
|
ytxt -= 10 |
|
|
c.setFont("PlayfairBold", 18) |
|
|
c.setFillColor(PRIMARY) |
|
|
if ytxt < 60: |
|
|
c.showPage() |
|
|
W2, H2 = letter |
|
|
ytxt = H2 - 60 |
|
|
c.drawString(40, ytxt, "Clapback Shade Report") |
|
|
ytxt -= 18 |
|
|
|
|
|
c.setFont("Geneva", 10) |
|
|
c.setFillColor(BLACK) |
|
|
for line in wrap_paragraph(clapback_text, width=95): |
|
|
if ytxt < 60: |
|
|
c.showPage() |
|
|
W2, H2 = letter |
|
|
c.setFont("Geneva", 10) |
|
|
ytxt = H2 - 60 |
|
|
c.drawString(40, ytxt, line) |
|
|
ytxt -= 11 |
|
|
|
|
|
|
|
|
ytxt -= 10 |
|
|
c.setFont("PlayfairBold", 18) |
|
|
c.setFillColor(PRIMARY) |
|
|
if ytxt < 60: |
|
|
c.showPage() |
|
|
W2, H2 = letter |
|
|
ytxt = H2 - 60 |
|
|
c.drawString(40, ytxt, "Transcript") |
|
|
ytxt -= 18 |
|
|
|
|
|
c.setFont("Geneva", 9) |
|
|
c.setFillColor(BLACK) |
|
|
for line in wrap_paragraph(transcript, width=100): |
|
|
if ytxt < 50: |
|
|
c.showPage() |
|
|
W2, H2 = letter |
|
|
c.setFont("Geneva", 9) |
|
|
ytxt = H2 - 60 |
|
|
c.drawString(40, ytxt, line) |
|
|
ytxt -= 10 |
|
|
|
|
|
|
|
|
c.setStrokeColor(LIGHT_GRAY) |
|
|
c.line(40, 40, W - 40, 40) |
|
|
c.setFont("Geneva", 9) |
|
|
c.drawString(40, 28, "© 2025 Tour de Fierce — All Shade, No Shame.") |
|
|
c.drawString(300, 28, "www.tourdefierce.vip") |
|
|
|
|
|
c.save() |
|
|
buffer.seek(0) |
|
|
|
|
|
fname = f"clapback-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}.pdf" |
|
|
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=f"_{fname}") |
|
|
tmp.write(buffer.getvalue()) |
|
|
tmp.close() |
|
|
return tmp.name |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_analysis(audio_file): |
|
|
if not audio_file: |
|
|
return ( |
|
|
"No audio file uploaded.", |
|
|
"", |
|
|
"", |
|
|
"", |
|
|
"", |
|
|
"", |
|
|
"", |
|
|
"", |
|
|
"", |
|
|
"", |
|
|
None, |
|
|
) |
|
|
|
|
|
|
|
|
y, sr = librosa.load(audio_file, sr=16000, mono=True) |
|
|
|
|
|
|
|
|
try: |
|
|
text = asr_pipe({"array": y, "sampling_rate": sr})["text"] |
|
|
except Exception: |
|
|
text = "[Transcription unavailable]" |
|
|
|
|
|
|
|
|
autotune_idx = compute_autotune_index(y, sr) |
|
|
polish_idx = compute_production_polish(y, sr) |
|
|
|
|
|
emb = extract_embeddings(y, sr) |
|
|
ai_prob = calculate_ai_probability(emb, y, sr, autotune_idx) |
|
|
human_prob = 1.0 - ai_prob |
|
|
|
|
|
ai_pct = ai_prob * 100.0 |
|
|
human_pct = human_prob * 100.0 |
|
|
|
|
|
shade = compute_shade_score(ai_pct, autotune_idx, polish_idx) |
|
|
key_sig = detect_key(y, sr) |
|
|
bpm = detect_bpm(y, sr) |
|
|
voice_text = estimate_voice_type(y, sr) |
|
|
|
|
|
scientific_text = build_scientific_analysis( |
|
|
ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, polish_idx |
|
|
) |
|
|
clapback_text = build_clapback( |
|
|
ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, voice_text |
|
|
) |
|
|
|
|
|
clip_title = os.path.basename(audio_file) |
|
|
|
|
|
pdf_path = make_pdf( |
|
|
ai_pct, |
|
|
human_pct, |
|
|
autotune_idx, |
|
|
shade, |
|
|
key_sig, |
|
|
bpm, |
|
|
text, |
|
|
scientific_text, |
|
|
clapback_text, |
|
|
clip_title, |
|
|
polish_idx, |
|
|
) |
|
|
|
|
|
return ( |
|
|
text, |
|
|
f"{ai_pct:.1f}%", |
|
|
f"{human_pct:.1f}%", |
|
|
f"{autotune_idx:.1f}", |
|
|
f"{shade:.1f}", |
|
|
key_sig, |
|
|
f"{bpm:.1f}", |
|
|
voice_text, |
|
|
scientific_text, |
|
|
clapback_text, |
|
|
pdf_path, |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with gr.Blocks() as demo: |
|
|
gr.HTML( |
|
|
""" |
|
|
<div style='text-align:center; padding:20px;'> |
|
|
<h1 style='font-size:36px; font-weight:800;'> |
|
|
👋 Tour de Fierce Audio Clapback Engine™ |
|
|
</h1> |
|
|
<p style='color:#ccc;'> |
|
|
AI Detector • Autotune Detector • Key & BPM • Forensic Reporting |
|
|
</p> |
|
|
</div> |
|
|
""" |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
audio_in = gr.Audio(type="filepath", label="Upload audio") |
|
|
run_btn = gr.Button("Run Clapback 👏", variant="primary") |
|
|
|
|
|
with gr.Row(): |
|
|
transcript = gr.Textbox( |
|
|
label="Transcript", |
|
|
interactive=False, |
|
|
lines=5, |
|
|
show_label=True, |
|
|
) |
|
|
|
|
|
with gr.Row(): |
|
|
ai_out = gr.Textbox(label="AI Likelihood", interactive=False) |
|
|
human_out = gr.Textbox(label="Human Likelihood", interactive=False) |
|
|
atune_out = gr.Textbox(label="Autotune Index", interactive=False) |
|
|
|
|
|
with gr.Row(): |
|
|
shade_out = gr.Textbox(label="Shade Meter", interactive=False) |
|
|
key_out = gr.Textbox(label="Key Signature", interactive=False) |
|
|
bpm_out = gr.Textbox(label="Tempo (BPM)", interactive=False) |
|
|
voice_out = gr.Textbox(label="Suggested Voice Type", interactive=False) |
|
|
|
|
|
with gr.Row(): |
|
|
forensic_out = gr.Textbox( |
|
|
label="Technical Forensic Analysis", |
|
|
interactive=False, |
|
|
lines=12, |
|
|
) |
|
|
clapback_out = gr.Textbox( |
|
|
label="Clapback Shade Report", |
|
|
interactive=False, |
|
|
lines=12, |
|
|
) |
|
|
|
|
|
pdf_download = gr.File(label="Download Report") |
|
|
|
|
|
run_btn.click( |
|
|
fn=run_analysis, |
|
|
inputs=audio_in, |
|
|
outputs=[ |
|
|
transcript, |
|
|
ai_out, |
|
|
human_out, |
|
|
atune_out, |
|
|
shade_out, |
|
|
key_out, |
|
|
bpm_out, |
|
|
voice_out, |
|
|
forensic_out, |
|
|
clapback_out, |
|
|
pdf_download, |
|
|
], |
|
|
) |
|
|
|
|
|
demo.launch() |
|
|
|