import os
import io
import tempfile
import datetime
import textwrap
import shutil
import uuid
import random

import numpy as np
import torch
import librosa
import gradio as gr
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import ImageReader
from reportlab.lib import colors
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from transformers import (
    WhisperProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoFeatureExtractor,
    AutoModel,
)
from transformers import pipeline as hf_pipeline

# ---- SciPy / librosa compatibility patch ----
# Older librosa releases still reference scipy.signal.hann, which newer SciPy
# moved to scipy.signal.windows.hann; re-expose it when missing.
try:
    import scipy.signal as _sg
    from scipy.signal import windows as _win

    if not hasattr(_sg, "hann"):
        _sg.hann = _win.hann
except Exception:
    pass

# ---- Fonts (fail gracefully if missing) ----
DEFAULT_TITLE_FONT = "Helvetica-Bold"
DEFAULT_BODY_FONT = "Helvetica"

try:
    pdfmetrics.registerFont(TTFont("PlayfairBold", "PlayfairDisplay-Bold.ttf"))
    TITLE_FONT = "PlayfairBold"
except Exception:
    TITLE_FONT = DEFAULT_TITLE_FONT

try:
    pdfmetrics.registerFont(TTFont("Geneva", "Geneva.ttf"))
    BODY_FONT = "Geneva"
except Exception:
    BODY_FONT = DEFAULT_BODY_FONT

# ---- Style ----
ACCENT = colors.HexColor("#8b5cf6")
PRIMARY = colors.HexColor("#3b0c3f")
LIGHT_GRAY = colors.HexColor("#e6e6e6")
GOLD = colors.HexColor("#f4c542")
WHITE = colors.white
BLACK = colors.black

ENGINE_URL = "https://www.tourdefierce.vip/ai-music-detector"
LOGO_FILE = "logo.jpg"
ASR_MODEL = "openai/whisper-tiny.en"
CLF_MODEL = "microsoft/wavlm-base-plus-sv"
MAX_AUDIO_DURATION_S = 45.0

# ---- Device ----
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
PIPELINE_DEVICE = 0 if DEVICE == "cuda" else -1
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32

# ---- Load Models ----
processor = WhisperProcessor.from_pretrained(ASR_MODEL)
asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    ASR_MODEL,
    torch_dtype=TORCH_DTYPE,
)
asr_model.eval()
if DEVICE == "cuda":
    asr_model.to(DEVICE)

asr_pipe = hf_pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=20,
    device=PIPELINE_DEVICE,
)

clf_processor = AutoFeatureExtractor.from_pretrained(CLF_MODEL)
clf_model = AutoModel.from_pretrained(CLF_MODEL)
if DEVICE == "cuda":
    clf_model.to(DEVICE)
clf_model.eval()


# ---- Helpers ----
def safe_copy_to_tmp(audio_file_path: str) -> str:
    """Copy the upload to a uniquely named file in the system temp directory."""
    ext = os.path.splitext(audio_file_path)[1] or ".wav"
    safe_name = f"clapback_{uuid.uuid4().hex}{ext}"
    # tempfile.gettempdir() instead of a hard-coded /tmp keeps this portable
    safe_path = os.path.join(tempfile.gettempdir(), safe_name)
    shutil.copy2(audio_file_path, safe_path)
    return safe_path


def compute_autotune_index(y, sr):
    """Heuristic autotune index 0–100 with quantization + variance."""
    f0, voiced, _ = librosa.pyin(
        y,
        sr=sr,
        fmin=librosa.note_to_hz("C2"),
        fmax=librosa.note_to_hz("C6"),
    )
    if f0 is None:
        return 0.0
    voiced_mask = voiced > 0.5
    f0 = f0[voiced_mask]
    if len(f0) < 10:
        return 0.0

    # remove NaNs (pyin reports unvoiced frames as NaN)
    f0 = f0[~np.isnan(f0)]
    if len(f0) < 10:
        return 0.0

    # pitch variance in log space
    log_f0 = np.log(f0)
    std = np.std(log_f0)
    std_norm = np.clip(std / 0.25, 0, 1)
    variance_score = 1.0 - std_norm  # flatter -> closer to 1

    # quantization: how close to equal-tempered semitones
    midi = librosa.hz_to_midi(f0)
    nearest = np.round(midi)
    dist = np.abs(midi - nearest)
    mean_dist = np.mean(dist)
    quantization_score = 1.0 - np.clip(mean_dist / 0.5, 0, 1)  # within half-step => high

    # combine
    score = 0.5 * variance_score + 0.5 * quantization_score
    return float(np.clip(score * 100.0, 0.0, 100.0))
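
# Hedged sanity check for the autotune heuristic above (illustrative helper,
# never called by the app): a pure tone locked to an equal-tempered pitch
# should score near the top of the 0-100 scale, while the same tone with a
# synthetic ~5.5 Hz vibrato typically scores noticeably lower.
def _autotune_index_smoke_test(sr=16000, duration_s=3.0):
    t = np.linspace(0.0, duration_s, int(duration_s * sr), endpoint=False)
    a4 = librosa.note_to_hz("A4")
    steady = np.sin(2 * np.pi * a4 * t)  # pitch-locked tone
    # phase modulation approximates vibrato around the same center pitch
    vibrato = np.sin(2 * np.pi * a4 * t + 2.0 * np.sin(2 * np.pi * 5.5 * t))
    return compute_autotune_index(steady, sr), compute_autotune_index(vibrato, sr)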

def extract_embeddings(y, sr):
    """Mean-pooled WavLM hidden states as a fixed-size clip embedding."""
    inp = clf_processor(y, sampling_rate=sr, return_tensors="pt")
    if DEVICE == "cuda":
        inp = {k: v.to(DEVICE) for k, v in inp.items()}
    with torch.no_grad():
        out = clf_model(**inp).last_hidden_state.mean(dim=1).squeeze()
    return out.detach().cpu().numpy()


def calculate_ai_probability(emb, y, sr, autotune_idx, polish_idx):
    """Heuristic AI probability in [0, 1] from five equally weighted cues."""
    # embedding norm – style/complexity proxy
    norm = np.linalg.norm(emb)
    norm_scaled = np.clip((norm - 40.0) / 120.0, 0.0, 1.0)

    # dynamic range – very flat dynamics often => synthetic/over-processed
    S = np.abs(librosa.stft(y))
    rms = librosa.feature.rms(S=S)[0]
    dyn = np.percentile(rms, 95) - np.percentile(rms, 5)
    dyn_scaled = 1.0 - np.clip((dyn - 0.02) / 0.10, 0.0, 1.0)

    # spectral flatness – noise-like / super-smeared spectra
    flatness = np.mean(librosa.feature.spectral_flatness(S=S))
    flat_scaled = np.clip((flatness - 0.15) / 0.35, 0.0, 1.0)

    # autotune + polish normalized
    at_scaled = autotune_idx / 100.0
    polish_scaled = polish_idx / 100.0

    # weighted combo, affinely squashed so the score never claims certainty
    raw = (
        0.20 * norm_scaled
        + 0.20 * dyn_scaled
        + 0.20 * flat_scaled
        + 0.20 * at_scaled
        + 0.20 * polish_scaled
    )
    ai_prob = float(np.clip(raw * 0.95 + 0.05, 0.05, 0.99))
    return ai_prob


def detect_key(y, sr):
    """Rough key estimate: chroma argmax picks the root, then major vs. minor
    is decided by comparing energy at the major and minor third above it."""
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    chroma_mean = chroma.mean(axis=1)
    idx = int(np.argmax(chroma_mean))
    KEYS = ["C", "C#", "D", "Eb", "E", "F", "F#", "G", "Ab", "A", "Bb", "B"]
    root = KEYS[idx]
    maj = chroma_mean[(idx + 4) % 12] + chroma_mean[(idx + 7) % 12]
    min_ = chroma_mean[(idx + 3) % 12] + chroma_mean[(idx + 7) % 12]
    return f"{root} major" if maj >= min_ else f"{root} minor"
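
# Illustrative check for detect_key (hypothetical helper, not wired into the
# app): a root-weighted C-major triad built from pure sines should come back
# as "C major", since all the chroma energy sits on C, E, and G.
def _detect_key_smoke_test(sr=16000, duration_s=2.0):
    t = np.linspace(0.0, duration_s, int(duration_s * sr), endpoint=False)
    amps = {"C4": 1.0, "E4": 0.6, "G4": 0.6}  # boost the root so argmax lands on C
    chord = sum(a * np.sin(2 * np.pi * librosa.note_to_hz(n) * t) for n, a in amps.items())
    return detect_key(chord.astype(np.float32), sr)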

def detect_bpm(y, sr):
    onset = librosa.onset.onset_strength(y=y, sr=sr)
    # librosa >= 0.10 moved tempo estimation to librosa.feature.tempo and
    # deprecated librosa.beat.tempo; fall back for older releases.
    try:
        tempo = librosa.feature.tempo(onset_envelope=onset, sr=sr)
    except AttributeError:
        tempo = librosa.beat.tempo(onset_envelope=onset, sr=sr)
    if tempo is None or len(tempo) == 0:
        return 0.0
    return float(tempo[0])
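
# Rough tempo sanity check (illustrative, unused by the app): an impulse train
# spaced for 120 BPM should be detected at or near 120, or at an octave of it,
# since beat trackers are prone to tempo halving/doubling.
def _detect_bpm_smoke_test(sr=16000, bpm=120.0, duration_s=8.0):
    y = np.zeros(int(duration_s * sr), dtype=np.float32)
    period = int(sr * 60.0 / bpm)  # samples between beats
    y[::period] = 1.0
    return detect_bpm(y, sr)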

def estimate_voice_type(y, sr):
    f0, voiced, _ = librosa.pyin(
        y,
        sr=sr,
        fmin=librosa.note_to_hz("C2"),
        fmax=librosa.note_to_hz("C6"),
    )
    if f0 is None or np.sum(voiced) < 5:
        return "Unable to estimate voice type from this clip."
    f0 = f0[voiced > 0.5]
    f0 = f0[~np.isnan(f0)]
    if len(f0) < 5:
        return "Unable to estimate voice type from this clip."
    median = np.median(f0)
    if median < librosa.note_to_hz("G3"):
        base = "lower voice (baritone / contralto or low mezzo range)"
    elif median < librosa.note_to_hz("C4"):
        base = "mid voice (mezzo / baritenor range)"
    else:
        base = "higher voice (tenor or soprano range)"
    return (
        "Given the tessitura of this excerpt, this material is best suited for a "
        f"{base}. Remember: this is about where the voice lives most comfortably, "
        "not the single highest or lowest note you can hit on a good day."
    )


def compute_production_polish(y, sr):
    """0–100 'studio gloss' score from dynamic range + spectral flatness."""
    S = np.abs(librosa.stft(y))
    rms = librosa.feature.rms(S=S)[0]
    dyn = np.percentile(rms, 95) - np.percentile(rms, 5)
    dyn_score = 1.0 - np.clip((dyn - 0.015) / 0.12, 0.0, 1.0)
    flat = np.mean(librosa.feature.spectral_flatness(S=S))
    flat_score = np.clip((flat - 0.10) / 0.40, 0.0, 1.0)
    polish = 0.6 * dyn_score + 0.4 * flat_score
    return float(np.clip(polish * 100.0, 0.0, 100.0))


def compute_shade_score(ai_percent, autotune_idx, polish_idx):
    return float(
        np.clip(
            0.60 * ai_percent + 0.25 * autotune_idx + 0.15 * polish_idx,
            0.0,
            100.0,
        )
    )
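
# Worked example for the Shade Meter blend above (hypothetical inputs): an AI
# likelihood of 80%, autotune 60/100, and polish 70/100 combine to
#   0.60 * 80 + 0.25 * 60 + 0.15 * 70 = 48.0 + 15.0 + 10.5 = 73.5 / 100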

def wrap_paragraph(text, width=90):
    out = []
    for p in text.splitlines():
        if not p.strip():
            out.append("")
        else:
            out.extend(textwrap.wrap(p, width))
    return out


def build_scientific_analysis(ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, polish_idx):
    lines = []
    lines.append("Overview")
    lines.append(
        "This clip was analyzed using a hybrid stack of signal processing, neural embeddings, "
        "and heuristic scoring. These numbers are not legal evidence, but they are useful "
        "for spotting tracks that are suspiciously polished, mechanically tuned, or dynamically flat."
    )
    lines.append(
        f"The current model estimates a {ai_pct:.1f}% likelihood that the material behaves like "
        f"AI-generated or heavily synthetic audio, and a {human_pct:.1f}% likelihood that it behaves "
        "like a primarily human performance captured through microphones."
    )
    lines.append("")
    lines.append("Pitch & Autotune")
    lines.append(
        "The autotune index looks at how stable the pitch is and how closely it snaps to equal-tempered "
        "semitones. Organic vocals wobble, lean, and breathe around the center of a note. "
        f"This recording scores {autotune_idx:.1f}/100, where higher values suggest stronger pitch correction "
        "or note-by-note MIDI-style control."
    )
    lines.append("")
    lines.append("Rhythm & Tempo")
    lines.append(
        f"Estimated tempo is about {bpm:.1f} BPM, based on onset strength peaks. AI systems and loop-based "
        "production often favor perfectly locked grids, while live players introduce small fluctuations and "
        "rubato. Extreme metronomic precision can be a clue, but never proof, of machine involvement."
    )
    lines.append("")
    lines.append("Timbre, Dynamics & Production Polish")
    lines.append(
        f"The production polish score is {polish_idx:.1f}/100. This combines dynamic range and spectral flatness "
        "to estimate how 'studio-perfect' the mix feels. Very compressed, spectrally uniform audio often lands "
        "higher on this scale, especially when combined with high autotune and AI-likelihood scores."
    )
    lines.append("")
    lines.append("Harmonic Context")
    lines.append(
        f"Harmonic analysis suggests the material centers around {key_sig}. This is inferred from aggregate "
        "chroma energy across the clip, so modulations, key changes, and non-diatonic writing can all nudge the "
        "estimate around a bit."
    )
    lines.append("")
    lines.append("Transparency Reminder")
    lines.append(
        "None of these metrics are a moral judgment. Instead, they are a nudge toward transparency: "
        "if AI tools, heavy tuning, or ghost-production were involved in creating this track, crediting those "
        "tools honestly respects both listeners and the human musicians whose work trained these systems."
    )
    return "\n".join(lines)


def pick_clapback_line(ai_pct, autotune_idx, shade):
    """Return a single clapback line keyed to the scores."""
    if ai_pct >= 90:
        options = [
            "Breaking news: unexplained seismic activity has been detected near the graves of Beethoven, "
            "Mozart, and Bach. Early reports blame this upload. This track is AI slop in 4K.",
            "Somewhere, a laptop is feeling very proud of itself. This isn’t a band — it’s a render queue with Wi-Fi.",
            "If this were any more synthetic, it would come with a microchip and a warranty card.",
        ]
    elif ai_pct >= 70:
        options = [
            "This song is serving joint custody: half human, half hard drive, and the computer’s winning the case.",
            "We’ve got a messy situationship between a vocalist and their plug-ins. I’m not saying it’s AI… but I am side-eyeing the timeline.",
            "This track definitely knows what a GPU is. Let’s just say the ones and zeros are heavily featured.",
        ]
    elif ai_pct >= 40:
        options = [
            "There’s real human in here, but the plug-ins are loud enough to demand a songwriting credit.",
            "You’re walking the uncanny valley runway — human energy with a noticeable AI glam squad backstage.",
            "This sounds like a human performance that went on a spa retreat and came back with suspiciously smooth skin.",
        ]
    else:  # mostly human
        if shade <= 25 and autotune_idx <= 30:
            options = [
                "Somebody call the local paper — we’ve got a live musician on our hands. This kind of organic performance is officially an endangered species.",
                "You can tell there’s a real set of lungs behind this. Frame this waveform and hang it in the Museum of Things Not Generated by a Bot.",
                "Congratulations: your track passes the ‘did an actual human show up to the session’ test with flying colors.",
            ]
        else:
            options = [
                "Underneath the gloss, there’s a real human doing the work. Keep your receipts, keep your technique sharp, and don’t let AI slop steal your shine.",
                "The mix is polished, but the heartbeat is human. Today’s not the day you have to drag anyone in the comments.",
                "There’s plenty of production here, but the soul is still analog. Hydrate, vocal warm-up, and live to fight another track.",
            ]
    return random.choice(options)


def build_clapback(ai_pct, human_pct, autotune_idx, shade, key_sig, bpm, voice_text):
    lines = []
    lines.append("CLAPBACK SUMMARY")
    lines.append("")
    lines.append(
        f"AI likelihood: {ai_pct:.1f}% | Human likelihood: {human_pct:.1f}% | "
        f"Autotune index: {autotune_idx:.1f}/100 | Shade Meter: {shade:.1f}/100."
    )
    lines.append(
        f"Key center is estimated around {key_sig} at roughly {bpm:.1f} BPM. Use that if you’re planning a duet, "
        "a remix, or a courtroom exhibit."
    )
    lines.append("")
    lines.append(voice_text)
    lines.append("")
    lines.append(pick_clapback_line(ai_pct, autotune_idx, shade))
    return "\n".join(lines)


def scale_color(v, invert=False):
    """Map a 0–100 score to green/gold/red; invert when lower is better."""
    if invert:
        return colors.green if v <= 25 else GOLD if v <= 75 else colors.red
    return colors.green if v >= 75 else GOLD if v >= 25 else colors.red


def make_pdf(ai, human, atune, shade, key, bpm, trans, science, clap, clip, polish):
    buf = io.BytesIO()
    c = canvas.Canvas(buf, pagesize=letter)
    W, H = letter

    # background
    c.setFillColor(WHITE)
    c.rect(0, 0, W, H, fill=1)

    # logo
    try:
        c.drawImage(LOGO_FILE, 40, H - 120, width=90, height=90)
    except Exception:
        pass

    # branding
    c.setFillColor(PRIMARY)
    c.setFont(TITLE_FONT, 32)
    c.drawString(150, H - 60, "Tour de Fierce")
    c.setFillColor(ACCENT)
    c.setFont(BODY_FONT, 14)
    c.drawString(150, H - 82, "Audio Clapback Report™")

    # meta
    c.setFillColor(BLACK)
    c.setFont(BODY_FONT, 10)
    c.drawString(150, H - 98, f"Generated: {datetime.datetime.now():%Y-%m-%d %H:%M}")
    c.drawString(40, H - 145, f"Clip analyzed: {clip}")

    # QR code linking back to the engine (skipped if qrcode is not installed)
    try:
        import qrcode

        q = qrcode.make(ENGINE_URL)
        b = io.BytesIO()
        q.save(b, "PNG")
        b.seek(0)
        c.drawImage(ImageReader(b), W - 120, H - 140, width=80, height=80)
    except Exception:
        pass

    # divider
    c.setStrokeColor(LIGHT_GRAY)
    c.line(40, H - 165, W - 40, H - 165)

    # score boxes
    c.setFillColor(scale_color(ai, invert=True))
    c.rect(40, H - 260, 150, 80, fill=1)
    c.setFillColor(WHITE)
    c.setFont(BODY_FONT, 11)
    c.drawString(55, H - 195, "AI Likelihood")
    c.setFont(TITLE_FONT, 26)
    c.drawString(55, H - 220, f"{ai:.1f}%")

    c.setFillColor(scale_color(human))
    c.rect(210, H - 260, 150, 80, fill=1)
    c.setFillColor(WHITE)
    c.setFont(BODY_FONT, 11)
    c.drawString(225, H - 195, "Human Likelihood")
    c.setFont(TITLE_FONT, 26)
    c.drawString(225, H - 220, f"{human:.1f}%")

    c.setFillColor(scale_color(atune, invert=True))
    c.rect(380, H - 260, 150, 80, fill=1)
    c.setFillColor(WHITE)
    c.setFont(BODY_FONT, 11)
    c.drawString(395, H - 195, "Autotune Index")
    c.setFont(TITLE_FONT, 26)
    c.drawString(395, H - 220, f"{atune:.1f}/100")

    # shade meter
    c.setFillColor(BLACK)
    c.setFont(BODY_FONT, 12)
    c.drawString(40, H - 295, "Shade Meter")
    bar_y = H - 310
    bar_width = 490
    bar_height = 14
    c.setFillColor(LIGHT_GRAY)
    c.roundRect(40, bar_y, bar_width, bar_height, 7, fill=1)
    c.setFillColor(ACCENT)
    c.roundRect(40, bar_y, (shade / 100.0) * bar_width, bar_height, 7, fill=1)
    c.setFillColor(BLACK)
    c.setFont(BODY_FONT, 10)
    c.drawString(540, bar_y + 1, f"{shade:.1f}/100")

    txt = H - 330
    c.setFont(BODY_FONT, 9)
    for line in wrap_paragraph(
        "The Shade Meter blends AI-likelihood, autotune strength, and production polish into a single, "
        "dramatic number. 0% is unplugged, unprocessed, angel-on-a-stool realness. 100% is full synthetic "
        "fantasy — the kind of track that owes royalties to its GPU.",
        95,
    ):
        c.drawString(40, txt, line)
        txt -= 11

    # musicality
    txt -= 5
    c.setFont(TITLE_FONT, 18)
    c.setFillColor(PRIMARY)
    c.drawString(40, txt, "Musicality Analysis")
    txt -= 18
    c.setFont(BODY_FONT, 11)
    c.setFillColor(BLACK)
    c.drawString(40, txt, f"Key Signature: {key}")
    txt -= 14
    c.drawString(40, txt, f"Tempo (BPM): {bpm:.1f}")
    txt -= 20

    # technical analysis (page-break whenever the cursor nears the footer;
    # showPage() resets the graphics state, so font/color must be re-set)
    c.setFont(TITLE_FONT, 18)
    c.setFillColor(PRIMARY)
    c.drawString(40, txt, "Technical Forensic Analysis")
    txt -= 18
    c.setFont(BODY_FONT, 10)
    c.setFillColor(BLACK)
    for line in wrap_paragraph(science, 95):
        if txt < 60:
            c.showPage()
            txt = H - 60
            c.setFont(BODY_FONT, 10)
            c.setFillColor(BLACK)
        c.drawString(40, txt, line)
        txt -= 11

    # clapback
    txt -= 10
    c.setFont(TITLE_FONT, 18)
    c.setFillColor(PRIMARY)
    if txt < 60:
        c.showPage()
        txt = H - 60
        c.setFont(TITLE_FONT, 18)
        c.setFillColor(PRIMARY)
    c.drawString(40, txt, "Clapback Shade Report")
    txt -= 18
    c.setFont(BODY_FONT, 10)
    c.setFillColor(BLACK)
    for line in wrap_paragraph(clap, 95):
        if txt < 60:
            c.showPage()
            txt = H - 60
            c.setFont(BODY_FONT, 10)
            c.setFillColor(BLACK)
        c.drawString(40, txt, line)
        txt -= 11

    # transcript
    txt -= 10
    c.setFont(TITLE_FONT, 18)
    c.setFillColor(PRIMARY)
    if txt < 60:
        c.showPage()
        txt = H - 60
        c.setFont(TITLE_FONT, 18)
        c.setFillColor(PRIMARY)
    c.drawString(40, txt, "Transcript")
    txt -= 18
    c.setFont(BODY_FONT, 9)
    c.setFillColor(BLACK)
    for line in wrap_paragraph(trans, 100):
        if txt < 50:
            c.showPage()
            txt = H - 60
            c.setFont(BODY_FONT, 9)
            c.setFillColor(BLACK)
        c.drawString(40, txt, line)
        txt -= 10

    # footer
    year = datetime.datetime.now().year
    c.setStrokeColor(LIGHT_GRAY)
    c.line(40, 40, W - 40, 40)
    c.setFont(BODY_FONT, 9)
    c.setFillColor(BLACK)
    c.drawString(40, 28, f"© {year} Tour de Fierce — All Shade, No Shame.")
    c.drawString(300, 28, "www.tourdefierce.vip")

    c.save()
    buf.seek(0)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp.write(buf.getvalue())
    tmp.close()
    return tmp.name
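
# Standalone usage sketch for make_pdf (hypothetical values; handy when
# iterating on the report layout without running any audio analysis):
#
#     pdf_path = make_pdf(
#         ai=72.5, human=27.5, atune=61.0, shade=66.4, key="A minor",
#         bpm=120.0, trans="[no transcript]", science="Layout test.",
#         clap="Layout test.", clip="demo.wav", polish=80.0,
#     )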

# ---- Main analysis ----
def run_analysis(audio_file):
    if not audio_file:
        return ("", "", "", "", "", "", "", "", "", "", None)

    clip = os.path.basename(audio_file)
    safe = safe_copy_to_tmp(audio_file)

    # Whisper expects 16 kHz mono; cap the clip length up front
    y, sr = librosa.load(safe, sr=16000, mono=True)
    max_samples = int(MAX_AUDIO_DURATION_S * sr)
    if len(y) > max_samples:
        y = y[:max_samples]

    # transcription
    try:
        text = asr_pipe({"array": y, "sampling_rate": sr})["text"]
    except Exception:
        text = "[Transcription unavailable]"

    autotune = compute_autotune_index(y, sr)
    polish = compute_production_polish(y, sr)
    emb = extract_embeddings(y, sr)
    ai = calculate_ai_probability(emb, y, sr, autotune, polish)
    human = 1.0 - ai
    ai_pct = ai * 100.0
    human_pct = human * 100.0
    shade = compute_shade_score(ai_pct, autotune, polish)
    key = detect_key(y, sr)
    bpm = detect_bpm(y, sr)
    voice = estimate_voice_type(y, sr)

    science = build_scientific_analysis(ai_pct, human_pct, autotune, shade, key, bpm, polish)
    clap = build_clapback(ai_pct, human_pct, autotune, shade, key, bpm, voice)
    pdf = make_pdf(ai_pct, human_pct, autotune, shade, key, bpm, text, science, clap, clip, polish)

    return (
        text,
        f"{ai_pct:.1f}%",
        f"{human_pct:.1f}%",
        f"{autotune:.1f}",
        f"{shade:.1f}",
        key,
        f"{bpm:.1f}",
        voice,
        science,
        clap,
        pdf,
    )
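
# Headless usage sketch (assumes some local "clip.wav"; hypothetical path).
# run_analysis returns the same 11-tuple the UI unpacks, PDF path last:
#
#     outputs = run_analysis("clip.wav")
#     transcript_text, ai_pct_str = outputs[0], outputs[1]
#     pdf_path = outputs[-1]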

# ---- UI ----
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center;">
            <h1>👋 Tour de Fierce Audio Clapback Engine™</h1>
            <p>AI Detector • Autotune • Key • BPM • Shade Reporting</p>
        </div>
        """
    )
    with gr.Row():
        audio_in = gr.Audio(type="filepath", label="Upload audio")
        run_btn = gr.Button("Run Clapback 👏", variant="primary")

    transcript = gr.Textbox(label="Transcript", lines=6, interactive=False)

    with gr.Row():
        ai_out = gr.Textbox(label="AI Likelihood", interactive=False)
        human_out = gr.Textbox(label="Human Likelihood", interactive=False)
        atune_out = gr.Textbox(label="Autotune Index", interactive=False)

    with gr.Row():
        shade_out = gr.Textbox(label="Shade Meter", interactive=False)
        key_out = gr.Textbox(label="Key Signature", interactive=False)
        bpm_out = gr.Textbox(label="Tempo (BPM)", interactive=False)

    voice_out = gr.Textbox(label="Suggested Voice Type", interactive=False)

    with gr.Row():
        forensic_out = gr.Textbox(
            label="Technical Forensic Analysis",
            lines=14,
            interactive=False,
        )
        clapback_out = gr.Textbox(
            label="Clapback Shade Report",
            lines=14,
            interactive=False,
        )

    pdf_download = gr.File(label="Download Report")

    run_btn.click(
        fn=run_analysis,
        inputs=audio_in,
        outputs=[
            transcript,
            ai_out,
            human_out,
            atune_out,
            shade_out,
            key_out,
            bpm_out,
            voice_out,
            forensic_out,
            clapback_out,
            pdf_download,
        ],
    )

if __name__ == "__main__":
    demo.launch()