import os
import re
import heapq
import uuid
import asyncio
import edge_tts
import gradio as gr
import nest_asyncio
from PyPDF2 import PdfReader
from pydub import AudioSegment
from transformers import pipeline
import concurrent.futures
from edge_tts import VoicesManager
import random
from pydub.exceptions import CouldntDecodeError
# Apply nested event loop patch for Jupyter/Colab
nest_asyncio.apply()
# Load the LLM used to turn paper sections into podcast dialogue
generator = pipeline(
    "text-generation",
    model="unsloth/gemma-3-1b-it",
    device_map="cpu",
    max_new_tokens=350,
    do_sample=True,
    temperature=0.7,
)
# Async function to pick one male and one female English voice
async def get_english_voices():
    voices = await VoicesManager.create()
    voice_male = [v for v in voices.voices if v['Gender'] == 'Male' and v['Locale'].startswith("en")]
    voice_female = [v for v in voices.voices if v['Gender'] == 'Female' and v['Locale'].startswith("en")]
    MALE_VOICE = random.choice(voice_male)['Name'] if voice_male else "en-US-GuyNeural"
    FEMALE_VOICE = random.choice(voice_female)['Name'] if voice_female else "en-US-JennyNeural"
    return MALE_VOICE, FEMALE_VOICE

# Pick the voices once at startup (nest_asyncio makes asyncio.run safe in notebooks)
MALE_VOICE, FEMALE_VOICE = asyncio.run(get_english_voices())
# Prosody settings (edge-tts expects signed percent / Hz strings)
rate_male = -12
pitch_male = -10
rate_female = -15
pitch_female = 5
rate_male_str = f"{rate_male:+d}%"
pitch_male_str = f"{pitch_male:+d}Hz"
rate_female_str = f"{rate_female:+d}%"
pitch_female_str = f"{pitch_female:+d}Hz"
KEY_TERMS = [
    "model", "propose", "architecture", "performance", "accuracy", "experiment",
    "framework", "design", "method", "network", "approach", "outperform",
    "layer", "training", "results", "learning", "evaluate", "baseline",
    "precision", "recall", "f1", "error", "metric", "loss", "time", "weight", "speed",
]
def split_sentences(text):
    # Split on sentence-ending punctuation followed by whitespace
    return re.split(r'(?<=[.!?])\s+', text.strip())
def extract_sections_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
    full_text = re.sub(r'\n+', '\n', full_text)
    print("Original text", full_text)
    # Section names double as prompt instructions for the script generator
    section_patterns = {
        "Start of the podcast, covering the first section of the paper (the abstract)": r"\babstract\b",
        "Second section, moving from the abstract to the overview; no need for host and guest to introduce themselves again, just continue the discussion": r"\bintroduction\b",
        "Third section, moving from the overview to the methodology; no need for host and guest to introduce themselves again, just continue the discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
        "Fourth and final section, moving from the methodology to the conclusion; no new introductions, and since this ends the conversation, wrap up with closing thank-you remarks": r"\bconclusion(?:s)?\b|\bsummary\b|\bfinal thoughts\b|\bresult(?:s)?\b",
    }
    sections = {}
    matches = []
    for name, pattern in section_patterns.items():
        match = re.search(pattern, full_text, re.IGNORECASE | re.MULTILINE)
        if match:
            matches.append((match.start(), match.end(), name))
    matches.sort()
    for i, (start, end, name) in enumerate(matches):
        section_start = end
        section_end = matches[i + 1][0] if i + 1 < len(matches) else len(full_text)
        section_text = full_text[section_start:section_end].strip()
        # Keep up to 4 paragraphs of each section
        sections[name] = extract_paragraphs(section_text, max_paragraphs=4)
    return sections, section_patterns
def extract_paragraphs(text, max_paragraphs=4):
    # Use double newlines as paragraph breaks if present
    if "\n\n" in text:
        paras = text.split("\n\n")
    else:
        # Otherwise, group every 4 lines into one paragraph
        lines = text.splitlines()
        paras = ['\n'.join(lines[i:i + 4]) for i in range(0, len(lines), 4)]
    return "\n\n".join(paras[:max_paragraphs])
def summarize_section_by_heuristics(text, max_sentences=5):
    sentences = split_sentences(text)
    if len(sentences) <= max_sentences:
        return text
    scored = []
    for idx, sent in enumerate(sentences):
        score = 0
        lower_sent = sent.lower()
        words = lower_sent.split()
        # Keyword match
        score += sum(1 for word in words if word in KEY_TERMS)
        # Give more weight to sentences with numbers (e.g. 85%, 0.97, etc.)
        if re.search(r'\b\d+(\.\d+)?%?\b', sent):  # captures decimals, integers, percentages
            score += 2
        # Reward short, information-dense sentences
        if 10 < len(words) < 50:
            score += 1
        # Sentence position (early sentences are usually summary-like)
        if idx in (0, 1):
            score += 1
        scored.append((score, sent))
    # Pick the top-scoring sentences, then restore their original order
    top_sentences = heapq.nlargest(max_sentences, scored)
    top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
    return " ".join(top_sentences)
def generate_podcast_script(section_name, section_text):
    user_prompt = f"""
You are hosting a podcast episode where two characters are having a detailed conversation about a research paper section.
Characters:
- Host: A curious and articulate individual who has read the research paper. The host asks thoughtful questions, adds light commentary, and tries to simplify the topic for listeners.
- Guest: The primary **researcher** or **author** of the paper. The guest explains the section in detail, offering technical insights, motivations, and clarifications.
Goal:
Create a **friendly, engaging, and informative** podcast-style conversation (8–10 sentences total) between the **Host** and **Guest**, focused on the section: **{section_name}**.
Section Content:
\"\"\"
{section_text}
\"\"\"
Format:
Host: ...
Guest: ...
"""
    messages = [{"role": "user", "content": user_prompt}]
    response = generator(messages, max_new_tokens=350, do_sample=True, temperature=0.7)
    return response[0]["generated_text"]
async def generate_voice_line(text, voice, filename, rate="+0%", pitch="+0Hz"):
    # edge-tts applies prosody via constructor arguments, not attributes set afterwards
    communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
    await communicate.save(filename)
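# Illustrative usage only: a single line can be synthesized on its own
# (the text and filename here are hypothetical). Uncomment to try it:
#
#   asyncio.run(generate_voice_line("Welcome to the show!", MALE_VOICE,
#                                   "demo_line.mp3", rate="-12%", pitch="-10Hz"))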
async def tts_edge_line_by_line(script):
    lines = script.split('\n')
    segments = []
    tasks = []
    filenames = []
    # Prepare one TTS task per Host/Guest line
    for line in lines:
        if 'Host:' in line or 'Guest:' in line:
            speaker, content = line.split(':', 1)
            speaker = speaker.strip().lower()
            voice = MALE_VOICE if speaker == 'host' else FEMALE_VOICE
            pitch_str = pitch_male_str if speaker == 'host' else pitch_female_str
            rate_str = rate_male_str if speaker == 'host' else rate_female_str
            filename = f"segment_{uuid.uuid4().hex}.mp3"
            filenames.append(filename)
            tasks.append(generate_voice_line(content.strip(), voice, filename, rate=rate_str, pitch=pitch_str))
    # Run all TTS tasks concurrently
    await asyncio.gather(*tasks)
    # Wait briefly to ensure files are flushed to disk (non-blocking sleep)
    await asyncio.sleep(0.3)
    # Load audio files safely, skipping anything missing, empty, or undecodable
    for filename in filenames:
        if not os.path.exists(filename) or os.path.getsize(filename) == 0:
            print(f"⚠️ Skipping corrupt or empty file: {filename}")
            continue
        try:
            segment = AudioSegment.from_mp3(filename)
        except CouldntDecodeError:
            print(f"⚠️ Skipping undecodable file: {filename}")
            continue
        segments.append(segment)
    return segments
def merge_segments(segments, output="podcast_output.mp3"):
    podcast = AudioSegment.empty()
    for segment in segments:
        # Add a short pause between speakers
        podcast += segment + AudioSegment.silent(duration=300)
    podcast.export(output, format="mp3")
    print(f"Podcast saved as {output}")
def process_section(section_summary_pair):
    section, summary = section_summary_pair
    dialogue = generate_podcast_script(section, summary)
    # The chat pipeline returns the full message list; the assistant reply is last
    dialogue_content = dialogue[-1]["content"]
    lines = dialogue_content.split('\n')
    # Keep only Host/Guest lines and strip markdown bold markers
    dialogue_fine = "\n".join([line for line in lines if 'Host:' in line or 'Guest:' in line]).replace("**", "")
    return f"\n\n=== {section.upper()} ===\n{dialogue_fine}\n"
def process_pdf(pdf_file):
    pdf_path = "uploaded_pdf.pdf"
    with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
        outfile.write(infile.read())
    sections, section_patterns = extract_sections_from_pdf(pdf_path)
    print("Extracted sections\n\n\n", sections)
    summarized_sections = {
        name: summarize_section_by_heuristics(content)
        for name, content in sections.items()
    }
    # Reorder the summaries to follow the paper's section order
    reordered_summarized_sections = {}
    for key in section_patterns:
        if key in summarized_sections:  # Skip sections that were not found in the PDF
            reordered_summarized_sections[key] = summarized_sections[key]
    print("Summarized text\n\n\n", reordered_summarized_sections)
    section_summary_pairs = list(reordered_summarized_sections.items())
    # Generate the dialogue for each section in parallel
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(process_section, section_summary_pairs)
    final_script = "".join(results)
    print("Final script\n\n\n", final_script)
    segments = asyncio.run(tts_edge_line_by_line(final_script))
    output_audio_path = "podcast_output.mp3"
    merge_segments(segments, output=output_audio_path)
    os.remove(pdf_path)
    return output_audio_path
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=gr.Audio(label="Generated Podcast Audio"),
    title="PDF to Podcast",
    description="Upload a research paper PDF and get a podcast-style audio summary.",
)
iface.launch(debug=True)