import os
import re
import heapq
import uuid
import asyncio
import edge_tts
import gradio as gr
import nest_asyncio
from PyPDF2 import PdfReader
from pydub import AudioSegment
from transformers import pipeline
import concurrent.futures
from edge_tts import VoicesManager
import random
import time
from pydub.exceptions import CouldntDecodeError

# Apply nested event loop patch for Jupyter/Colab
nest_asyncio.apply()

# Load LLM
generator = pipeline(
    "text-generation",
    model="unsloth/gemma-3-1b-it",
    device_map="cpu",
    max_new_tokens=350,
    do_sample=True,
    temperature=0.7,
)

# Async function to get voices
async def get_english_voices():
    voices = await VoicesManager.create()
    voice_male = [v for v in voices.voices if v['Gender'] == 'Male' and v['Locale'].startswith("en")]
    voice_female = [v for v in voices.voices if v['Gender'] == 'Female' and v['Locale'].startswith("en")]

    MALE_VOICE = random.choice(voice_male)['Name'] if voice_male else "en-US-GuyNeural"
    FEMALE_VOICE = random.choice(voice_female)['Name'] if voice_female else "en-US-JennyNeural"
    return MALE_VOICE, FEMALE_VOICE

# Example usage (you must call this within async context or with asyncio.run)
MALE_VOICE, FEMALE_VOICE = asyncio.run(get_english_voices())
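# asyncio.run() works here, even inside Jupyter/Colab, because nest_asyncio.apply() was called above.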

rate_male = -12
pitch_male = -10
pitch_female = 5
rate_female = -15
rate_female_str = f"{rate_female:+d}%"
pitch_female_str = f"{pitch_female:+d}Hz"
rate_male_str = f"{rate_male:+d}%"
pitch_male_str = f"{pitch_male:+d}Hz"

KEY_TERMS = [
    "model", "propose", "architecture", "performance", "accuracy", "experiment",
    "framework", "design", "method", "network", "approach", "outperform",
    "layer", "training", "results", "learning", "evaluate", "baseline",
    "precision", "recall", "f1", "error", "metric", "loss", "time", "weight", "speed"
]
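# Sentences containing these terms score higher in summarize_section_by_heuristics(),
# making them more likely to survive into each section's summary.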

def split_sentences(text):
    return re.split(r'(?<=[.!?])\s+', text.strip())

def extract_sections_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    full_text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
    full_text = re.sub(r'\n+', '\n', full_text)

    print("orignial text", full_text)

    section_patterns = {
        "Start of podcast with first section of paper as abstract": r"\babstract\b",
        "second section continuing from abstract to Overview and no required to start introductuion between host & guest directly continue in discussion": r"\bintroduction\b",
        "third section continuing from Overview to methodology and no required to start introductuion between host & guest directly continue in discussion": r"\b(method(?:ology)?|proposed method|approach|model architecture|architecture|experimental setup|network design|implementation details|techniques|framework|learning algorithm|system description)\b",
        "fourth and the last section continuing from methodology to conclusion and no required to start introductuion between host & guest directly continue in discussion and this is the end of conversation so conclude add thank remarks": r"\bconclusion(?:s)?\b|\bsummary\b|final thoughts\b|result(?:s)?",
    }
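    # Note: these dictionary keys double as the section labels passed to generate_podcast_script(),
    # so they act as stage directions for how each podcast segment should flow.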
  

    sections = {}
    matches = []
    for name, pattern in section_patterns.items():
        match = re.search(pattern, full_text, re.IGNORECASE| re.MULTILINE)
        if match:
            matches.append((match.start(), match.end(), name))

    matches.sort()
    for i, (start, end, name) in enumerate(matches):
        section_start = end
        section_end = matches[i + 1][0] if i + 1 < len(matches) else len(full_text)
        section_text = full_text[section_start:section_end].strip()
        
        # Keep up to four paragraphs (based on double newlines)
        sections[name] = extract_paragraphs(section_text, max_paragraphs=4)

    return sections, section_patterns

def extract_paragraphs(text, max_paragraphs=4):
    # Use double newlines if present
    if "\n\n" in text:
        paras = text.split("\n\n")
    else:
        # If no clear paragraphs, group every 4 lines as one paragraph
        lines = text.splitlines()
        paras = ['\n'.join(lines[i:i+4]) for i in range(0, len(lines), 4)]

    return "\n\n".join(paras[:max_paragraphs])
def summarize_section_by_heuristics(text, max_sentences=5):
    sentences = split_sentences(text)
    if len(sentences) <= max_sentences:
        return text

    scored = []
    for idx, sent in enumerate(sentences):
        score = 0
        lower_sent = sent.lower()
        words = lower_sent.split()

        # Keyword match
        score += sum(1 for word in words if word in KEY_TERMS)

        # Give more weight to sentences with numbers (e.g. 85%, 0.97, etc.)
        if re.search(r'\b\d+(\.\d+)?%?\b', sent):  # captures decimals, integers, percentages
            score += 2

        # Short, information-dense sentences
        if 10 < len(words) < 50:
            score += 1

        # Sentence position (early sentences are usually summary-like)
        if idx in [0, 1]:
            score += 1

        scored.append((score, sent))

    # Pick top sentences, preserving original order
    top_sentences = heapq.nlargest(max_sentences, scored)
    top_sentences = [s for _, s in sorted(top_sentences, key=lambda x: sentences.index(x[1]))]
    return " ".join(top_sentences)

def generate_podcast_script(section_name, section_text):
    user_prompt = f"""
        You are hosting a podcast episode where two characters are having a detailed conversation about a research paper section.
        
        Characters:
        - Host: A curious and articulate individual who has read the research paper. The host asks thoughtful questions, adds light commentary, and tries to simplify the topic for listeners.
        - Guest: The primary **researcher** or **author** of the paper. The guest explains the section in detail, offering technical insights, motivations, and clarifications.
        
        Goal:
        Create a **friendly, engaging, and informative** podcast-style conversation (8–10 sentences total) between the **Host** and **Guest**, focused on the section: **{section_name}**.
        
        Section Content:
        \"\"\"
        {section_text}
        \"\"\"
        
        Format:
        Host: ...
        Guest: ...
        """

    messages = [{"role": "user", "content": user_prompt}]
    response = generator(messages, max_new_tokens=350, do_sample=True, temperature=0.7)
    return response[0]["generated_text"]

async def generate_voice_line(text, voice, filename, rate="+0%", pitch="+0Hz"):
    # Rate and pitch must be passed to the constructor; attributes set afterwards are ignored.
    communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)
    await communicate.save(filename)


async def tts_edge_line_by_line(script):
    lines = script.split('\n')
    segments = []
    tasks = []
    filenames = []

    # Prepare all tasks
    for i, line in enumerate(lines):
        if 'Host:' in line or 'Guest:' in line:
            speaker, content = line.split(':', 1)
            speaker = speaker.strip().lower()

            voice = MALE_VOICE if speaker == 'host' else FEMALE_VOICE
            pitch_str = pitch_male_str if speaker == 'host' else pitch_female_str
            rate_str = rate_male_str if speaker == 'host' else rate_female_str

            filename = f"segment_{uuid.uuid4().hex}.mp3"
            filenames.append(filename)
            tasks.append(generate_voice_line(content.strip(), voice, filename, rate=rate_str, pitch=pitch_str))

    # Run all TTS tasks
    await asyncio.gather(*tasks)

    # Wait briefly to ensure files are flushed to disk (non-blocking inside the event loop)
    await asyncio.sleep(0.3)

    # Load audio files safely
    for filename in filenames:
        if not os.path.exists(filename) or os.path.getsize(filename) == 0:
            print(f"⚠️ Skipping corrupt or empty file: {filename}")
            continue

        try:
            segment = AudioSegment.from_mp3(filename)
        except CouldntDecodeError:
            print(f"⚠️ Skipping undecodable file: {filename}")
            continue
        segments.append(segment)

    return segments

def merge_segments(segments, output="podcast_output.mp3"):
    podcast = AudioSegment.empty()
    for segment in segments:
        podcast += segment + AudioSegment.silent(duration=300)
    podcast.export(output, format="mp3")
    print(f"Podcast saved as {output}")

def process_section(section_summary_pair):
    section, summary = section_summary_pair
    dialogue = generate_podcast_script(section, summary)
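    # The chat pipeline returns the full message list in "generated_text";
    # index 0 is the user prompt and index 1 is the assistant's reply.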
    dialogue_content = dialogue[1]["content"]
    lines = dialogue_content.split('\n')
    dialogue_fine = "\n".join([line for line in lines if 'Host:' in line or 'Guest:' in line]).replace("**", "")
    return f"\n\n=== {section.upper()} ===\n{dialogue_fine}\n"

def process_pdf(pdf_file):
    pdf_path = "uploaded_pdf.pdf"
    with open(pdf_file.name, "rb") as infile, open(pdf_path, "wb") as outfile:
        outfile.write(infile.read())

    sections, section_patterns = extract_sections_from_pdf(pdf_path)

    print("Original text extrated \n\n\n",sections)
    summarized_sections = {
        name: summarize_section_by_heuristics(content)
        for name, content in sections.items()
    }
    reordered_summarized_sections = {}
    for key in section_patterns:
        if key in summarized_sections:  # Ensure the key exists in summarized_sections
            reordered_summarized_sections[key] = summarized_sections[key]
    
    print("Summarized text\n\n\n", reordered_summarized_sections)
    section_summary_pairs = list(reordered_summarized_sections.items())
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(process_section, section_summary_pairs)

    final_script = "".join(results)
    print("Script final taken \n\n\n",final_script)
    segments = asyncio.run(tts_edge_line_by_line(final_script))
    output_audio_path = "podcast_output.mp3"
    merge_segments(segments, output=output_audio_path)

    os.remove(pdf_path)
    return output_audio_path

iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload a PDF file"),
    outputs=gr.Audio(label="Generated Podcast Audio"),
    title="PDF to Podcast",
    description="Upload a Research Paper PDF and get a podcast-style audio summary."
)

iface.launch(debug=True)