Rausda6 commited on
Commit
4b51c12
Β·
verified Β·
1 Parent(s): 413618e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +467 -192
app.py CHANGED
@@ -10,8 +10,11 @@ import time
10
  import mimetypes
11
  import torch
12
  import re
13
- from typing import List, Dict
14
  from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 
 
 
15
 
16
  # Constants
17
  MAX_FILE_SIZE_MB = 20
@@ -19,36 +22,77 @@ MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
19
 
20
  MODEL_ID = "unsloth/gemma-3-1b-pt"
21
 
22
- # Initialize model with proper error handling
23
- try:
24
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
25
- if tokenizer.pad_token is None:
26
- tokenizer.pad_token = tokenizer.eos_token
27
-
28
- model = AutoModelForCausalLM.from_pretrained(
29
- MODEL_ID,
30
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
31
- device_map="auto",
32
- trust_remote_code=True
33
- ).eval()
34
-
35
- # Configure generation parameters
36
- generation_config = GenerationConfig(
37
- max_new_tokens=1024,
38
- temperature=0.7,
39
- top_p=0.9,
40
- do_sample=True,
41
- pad_token_id=tokenizer.pad_token_id,
42
- eos_token_id=tokenizer.eos_token_id,
43
- )
44
-
45
- print(f"Model loaded successfully on device: {model.device}")
46
 
47
- except Exception as e:
48
- print(f"Model initialization error: {e}")
49
- model = None
50
- tokenizer = None
51
- generation_config = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  class PodcastGenerator:
54
  def __init__(self):
@@ -56,263 +100,471 @@ class PodcastGenerator:
56
  self.tokenizer = tokenizer
57
  self.generation_config = generation_config
58
 
59
- def extract_json_from_text(self, text: str) -> Dict:
60
- """Extract JSON from model output using regex patterns"""
61
- # Remove the input prompt from the output
62
- # Look for JSON-like structures
63
- json_patterns = [
64
- r'\{[^{}]*"topic"[^{}]*"podcast"[^{}]*\[.*?\]\s*\}',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  r'\{.*?"topic".*?"podcast".*?\[.*?\].*?\}',
 
 
66
  ]
67
 
68
- for pattern in json_patterns:
 
69
  matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
 
70
  for match in matches:
71
  try:
72
- # Clean up the match
73
- cleaned_match = match.strip()
74
- return json.loads(cleaned_match)
75
- except json.JSONDecodeError:
 
 
 
 
 
 
 
 
 
 
 
76
  continue
77
 
78
- # If no valid JSON found, create a fallback structure
79
  return self.create_fallback_podcast(text)
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  def create_fallback_podcast(self, text: str) -> Dict:
82
- """Create a basic podcast structure when JSON parsing fails"""
83
- # Extract meaningful sentences from the text
84
- sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 10]
 
 
85
 
86
  if not sentences:
87
- sentences = ["Let's discuss this interesting topic.", "That's a great point to consider."]
 
 
 
 
 
 
 
 
 
88
 
 
89
  podcast_lines = []
90
- for i, sentence in enumerate(sentences[:10]): # Limit to 10 exchanges
91
  speaker = (i % 2) + 1
 
92
  podcast_lines.append({
93
  "speaker": speaker,
94
- "line": sentence + "." if not sentence.endswith('.') else sentence
95
  })
96
 
97
- return {
98
  "topic": "Generated Discussion",
99
  "podcast": podcast_lines
100
  }
 
 
 
101
 
102
  async def generate_script(self, prompt: str, language: str, file_obj=None, progress=None) -> Dict:
103
- if not self.model or not self.tokenizer:
104
- raise Exception("Model not properly initialized. Please check model loading.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
 
106
  example_json = {
107
- "topic": "AGI",
108
  "podcast": [
109
- {"speaker": 1, "line": "So, AGI, huh? Seems like everyone's talking about it these days."},
110
- {"speaker": 2, "line": "Yeah, it's definitely having a moment, isn't it?"},
111
- {"speaker": 1, "line": "It really is. What got you hooked on this topic?"},
112
- {"speaker": 2, "line": "The potential implications are fascinating and concerning at the same time."}
113
  ]
114
  }
115
 
116
- if language == "Auto Detect":
117
- language_instruction = "Use the same language as the input text"
118
- else:
119
- language_instruction = f"Generate the podcast in {language} language"
120
 
121
- # Simplified, more direct prompt
122
- system_prompt = f"""Generate a podcast script as valid JSON. {language_instruction}.
123
 
124
  Requirements:
125
  - Exactly 2 speakers (speaker 1 and 2)
126
- - Natural, engaging conversation
127
- - JSON format only
 
128
 
129
- Example format:
130
- {json.dumps(example_json, indent=2)}
131
 
132
- Input topic: {prompt}
133
 
134
- Generate JSON:"""
135
 
136
  try:
137
  if progress:
138
- progress(0.3, "Generating podcast script...")
139
 
140
- # Tokenize with proper attention mask
 
 
141
  inputs = self.tokenizer(
142
- system_prompt,
143
- return_tensors="pt",
144
- padding=True,
145
  truncation=True,
146
- max_length=2048
147
  )
 
 
148
  inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
 
149
 
150
- # Generate with timeout
 
 
151
  with torch.no_grad():
152
- output = self.model.generate(
 
 
153
  **inputs,
154
  generation_config=self.generation_config,
155
  pad_token_id=self.tokenizer.pad_token_id,
 
 
156
  )
157
- logs.append("βœ… calling llm")
158
- # Decode only the new tokens
 
 
159
  generated_text = self.tokenizer.decode(
160
- output[0][inputs['input_ids'].shape[1]:],
161
- skip_special_tokens=True
 
162
  )
163
- logs.append("βœ… generated text")
164
- print(f"Generated text: {generated_text[:500]}...")
 
165
 
166
  if progress:
167
- progress(0.4, "Processing generated script...")
168
 
169
- # Extract JSON from the generated text
170
- result = self.extract_json_from_text(generated_text)
171
 
172
  if progress:
173
- progress(0.5, "Script generated successfully!")
174
 
 
175
  return result
176
 
177
  except Exception as e:
178
- print(f"Generation error: {e}")
179
- # Return fallback podcast
180
- return {
181
- "topic": prompt or "Discussion",
182
- "podcast": [
183
- {"speaker": 1, "line": f"Welcome to our discussion about {prompt or 'this topic'}."},
184
- {"speaker": 2, "line": "Thanks for having me. This is indeed an interesting subject."},
185
- {"speaker": 1, "line": "Let's dive into the key points and explore different perspectives."},
186
- {"speaker": 2, "line": "Absolutely. There's a lot to unpack here."},
187
- {"speaker": 1, "line": "What aspects do you find most compelling?"},
188
- {"speaker": 2, "line": "The implications and potential applications are fascinating."},
189
- {"speaker": 1, "line": "That's a great point. Thanks for the insightful discussion."},
190
- {"speaker": 2, "line": "Thank you. This has been a valuable conversation."}
191
- ]
192
- }
193
 
194
  async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
195
- """Generate TTS audio with improved error handling"""
196
  voice = speaker1 if speaker == 1 else speaker2
197
- speech = edge_tts.Communicate(text, voice)
198
 
199
- temp_filename = f"temp_audio_{uuid.uuid4()}.wav"
 
 
 
 
 
 
 
 
200
  max_retries = 3
201
 
202
  for attempt in range(max_retries):
203
  try:
204
- await asyncio.wait_for(speech.save(temp_filename), timeout=30)
205
- if os.path.exists(temp_filename) and os.path.getsize(temp_filename) > 0:
 
 
 
 
 
 
 
 
 
 
 
206
  return temp_filename
207
  else:
208
- raise Exception("Generated audio file is empty")
 
209
  except asyncio.TimeoutError:
 
210
  if os.path.exists(temp_filename):
211
  os.remove(temp_filename)
212
  if attempt == max_retries - 1:
213
  raise Exception("TTS generation timed out after multiple attempts")
214
- await asyncio.sleep(1) # Brief delay before retry
 
215
  except Exception as e:
 
216
  if os.path.exists(temp_filename):
217
  os.remove(temp_filename)
218
  if attempt == max_retries - 1:
219
- raise Exception(f"TTS generation failed: {str(e)}")
220
- await asyncio.sleep(1)
221
 
222
  async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
223
- """Combine audio files with silence padding"""
224
  if progress:
225
- progress(0.9, "Combining audio files...")
226
 
 
 
227
  try:
228
  combined_audio = AudioSegment.empty()
229
- silence_padding = AudioSegment.silent(duration=500) # 500ms silence
230
 
231
  for i, audio_file in enumerate(audio_files):
232
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  audio_segment = AudioSegment.from_file(audio_file)
 
 
 
 
 
234
  combined_audio += audio_segment
235
 
236
  # Add silence between speakers (except for the last file)
237
  if i < len(audio_files) - 1:
238
  combined_audio += silence_padding
 
 
239
 
240
  except Exception as e:
241
- print(f"Warning: Could not process audio file {audio_file}: {e}")
 
242
  finally:
243
  # Clean up temporary file
244
- if os.path.exists(audio_file):
245
- os.remove(audio_file)
 
 
 
 
246
 
247
  if len(combined_audio) == 0:
248
- raise Exception("No audio content generated")
249
 
250
- output_filename = f"podcast_output_{uuid.uuid4()}.wav"
 
 
 
251
  combined_audio.export(output_filename, format="wav")
252
 
 
 
 
 
 
253
  if progress:
254
- progress(1.0, "Podcast generated successfully!")
255
 
256
  return output_filename
257
 
258
  except Exception as e:
 
 
 
259
  # Clean up any remaining temp files
260
  for audio_file in audio_files:
261
- if os.path.exists(audio_file):
262
- os.remove(audio_file)
263
- raise Exception(f"Audio combination failed: {str(e)}")
 
 
 
 
264
 
265
  async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, file_obj=None, progress=None) -> str:
266
- """Main podcast generation pipeline with improved error handling"""
 
 
 
267
  try:
268
  if progress:
269
- progress(0.1, "Starting podcast generation...")
270
 
271
  # Generate script
 
272
  podcast_json = await self.generate_script(input_text, language, file_obj, progress)
273
 
274
- if not podcast_json.get('podcast'):
275
- raise Exception("No podcast content generated")
276
 
277
-
278
- logs.append("βœ… process input 1")
279
 
280
  if progress:
281
- progress(0.5, "Converting text to speech...")
282
 
283
- # Generate TTS with sequential processing to avoid overload
284
  audio_files = []
285
  total_lines = len(podcast_json['podcast'])
 
286
 
287
  for i, item in enumerate(podcast_json['podcast']):
288
  try:
 
 
289
  audio_file = await self.tts_generate(
290
  item['line'],
291
  item['speaker'],
292
  speaker1,
293
  speaker2
294
  )
 
295
  audio_files.append(audio_file)
 
296
 
297
  # Update progress
298
  if progress:
299
  current_progress = 0.5 + (0.4 * (i + 1) / total_lines)
300
- progress(current_progress, f"Generated speech {i + 1}/{total_lines}")
301
 
302
  except Exception as e:
303
- print(f"TTS error for line {i}: {e}")
304
- # Continue with remaining lines
305
  continue
306
 
307
  if not audio_files:
308
- raise Exception("No audio files generated successfully")
 
 
 
 
 
309
 
310
  # Combine audio files
311
  combined_audio = await self.combine_audio_files(audio_files, progress)
 
 
 
 
312
  return combined_audio
313
 
314
  except Exception as e:
315
- raise Exception(f"Podcast generation failed: {str(e)}")
 
 
 
 
316
 
317
  # Voice mapping
318
  VOICE_MAPPING = {
@@ -327,72 +579,107 @@ VOICE_MAPPING = {
327
  }
328
 
329
  async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, progress=None) -> str:
330
- """Process input and generate podcast"""
331
- start_time = time.time()
332
-
 
 
333
  try:
334
  if progress:
335
- progress(0.05, "Processing input...")
336
 
337
  # Map speaker names to voice IDs
338
  speaker1_voice = VOICE_MAPPING.get(speaker1, "en-US-AndrewMultilingualNeural")
339
  speaker2_voice = VOICE_MAPPING.get(speaker2, "en-US-AvaMultilingualNeural")
340
- logs.append("βœ… process input 1")
 
 
 
341
  # Validate input
342
  if not input_text or input_text.strip() == "":
343
  if input_file is None:
344
- raise Exception("Please provide either text input or upload a file")
345
- # TODO: Add file processing logic here if needed
 
 
 
 
 
 
 
 
 
346
 
347
  podcast_generator = PodcastGenerator()
348
  result = await podcast_generator.generate_podcast(
349
  input_text, language, speaker1_voice, speaker2_voice, input_file, progress
350
  )
351
 
352
- logs.append("βœ… process input 2")
353
- end_time = time.time()
354
- print(f"Total generation time: {end_time - start_time:.2f} seconds")
355
  return result
356
 
357
  except Exception as e:
358
- error_msg = str(e)
359
- print(f"Processing error: {error_msg}")
360
- raise Exception(f"Generation failed: {error_msg}")
 
361
 
362
  def generate_podcast_gradio(input_text, input_file, language, speaker1, speaker2):
 
 
 
 
363
  try:
 
 
 
364
  # Validate inputs
365
  if not input_text and input_file is None:
366
- return None
 
367
 
368
  if input_text and len(input_text.strip()) == 0:
369
  input_text = None
370
- logs.append("βœ… File processing 1")
371
- # Create a simple progress tracker
372
- progress_history = []
373
 
374
-
375
  def progress_callback(value, text):
376
- progress_history.append(f"{value:.1%}: {text}")
377
- print(f"Progress: {value:.1%} - {text}")
378
- logs.append("βœ… File processing 2")
379
- # Run the async function
380
- loop = asyncio.new_event_loop()
381
- asyncio.set_event_loop(loop)
382
  try:
383
- result = loop.run_until_complete(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  process_input(input_text, input_file, language, speaker1, speaker2, progress_callback)
385
  )
386
- return result
387
- finally:
388
- loop.close()
389
 
 
 
 
390
  except Exception as e:
391
- print(f"Gradio function error: {e}")
392
- raise gr.Error(f"Failed to generate podcast: {str(e)}")
 
 
393
 
394
  def create_interface():
395
- """Create the Gradio interface with proper component configuration"""
396
  language_options = [
397
  "Auto Detect", "English", "German", "French", "Spanish", "Italian",
398
  "Portuguese", "Dutch", "Russian", "Chinese", "Japanese", "Korean"
@@ -409,6 +696,12 @@ def create_interface():
409
  gr.Markdown("# πŸŽ™οΈ PodcastGen 2")
410
  gr.Markdown("Generate professional 2-speaker podcasts from text input!")
411
 
 
 
 
 
 
 
412
  with gr.Row():
413
  with gr.Column(scale=2):
414
  input_text = gr.Textbox(
@@ -423,7 +716,7 @@ def create_interface():
423
  label="Upload File (Optional)",
424
  file_types=[".pdf", ".txt"],
425
  type="filepath",
426
- #info=f"Max size: {MAX_FILE_SIZE_MB}MB"
427
  )
428
 
429
  with gr.Row():
@@ -449,13 +742,15 @@ def create_interface():
449
  generate_btn = gr.Button(
450
  "πŸŽ™οΈ Generate Podcast",
451
  variant="primary",
452
- size="lg"
 
453
  )
454
 
455
  log_output = gr.Textbox(
456
  label="πŸͺ΅ Debug & Transcript Log",
457
  lines=15,
458
- interactive=False
 
459
  )
460
 
461
  output_audio = gr.Audio(
@@ -469,32 +764,12 @@ def create_interface():
469
  generate_btn.click(
470
  fn=generate_podcast_gradio,
471
  inputs=[input_text, input_file, language, speaker1, speaker2],
472
- outputs=[output_audio],
473
  show_progress=True
474
  )
475
 
476
  # Add usage instructions
477
- with gr.Accordion("Usage Instructions", open=False):
478
  gr.Markdown("""
479
  ### How to use:
480
- 1. **Input**: Enter your topic or text in the text box, or upload a PDF/TXT file
481
- 2. **Language**: Choose the output language (Auto Detect recommended)
482
- 3. **Voices**: Select different voices for Speaker 1 and Speaker 2
483
- 4. **Generate**: Click the button and wait for processing
484
-
485
- ### Tips:
486
- - Provide clear, specific topics for better results
487
- - The AI will create a natural conversation between two speakers
488
- - Generation may take 1-3 minutes depending on text length
489
- """)
490
-
491
- return demo
492
-
493
- if __name__ == "__main__":
494
- demo = create_interface()
495
- demo.launch(
496
- server_name="0.0.0.0",
497
- server_port=7860,
498
- show_error=True,
499
- share=False
500
- )
 
10
  import mimetypes
11
  import torch
12
  import re
13
+ from typing import List, Dict, Optional
14
  from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
15
+ import PyPDF2
16
+ import io
17
+ import traceback
18
 
19
  # Constants
20
  MAX_FILE_SIZE_MB = 20
 
22
 
23
  MODEL_ID = "unsloth/gemma-3-1b-pt"
24
 
25
+ # Global logging system - CRITICAL FIX #1
26
+ logs = []
27
+
28
+ def add_log(message):
29
+ """Thread-safe logging function"""
30
+ logs.append(f"[{time.strftime('%H:%M:%S')}] {message}")
31
+ print(message)
32
+
33
+ # Initialize model with comprehensive error handling - CRITICAL FIX #2
34
+ model = None
35
+ tokenizer = None
36
+ generation_config = None
37
+
38
+ def initialize_model():
39
+ """Separate model initialization with better error handling"""
40
+ global model, tokenizer, generation_config
 
 
 
 
 
 
 
 
41
 
42
+ try:
43
+ add_log("πŸ”„ Initializing model...")
44
+
45
+ tokenizer = AutoTokenizer.from_pretrained(
46
+ MODEL_ID,
47
+ trust_remote_code=True,
48
+ use_fast=False # Sometimes fast tokenizers cause issues
49
+ )
50
+
51
+ # Ensure proper padding token
52
+ if tokenizer.pad_token is None:
53
+ tokenizer.pad_token = tokenizer.eos_token
54
+ add_log("βœ… Set pad_token to eos_token")
55
+
56
+ # Load model with proper device management
57
+ device = "cuda" if torch.cuda.is_available() else "cpu"
58
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
59
+
60
+ model = AutoModelForCausalLM.from_pretrained(
61
+ MODEL_ID,
62
+ torch_dtype=torch_dtype,
63
+ device_map="auto" if torch.cuda.is_available() else None,
64
+ trust_remote_code=True,
65
+ low_cpu_mem_usage=True
66
+ )
67
+
68
+ if not torch.cuda.is_available():
69
+ model = model.to(device)
70
+
71
+ model.eval()
72
+
73
+ # Configure generation parameters
74
+ generation_config = GenerationConfig(
75
+ max_new_tokens=512, # Reduced for stability
76
+ temperature=0.7,
77
+ top_p=0.9,
78
+ do_sample=True,
79
+ pad_token_id=tokenizer.pad_token_id,
80
+ eos_token_id=tokenizer.eos_token_id,
81
+ repetition_penalty=1.1,
82
+ length_penalty=1.0
83
+ )
84
+
85
+ add_log(f"βœ… Model loaded successfully on device: {model.device}")
86
+ return True
87
+
88
+ except Exception as e:
89
+ error_msg = f"❌ Model initialization failed: {str(e)}"
90
+ add_log(error_msg)
91
+ add_log(f"Traceback: {traceback.format_exc()}")
92
+ return False
93
+
94
+ # Initialize model at startup
95
+ model_loaded = initialize_model()
96
 
97
  class PodcastGenerator:
98
  def __init__(self):
 
100
  self.tokenizer = tokenizer
101
  self.generation_config = generation_config
102
 
103
+ def extract_text_from_pdf(self, file_path: str) -> str:
104
+ """Extract text from PDF file - CRITICAL FIX #3"""
105
+ try:
106
+ add_log(f"πŸ“– Extracting text from PDF: {file_path}")
107
+
108
+ with open(file_path, 'rb') as file:
109
+ pdf_reader = PyPDF2.PdfReader(file)
110
+ text = ""
111
+
112
+ for page_num, page in enumerate(pdf_reader.pages):
113
+ try:
114
+ page_text = page.extract_text()
115
+ text += page_text + "\n"
116
+ add_log(f"βœ… Extracted page {page_num + 1}")
117
+ except Exception as e:
118
+ add_log(f"⚠️ Failed to extract page {page_num + 1}: {e}")
119
+ continue
120
+
121
+ if not text.strip():
122
+ raise Exception("No text could be extracted from PDF")
123
+
124
+ add_log(f"βœ… PDF extraction complete. Text length: {len(text)} characters")
125
+ return text.strip()
126
+
127
+ except Exception as e:
128
+ error_msg = f"❌ PDF extraction failed: {str(e)}"
129
+ add_log(error_msg)
130
+ raise Exception(error_msg)
131
+
132
+ def clean_and_validate_json(self, text: str) -> Dict:
133
+ """Improved JSON extraction and validation - CRITICAL FIX #4"""
134
+ add_log("πŸ” Attempting to extract JSON from generated text")
135
+
136
+ # Multiple strategies for JSON extraction
137
+ strategies = [
138
+ # Strategy 1: Look for complete JSON objects
139
+ r'\{[^{}]*"topic"[^{}]*"podcast"[^{}]*\[[^\]]*\][^{}]*\}',
140
+ # Strategy 2: More flexible pattern
141
  r'\{.*?"topic".*?"podcast".*?\[.*?\].*?\}',
142
+ # Strategy 3: Extract content between first { and last }
143
+ r'\{.*\}'
144
  ]
145
 
146
+ for i, pattern in enumerate(strategies):
147
+ add_log(f"🎯 Trying extraction strategy {i+1}")
148
  matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
149
+
150
  for match in matches:
151
  try:
152
+ # Clean the match
153
+ cleaned = match.strip()
154
+ # Fix common JSON issues
155
+ cleaned = re.sub(r',\s*}', '}', cleaned) # Remove trailing commas
156
+ cleaned = re.sub(r',\s*]', ']', cleaned) # Remove trailing commas in arrays
157
+
158
+ parsed = json.loads(cleaned)
159
+
160
+ # Validate structure
161
+ if self.validate_podcast_structure(parsed):
162
+ add_log("βœ… Valid JSON structure found")
163
+ return parsed
164
+
165
+ except json.JSONDecodeError as e:
166
+ add_log(f"⚠️ JSON parse error in strategy {i+1}: {e}")
167
  continue
168
 
169
+ add_log("⚠️ No valid JSON found, creating fallback")
170
  return self.create_fallback_podcast(text)
171
 
172
+ def validate_podcast_structure(self, data: Dict) -> bool:
173
+ """Validate podcast JSON structure"""
174
+ try:
175
+ if not isinstance(data, dict):
176
+ return False
177
+
178
+ if 'topic' not in data or 'podcast' not in data:
179
+ return False
180
+
181
+ if not isinstance(data['podcast'], list):
182
+ return False
183
+
184
+ for item in data['podcast']:
185
+ if not isinstance(item, dict):
186
+ return False
187
+ if 'speaker' not in item or 'line' not in item:
188
+ return False
189
+ if not isinstance(item['speaker'], int) or item['speaker'] not in [1, 2]:
190
+ return False
191
+ if not isinstance(item['line'], str) or len(item['line'].strip()) == 0:
192
+ return False
193
+
194
+ return len(data['podcast']) > 0
195
+
196
+ except Exception:
197
+ return False
198
+
199
  def create_fallback_podcast(self, text: str) -> Dict:
200
+ """Create fallback podcast structure - IMPROVED"""
201
+ add_log("πŸ”§ Creating fallback podcast structure")
202
+
203
+ # Extract meaningful content from the original text
204
+ sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 20]
205
 
206
  if not sentences:
207
+ sentences = [
208
+ "Welcome to our podcast discussion",
209
+ "Today we're exploring an interesting topic",
210
+ "Let's dive into the key points",
211
+ "That's a fascinating perspective",
212
+ "What are your thoughts on this matter",
213
+ "I think there are multiple angles to consider",
214
+ "This is definitely worth exploring further",
215
+ "Thank you for this engaging conversation"
216
+ ]
217
 
218
+ # Create balanced conversation
219
  podcast_lines = []
220
+ for i, sentence in enumerate(sentences[:12]): # Limit to 12 exchanges
221
  speaker = (i % 2) + 1
222
+ line = sentence + "." if not sentence.endswith('.') else sentence
223
  podcast_lines.append({
224
  "speaker": speaker,
225
+ "line": line
226
  })
227
 
228
+ result = {
229
  "topic": "Generated Discussion",
230
  "podcast": podcast_lines
231
  }
232
+
233
+ add_log(f"βœ… Fallback podcast created with {len(podcast_lines)} lines")
234
+ return result
235
 
236
  async def generate_script(self, prompt: str, language: str, file_obj=None, progress=None) -> Dict:
237
+ """Improved script generation with better error handling"""
238
+ if not model_loaded or not self.model or not self.tokenizer:
239
+ raise Exception("❌ Model not properly initialized. Please restart the application.")
240
+
241
+ add_log("🎬 Starting script generation")
242
+
243
+ # Process file if provided - CRITICAL FIX #5
244
+ if file_obj is not None:
245
+ try:
246
+ add_log(f"πŸ“ Processing uploaded file: {file_obj}")
247
+
248
+ if file_obj.endswith('.pdf'):
249
+ extracted_text = self.extract_text_from_pdf(file_obj)
250
+ # Truncate if too long
251
+ if len(extracted_text) > 2000:
252
+ extracted_text = extracted_text[:2000] + "..."
253
+ add_log("βœ‚οΈ Text truncated to 2000 characters")
254
+ prompt = f"Create a podcast discussion about this content: {extracted_text}"
255
+ elif file_obj.endswith('.txt'):
256
+ with open(file_obj, 'r', encoding='utf-8') as f:
257
+ file_content = f.read()
258
+ if len(file_content) > 2000:
259
+ file_content = file_content[:2000] + "..."
260
+ prompt = f"Create a podcast discussion about this content: {file_content}"
261
+
262
+ except Exception as e:
263
+ add_log(f"⚠️ File processing error: {e}")
264
+ # Continue with original prompt
265
 
266
+ # Create focused prompt - CRITICAL FIX #6
267
  example_json = {
268
+ "topic": "AI Technology",
269
  "podcast": [
270
+ {"speaker": 1, "line": "Welcome to our discussion about AI technology."},
271
+ {"speaker": 2, "line": "Thanks for having me. This is such an exciting field."},
272
+ {"speaker": 1, "line": "What aspects of AI do you find most interesting?"},
273
+ {"speaker": 2, "line": "I'm particularly fascinated by machine learning applications."}
274
  ]
275
  }
276
 
277
+ language_instruction = f"Generate in {language}" if language != "Auto Detect" else "Use appropriate language"
 
 
 
278
 
279
+ # Simplified and more reliable prompt
280
+ system_prompt = f"""Create a podcast script in valid JSON format.
281
 
282
  Requirements:
283
  - Exactly 2 speakers (speaker 1 and 2)
284
+ - Natural conversation style
285
+ - 6-8 exchanges total
286
+ - {language_instruction}
287
 
288
+ Example JSON structure:
289
+ {json.dumps(example_json)}
290
 
291
+ Topic: {prompt}
292
 
293
+ JSON:"""
294
 
295
  try:
296
  if progress:
297
+ progress(0.3, "πŸ€– Generating script...")
298
 
299
+ add_log("πŸ”€ Tokenizing input...")
300
+
301
+ # Tokenize with proper handling
302
  inputs = self.tokenizer(
303
+ system_prompt,
304
+ return_tensors="pt",
305
+ padding=True,
306
  truncation=True,
307
+ max_length=1024 # Reduced for stability
308
  )
309
+
310
+ # Move to correct device
311
  inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
312
+ add_log(f"βœ… Inputs moved to device: {self.model.device}")
313
 
314
+ add_log("οΏ½οΏ½οΏ½ Generating with model...")
315
+
316
+ # Generate with timeout and better parameters
317
  with torch.no_grad():
318
+ torch.cuda.empty_cache() if torch.cuda.is_available() else None
319
+
320
+ outputs = self.model.generate(
321
  **inputs,
322
  generation_config=self.generation_config,
323
  pad_token_id=self.tokenizer.pad_token_id,
324
+ attention_mask=inputs.get('attention_mask'),
325
+ use_cache=True
326
  )
327
+
328
+ add_log("βœ… Model generation complete")
329
+
330
+ # Decode only new tokens
331
  generated_text = self.tokenizer.decode(
332
+ outputs[0][inputs['input_ids'].shape[1]:],
333
+ skip_special_tokens=True,
334
+ clean_up_tokenization_spaces=True
335
  )
336
+
337
+ add_log(f"πŸ“ Generated text length: {len(generated_text)} characters")
338
+ add_log(f"πŸ” Generated text preview: {generated_text[:200]}...")
339
 
340
  if progress:
341
+ progress(0.4, "πŸ” Processing generated script...")
342
 
343
+ # Extract and validate JSON
344
+ result = self.clean_and_validate_json(generated_text)
345
 
346
  if progress:
347
+ progress(0.5, "βœ… Script generated successfully!")
348
 
349
+ add_log(f"βœ… Final script has {len(result.get('podcast', []))} lines")
350
  return result
351
 
352
  except Exception as e:
353
+ error_msg = f"❌ Script generation error: {str(e)}"
354
+ add_log(error_msg)
355
+ add_log(f"πŸ” Traceback: {traceback.format_exc()}")
356
+
357
+ # Return robust fallback
358
+ return self.create_fallback_podcast(prompt or "General Discussion")
 
 
 
 
 
 
 
 
 
359
 
360
  async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
361
+ """Improved TTS generation with better error handling - CRITICAL FIX #7"""
362
  voice = speaker1 if speaker == 1 else speaker2
363
+ add_log(f"πŸŽ™οΈ Generating TTS for speaker {speaker} with voice {voice}")
364
 
365
+ # Clean text for TTS
366
+ text = text.strip()
367
+ if not text:
368
+ raise Exception("Empty text for TTS")
369
+
370
+ # Remove problematic characters
371
+ text = re.sub(r'[^\w\s.,!?;:\-\'"()]', '', text)
372
+
373
+ temp_filename = f"temp_audio_{uuid.uuid4().hex[:8]}.wav"
374
  max_retries = 3
375
 
376
  for attempt in range(max_retries):
377
  try:
378
+ add_log(f"🎡 TTS attempt {attempt + 1} for: {text[:50]}...")
379
+
380
+ communicate = edge_tts.Communicate(text, voice)
381
+
382
+ # Use asyncio.wait_for with timeout
383
+ await asyncio.wait_for(
384
+ communicate.save(temp_filename),
385
+ timeout=30.0
386
+ )
387
+
388
+ # Verify file was created and has content
389
+ if os.path.exists(temp_filename) and os.path.getsize(temp_filename) > 1000:
390
+ add_log(f"βœ… TTS successful: {os.path.getsize(temp_filename)} bytes")
391
  return temp_filename
392
  else:
393
+ raise Exception("Generated audio file is too small or empty")
394
+
395
  except asyncio.TimeoutError:
396
+ add_log(f"⏰ TTS timeout on attempt {attempt + 1}")
397
  if os.path.exists(temp_filename):
398
  os.remove(temp_filename)
399
  if attempt == max_retries - 1:
400
  raise Exception("TTS generation timed out after multiple attempts")
401
+ await asyncio.sleep(2)
402
+
403
  except Exception as e:
404
+ add_log(f"❌ TTS error on attempt {attempt + 1}: {str(e)}")
405
  if os.path.exists(temp_filename):
406
  os.remove(temp_filename)
407
  if attempt == max_retries - 1:
408
+ raise Exception(f"TTS generation failed after {max_retries} attempts: {str(e)}")
409
+ await asyncio.sleep(2)
410
 
411
  async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
412
+ """Improved audio combination - CRITICAL FIX #8"""
413
  if progress:
414
+ progress(0.9, "🎡 Combining audio files...")
415
 
416
+ add_log(f"πŸ”— Combining {len(audio_files)} audio files")
417
+
418
  try:
419
  combined_audio = AudioSegment.empty()
420
+ silence_padding = AudioSegment.silent(duration=800) # 800ms silence
421
 
422
  for i, audio_file in enumerate(audio_files):
423
  try:
424
+ add_log(f"πŸ“ Processing audio file {i+1}: {audio_file}")
425
+
426
+ if not os.path.exists(audio_file):
427
+ add_log(f"⚠️ Audio file not found: {audio_file}")
428
+ continue
429
+
430
+ file_size = os.path.getsize(audio_file)
431
+ add_log(f"πŸ“Š File size: {file_size} bytes")
432
+
433
+ if file_size < 1000:
434
+ add_log(f"⚠️ Audio file too small, skipping: {audio_file}")
435
+ continue
436
+
437
  audio_segment = AudioSegment.from_file(audio_file)
438
+
439
+ if len(audio_segment) < 100: # Less than 100ms
440
+ add_log(f"⚠️ Audio segment too short, skipping")
441
+ continue
442
+
443
  combined_audio += audio_segment
444
 
445
  # Add silence between speakers (except for the last file)
446
  if i < len(audio_files) - 1:
447
  combined_audio += silence_padding
448
+
449
+ add_log(f"βœ… Added audio segment {i+1}, total duration: {len(combined_audio)}ms")
450
 
451
  except Exception as e:
452
+ add_log(f"⚠️ Could not process audio file {audio_file}: {e}")
453
+ continue
454
  finally:
455
  # Clean up temporary file
456
+ try:
457
+ if os.path.exists(audio_file):
458
+ os.remove(audio_file)
459
+ add_log(f"πŸ—‘οΈ Cleaned up temp file: {audio_file}")
460
+ except:
461
+ pass
462
 
463
  if len(combined_audio) == 0:
464
+ raise Exception("No valid audio content was generated")
465
 
466
+ if len(combined_audio) < 5000: # Less than 5 seconds
467
+ raise Exception("Combined audio is too short")
468
+
469
+ output_filename = f"podcast_output_{uuid.uuid4().hex[:8]}.wav"
470
  combined_audio.export(output_filename, format="wav")
471
 
472
+ file_size = os.path.getsize(output_filename)
473
+ duration = len(combined_audio) / 1000 # Duration in seconds
474
+
475
+ add_log(f"βœ… Final podcast: {output_filename} ({file_size} bytes, {duration:.1f}s)")
476
+
477
  if progress:
478
+ progress(1.0, "πŸŽ‰ Podcast generated successfully!")
479
 
480
  return output_filename
481
 
482
  except Exception as e:
483
+ error_msg = f"❌ Audio combination failed: {str(e)}"
484
+ add_log(error_msg)
485
+
486
  # Clean up any remaining temp files
487
  for audio_file in audio_files:
488
+ try:
489
+ if os.path.exists(audio_file):
490
+ os.remove(audio_file)
491
+ except:
492
+ pass
493
+
494
+ raise Exception(error_msg)
495
 
496
  async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, file_obj=None, progress=None) -> str:
497
+ """Main podcast generation pipeline - CRITICAL FIX #9"""
498
+ start_time = time.time()
499
+ add_log("🎬 Starting podcast generation pipeline")
500
+
501
  try:
502
  if progress:
503
+ progress(0.1, "πŸš€ Starting podcast generation...")
504
 
505
  # Generate script
506
+ add_log("πŸ“ Generating podcast script...")
507
  podcast_json = await self.generate_script(input_text, language, file_obj, progress)
508
 
509
+ if not podcast_json.get('podcast') or len(podcast_json['podcast']) == 0:
510
+ raise Exception("No podcast content was generated")
511
 
512
+ add_log(f"βœ… Script generated with {len(podcast_json['podcast'])} dialogue lines")
 
513
 
514
  if progress:
515
+ progress(0.5, "πŸŽ™οΈ Converting text to speech...")
516
 
517
+ # Generate TTS with proper error handling
518
  audio_files = []
519
  total_lines = len(podcast_json['podcast'])
520
+ successful_lines = 0
521
 
522
  for i, item in enumerate(podcast_json['podcast']):
523
  try:
524
+ add_log(f"🎡 Processing line {i+1}/{total_lines}: Speaker {item['speaker']}")
525
+
526
  audio_file = await self.tts_generate(
527
  item['line'],
528
  item['speaker'],
529
  speaker1,
530
  speaker2
531
  )
532
+
533
  audio_files.append(audio_file)
534
+ successful_lines += 1
535
 
536
  # Update progress
537
  if progress:
538
  current_progress = 0.5 + (0.4 * (i + 1) / total_lines)
539
+ progress(current_progress, f"πŸŽ™οΈ Generated speech {successful_lines}/{total_lines}")
540
 
541
  except Exception as e:
542
+ add_log(f"❌ TTS failed for line {i+1}: {e}")
543
+ # Continue with remaining lines rather than failing completely
544
  continue
545
 
546
  if not audio_files:
547
+ raise Exception("No audio files were generated successfully")
548
+
549
+ if successful_lines < len(podcast_json['podcast']) / 2:
550
+ add_log(f"⚠️ Warning: Only {successful_lines}/{total_lines} lines processed successfully")
551
+
552
+ add_log(f"βœ… TTS generation complete: {len(audio_files)} audio files")
553
 
554
  # Combine audio files
555
  combined_audio = await self.combine_audio_files(audio_files, progress)
556
+
557
+ elapsed_time = time.time() - start_time
558
+ add_log(f"πŸŽ‰ Podcast generation completed in {elapsed_time:.1f} seconds")
559
+
560
  return combined_audio
561
 
562
  except Exception as e:
563
+ elapsed_time = time.time() - start_time
564
+ error_msg = f"❌ Podcast generation failed after {elapsed_time:.1f}s: {str(e)}"
565
+ add_log(error_msg)
566
+ add_log(f"πŸ” Full traceback: {traceback.format_exc()}")
567
+ raise Exception(error_msg)
568
 
569
  # Voice mapping
570
  VOICE_MAPPING = {
 
579
  }
580
 
581
  async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, progress=None) -> str:
582
+ """Process input and generate podcast - MAIN ENTRY POINT"""
583
+ add_log("=" * 50)
584
+ add_log("🎬 NEW PODCAST GENERATION REQUEST")
585
+ add_log("=" * 50)
586
+
587
  try:
588
  if progress:
589
+ progress(0.05, "πŸ” Processing input...")
590
 
591
  # Map speaker names to voice IDs
592
  speaker1_voice = VOICE_MAPPING.get(speaker1, "en-US-AndrewMultilingualNeural")
593
  speaker2_voice = VOICE_MAPPING.get(speaker2, "en-US-AvaMultilingualNeural")
594
+
595
+ add_log(f"🎭 Speaker 1: {speaker1} -> {speaker1_voice}")
596
+ add_log(f"🎭 Speaker 2: {speaker2} -> {speaker2_voice}")
597
+
598
  # Validate input
599
  if not input_text or input_text.strip() == "":
600
  if input_file is None:
601
+ raise Exception("❌ Please provide either text input or upload a file")
602
+ add_log("πŸ“ No text input provided, will process uploaded file")
603
+ else:
604
+ add_log(f"πŸ“ Text input provided: {len(input_text)} characters")
605
+
606
+ if input_file:
607
+ add_log(f"πŸ“Ž File uploaded: {input_file}")
608
+
609
+ # Check model status
610
+ if not model_loaded:
611
+ raise Exception("❌ Model not loaded. Please restart the application.")
612
 
613
  podcast_generator = PodcastGenerator()
614
  result = await podcast_generator.generate_podcast(
615
  input_text, language, speaker1_voice, speaker2_voice, input_file, progress
616
  )
617
 
618
+ add_log("πŸŽ‰ PODCAST GENERATION COMPLETED SUCCESSFULLY")
 
 
619
  return result
620
 
621
  except Exception as e:
622
+ error_msg = f"❌ CRITICAL ERROR: {str(e)}"
623
+ add_log(error_msg)
624
+ add_log(f"πŸ” Traceback: {traceback.format_exc()}")
625
+ raise Exception(error_msg)
626
 
627
  def generate_podcast_gradio(input_text, input_file, language, speaker1, speaker2):
628
+ """Gradio interface function - CRITICAL FIX #10"""
629
+ global logs
630
+ logs = [] # Reset logs for each generation
631
+
632
  try:
633
+ add_log("🎬 Gradio function called")
634
+ add_log(f"πŸ“‹ Parameters: text={bool(input_text)}, file={bool(input_file)}, lang={language}")
635
+
636
  # Validate inputs
637
  if not input_text and input_file is None:
638
+ add_log("❌ No input provided")
639
+ return None, "\n".join(logs)
640
 
641
  if input_text and len(input_text.strip()) == 0:
642
  input_text = None
 
 
 
643
 
644
+ # Progress tracking
645
  def progress_callback(value, text):
646
+ add_log(f"πŸ“Š Progress: {value:.1%} - {text}")
647
+
648
+ # Create new event loop for this request - CRITICAL FIX
 
 
 
649
  try:
650
+ # Try to get existing loop
651
+ loop = asyncio.get_event_loop()
652
+ if loop.is_running():
653
+ # If loop is running, we need to run in thread
654
+ import concurrent.futures
655
+ with concurrent.futures.ThreadPoolExecutor() as executor:
656
+ future = executor.submit(
657
+ lambda: asyncio.run(
658
+ process_input(input_text, input_file, language, speaker1, speaker2, progress_callback)
659
+ )
660
+ )
661
+ result = future.result(timeout=300) # 5 minute timeout
662
+ else:
663
+ result = loop.run_until_complete(
664
+ process_input(input_text, input_file, language, speaker1, speaker2, progress_callback)
665
+ )
666
+ except RuntimeError:
667
+ # No event loop exists, create new one
668
+ result = asyncio.run(
669
  process_input(input_text, input_file, language, speaker1, speaker2, progress_callback)
670
  )
 
 
 
671
 
672
+ add_log("βœ… Gradio function completed successfully")
673
+ return result, "\n".join(logs)
674
+
675
  except Exception as e:
676
+ error_msg = f"❌ Gradio function error: {str(e)}"
677
+ add_log(error_msg)
678
+ add_log(f"πŸ” Traceback: {traceback.format_exc()}")
679
+ return None, "\n".join(logs)
680
 
681
  def create_interface():
682
+ """Create the Gradio interface"""
683
  language_options = [
684
  "Auto Detect", "English", "German", "French", "Spanish", "Italian",
685
  "Portuguese", "Dutch", "Russian", "Chinese", "Japanese", "Korean"
 
696
  gr.Markdown("# πŸŽ™οΈ PodcastGen 2")
697
  gr.Markdown("Generate professional 2-speaker podcasts from text input!")
698
 
699
+ # Model status indicator
700
+ if model_loaded:
701
+ gr.Markdown("βœ… **Model Status: Ready**")
702
+ else:
703
+ gr.Markdown("❌ **Model Status: Failed to Load**")
704
+
705
  with gr.Row():
706
  with gr.Column(scale=2):
707
  input_text = gr.Textbox(
 
716
  label="Upload File (Optional)",
717
  file_types=[".pdf", ".txt"],
718
  type="filepath",
719
+ info=f"Max size: {MAX_FILE_SIZE_MB}MB"
720
  )
721
 
722
  with gr.Row():
 
742
  generate_btn = gr.Button(
743
  "πŸŽ™οΈ Generate Podcast",
744
  variant="primary",
745
+ size="lg",
746
+ interactive=model_loaded
747
  )
748
 
749
  log_output = gr.Textbox(
750
  label="πŸͺ΅ Debug & Transcript Log",
751
  lines=15,
752
+ interactive=False,
753
+ info="Real-time generation logs and debugging information"
754
  )
755
 
756
  output_audio = gr.Audio(
 
764
  generate_btn.click(
765
  fn=generate_podcast_gradio,
766
  inputs=[input_text, input_file, language, speaker1, speaker2],
767
+ outputs=[output_audio, log_output],
768
  show_progress=True
769
  )
770
 
771
  # Add usage instructions
772
+ with gr.Accordion("Usage Instructions & Troubleshooting", open=False):
773
  gr.Markdown("""
774
  ### How to use:
775
+ 1. **Input**: Enter your topic or text in the text box, or upload a PDF/TXT file