Rausda6 committed
Commit 5c31036 · verified · Parent: bbef331

Update app.py

Files changed (1): app.py (+49 −1)
app.py CHANGED
@@ -149,12 +149,54 @@ class PodcastGenerator:
 
             add_log(f"✅ PDF extraction complete. Text length: {len(text)} characters")
             return text.strip()
-
+
         except Exception as e:
             error_msg = f"❌ PDF extraction failed: {str(e)}"
             add_log(error_msg)
             raise Exception(error_msg)
 
+    async def postprocess_conversation(self, raw_text: str) -> str:
+        """Run LLM again to enforce strict Speaker 1/2 format"""
+        prompt = f"""
+You are a podcast formatter.
+
+Take the following input conversation, and reformat it so that:
+- Every line begins with exactly `Speaker 1:` or `Speaker 2:` (with colon)
+- No timestamps, names, parentheses, or extra formatting
+- No blank lines
+- Do not invent or change the content
+
+Example output:
+Speaker 1: Hello and welcome.
+Speaker 2: Thanks! Glad to be here.
+
+Now format the following:
+{raw_text}
+"""
+
+        inputs = self.tokenizer(
+            prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=2048
+        )
+        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=1024,
+                pad_token_id=self.tokenizer.pad_token_id,
+                eos_token_id=self.tokenizer.eos_token_id
+            )
+
+        formatted = self.tokenizer.decode(
+            outputs[0][inputs['input_ids'].shape[1]:],
+            skip_special_tokens=True
+        )
+        return formatted.strip()
+
+
     def clean_and_validate_json(self, text: str) -> Dict:
         """Improved JSON extraction and validation - CRITICAL FIX #4"""
         add_log("🔍 Attempting to extract JSON from generated text")
@@ -379,6 +421,12 @@ Speaker 2: ...
 
         add_log(f"📝 Generated text length: {len(generated_text)} characters")
         add_log(f"🔍 Generated text preview: {generated_text[:2000]}...")
+
+        formatted_text = await self.postprocess_conversation(generated_text)
+        add_log(f"🧼 Post-processed text:\n{formatted_text[:2000]}")
+
+        # Proceed with parsing to JSON
+        generated_text = self.conversation_to_json(formatted_text)
 
         if progress:
             progress(0.4, "🔍 Processing generated script...")
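
Note on the new call site: `self.conversation_to_json(formatted_text)` is referenced above but is not part of this commit, so the diff does not show what it returns. The sketch below is a minimal illustration of that step, assuming the post-processed transcript really does follow the strict `Speaker 1:` / `Speaker 2:` layout that `postprocess_conversation` asks for, and assuming a simple `{"conversation": [{"speaker": ..., "text": ...}]}` schema. Both the standalone function form and the schema are assumptions for illustration, not taken from app.py.

import json
import re

# Illustrative sketch only: the real conversation_to_json lives elsewhere in app.py
# and may use a different schema. Assumed output: a JSON string shaped like
# {"conversation": [{"speaker": 1, "text": "..."}, ...]}.
def conversation_to_json(formatted_text: str) -> str:
    pattern = re.compile(r"^Speaker\s*([12])\s*:\s*(.+)$")
    conversation = []
    for line in formatted_text.splitlines():
        match = pattern.match(line.strip())
        if match:  # lines that still violate the enforced format are dropped
            conversation.append({
                "speaker": int(match.group(1)),
                "text": match.group(2).strip(),
            })
    return json.dumps({"conversation": conversation}, ensure_ascii=False)

A string of this shape would give the downstream clean_and_validate_json step predictable input, since every surviving line is guaranteed to carry a speaker tag.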