Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -149,12 +149,54 @@ class PodcastGenerator:
|
|
149 |
|
150 |
add_log(f"β
PDF extraction complete. Text length: {len(text)} characters")
|
151 |
return text.strip()
|
152 |
-
|
153 |
except Exception as e:
|
154 |
error_msg = f"β PDF extraction failed: {str(e)}"
|
155 |
add_log(error_msg)
|
156 |
raise Exception(error_msg)
|
157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
def clean_and_validate_json(self, text: str) -> Dict:
|
159 |
"""Improved JSON extraction and validation - CRITICAL FIX #4"""
|
160 |
add_log("π Attempting to extract JSON from generated text")
|
@@ -379,6 +421,12 @@ Speaker 2: ...
|
|
379 |
|
380 |
add_log(f"π Generated text length: {len(generated_text)} characters")
|
381 |
add_log(f"π Generated text preview: {generated_text[:2000]}...")
|
|
|
|
|
|
|
|
|
|
|
|
|
382 |
|
383 |
if progress:
|
384 |
progress(0.4, "π Processing generated script...")
|
|
|
149 |
|
150 |
add_log(f"β
PDF extraction complete. Text length: {len(text)} characters")
|
151 |
return text.strip()
|
152 |
+
|
153 |
except Exception as e:
|
154 |
error_msg = f"β PDF extraction failed: {str(e)}"
|
155 |
add_log(error_msg)
|
156 |
raise Exception(error_msg)
|
157 |
|
158 |
+
async def postprocess_conversation(
    self,
    raw_text: str,
    *,
    max_input_tokens: int = 2048,
    max_new_tokens: int = 1024,
) -> str:
    """Run the LLM a second time to enforce a strict ``Speaker 1:``/``Speaker 2:`` format.

    Args:
        raw_text: Conversation text produced by the first generation pass.
        max_input_tokens: Token budget handed to the tokenizer as
            ``max_length``; the tokenizer is called with ``truncation=True``.
        max_new_tokens: Upper bound on the number of tokens generated for
            the reformatted conversation.

    Returns:
        The reformatted conversation with surrounding whitespace removed.
        Blank input yields ``""`` without calling the model. If the model
        produces only whitespace, the stripped input is returned instead so
        the caller still has the original content to parse.
    """
    # Nothing to format: skip the model call rather than let it invent text.
    if not raw_text or not raw_text.strip():
        return ""

    # The prompt is assembled from explicit lines so its exact text does not
    # depend on how this method happens to be indented in the source file.
    instructions = "\n".join((
        "",
        "You are a podcast formatter.",
        "",
        "Take the following input conversation, and reformat it so that:",
        "- Every line begins with exactly `Speaker 1:` or `Speaker 2:` (with colon)",
        "- No timestamps, names, parentheses, or extra formatting",
        "- No blank lines",
        "- Do not invent or change the content",
        "",
        "Example output:",
        "Speaker 1: Hello and welcome.",
        "Speaker 2: Thanks! Glad to be here.",
        "",
        "Now format the following:",
    ))
    prompt = f"{instructions}\n{raw_text}\n"

    # NOTE(review): with truncation enabled, a prompt longer than
    # max_input_tokens is cut by the tokenizer, and the conversation sits at
    # the end of the prompt -- presumably its tail is what gets dropped
    # (depends on the tokenizer's truncation side; confirm). Raise
    # max_input_tokens for long scripts if the model's context allows it.
    encoded = self.tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )
    device = self.model.device
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Some tokenizers define no pad token; fall back to EOS instead of
    # passing pad_token_id=None to generate().
    pad_token_id = self.tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = self.tokenizer.eos_token_id

    with torch.no_grad():
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    formatted = self.tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    ).strip()

    # An empty generation would silently discard the whole conversation
    # downstream, so hand back the original text in that case.
    return formatted or raw_text.strip()
|
198 |
+
|
199 |
+
|
200 |
def clean_and_validate_json(self, text: str) -> Dict:
|
201 |
"""Improved JSON extraction and validation - CRITICAL FIX #4"""
|
202 |
add_log("π Attempting to extract JSON from generated text")
|
|
|
421 |
|
422 |
add_log(f"π Generated text length: {len(generated_text)} characters")
|
423 |
add_log(f"π Generated text preview: {generated_text[:2000]}...")
|
424 |
+
|
425 |
+
formatted_text = await self.postprocess_conversation(generated_text)
|
426 |
+
add_log(f"π§Ό Post-processed text:\n{formatted_text[:2000]}")
|
427 |
+
|
428 |
+
# Proceed with parsing to JSON
|
429 |
+
generated_text = self.conversation_to_json(formatted_text)
|
430 |
|
431 |
if progress:
|
432 |
progress(0.4, "π Processing generated script...")
|