Spaces:
Running
on
L40S
Running
on
L40S
Update app.py
Browse files
app.py
CHANGED
@@ -165,11 +165,14 @@ Take the following input conversation, and reformat it so that:
|
|
165 |
- No timestamps, names, parentheses, or extra formatting, no chapter names, no special characters beside ":"
|
166 |
- No blank lines
|
167 |
- Do not invent or change the content
|
168 |
-
|
169 |
Example output:
|
170 |
Speaker 1: Hello and welcome.
|
171 |
Speaker 2: Thanks! Glad to be here.
|
172 |
-
|
|
|
|
|
|
|
173 |
Now format the following:
|
174 |
{raw_text}
|
175 |
"""
|
@@ -237,21 +240,28 @@ Now format the following:
|
|
237 |
add_log("⚠️ No valid JSON found, creating fallback")
|
238 |
return self.create_fallback_podcast(text)
|
239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
def conversation_to_json(self, text: str) -> Dict:
|
241 |
|
242 |
"""Convert speaker-formatted text to podcast JSON structure"""
|
243 |
# Allow leading whitespace and enforce full line match
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
match = re.match(r'^Speaker\s*([12])\s*:\s*(.+)', line)
|
252 |
-
if match:
|
253 |
-
speaker, content = match.groups()
|
254 |
-
podcast.append({"speaker": int(speaker), "line": content.strip()})
|
255 |
|
256 |
return {
|
257 |
"topic": "Generated from Input",
|
|
|
165 |
- No timestamps, names, parentheses, or extra formatting, no chapter names, no special characters beside ":"
|
166 |
- No blank lines
|
167 |
- Do not invent or change the content
|
168 |
+
- you are not allowed to use anywhere in the text the character +#-*<>"()[]
|
169 |
Example output:
|
170 |
Speaker 1: Hello and welcome.
|
171 |
Speaker 2: Thanks! Glad to be here.
|
172 |
+
Speaker 1: ...
|
173 |
+
Speaker 2: ...
|
174 |
+
Speaker 1: ...
|
175 |
+
Speaker 2: ...
|
176 |
Now format the following:
|
177 |
{raw_text}
|
178 |
"""
|
|
|
240 |
add_log("⚠️ No valid JSON found, creating fallback")
|
241 |
return self.create_fallback_podcast(text)
|
242 |
|
243 |
+
def normalize_speaker_lines(self, text: str) -> str:
    """Rewrite speaker prefixes to the canonical 'Speaker N: ' form.

    On each line, the shortest leading run that ends in a ``1`` or ``2``
    followed by optional non-alphanumeric decoration (markdown asterisks,
    brackets, spaces, ...) and then a ``:`` or ``-`` is replaced by
    ``Speaker 1: `` / ``Speaker 2: ``.  Lines without such a prefix are
    returned unchanged.

    Args:
        text: Conversation text, one utterance per line.

    Returns:
        The text with every recognised prefix normalised.
    """
    # The decoration class and the trailing whitespace class both exclude
    # line breaks.  Without that, a prefix with no content after it
    # ("Speaker 1:\n") would swallow the newline and glue the next line onto
    # this one, and a digit at the end of one line could pair with a '-' or
    # ':' at the start of the next.
    prefix_pattern = r'^.*?([12])[^a-zA-Z0-9\r\n]*[:\-][^\S\r\n]*'
    return re.sub(prefix_pattern, r'Speaker \1: ', text, flags=re.MULTILINE)
|
253 |
+
|
254 |
def conversation_to_json(self, text: str) -> Dict:
|
255 |
|
256 |
"""Convert speaker-formatted text to podcast JSON structure"""
|
257 |
# Allow leading whitespace and enforce full line match
|
258 |
+
"""Convert speaker-formatted text to podcast JSON structure"""
|
259 |
+
text = self.normalize_speaker_lines(text)
|
260 |
+
|
261 |
+
|
262 |
+
# Match strict "Speaker X: ..." lines only
|
263 |
+
lines = re.findall(r'^Speaker\s+([12]):\s*(.+)', text, flags=re.MULTILINE)
|
264 |
+
podcast = [{"speaker": int(s), "line": l.strip()} for s, l in lines]
|
|
|
|
|
|
|
|
|
265 |
|
266 |
return {
|
267 |
"topic": "Generated from Input",
|