Update app.py
app.py CHANGED
@@ -8,7 +8,8 @@ from typing import Union, Optional, Dict, Tuple # Import Dict and Tuple
 from synthgen import (
     generate_synthetic_text,
     generate_prompts,
-    generate_synthetic_conversation
+    generate_synthetic_conversation,
+    generate_corpus_content # Import the new function
 )
 # We no longer need to import api_key here or check it directly in app.py
 
@@ -27,6 +28,21 @@ def create_json_file(data: object, base_filename: str) -> Union[str, None]:
         print(f"Error creating JSON file {base_filename}: {e}")
         return None
 
+# Add the missing function definition
+def create_text_file(data: str, base_filename: str) -> Union[str, None]:
+    """Creates a temporary text file and returns its path."""
+    try:
+        # Ensure filename ends with .txt
+        if not base_filename.lower().endswith(".txt"):
+            base_filename += ".txt" # Append if missing for clarity, though suffix handles it
+        # Create a temporary file with a .txt extension
+        with tempfile.NamedTemporaryFile(mode='w', suffix=".txt", delete=False, encoding='utf-8') as temp_file:
+            temp_file.write(data)
+            return temp_file.name # Return the path to the temporary file
+    except Exception as e:
+        print(f"Error creating text file {base_filename}: {e}")
+        return None
+
 def parse_conversation_string(text: str) -> list[dict]:
     """Parses a multi-line conversation string into a list of message dictionaries."""
     messages = []
@@ -252,11 +268,222 @@ def run_conversation_generation_and_prepare_json(
     return (gr.update(value=output_str), gr.update(value=json_filepath))
 
 
+# Define content_type_labels globally for use in UI and wrapper functions
+content_type_labels = {
+    "Corpus Snippets": "# Snippets",
+    "Short Story": "Approx Words",
+    "Article": "Approx Words"
+}
+content_type_defaults = {
+    "Corpus Snippets": 5,
+    "Short Story": 1000, # Match new backend default
+    "Article": 1500 # Match new backend default
+}
+
+# Wrapper for Corpus/Content Generation
+def run_corpus_generation_and_prepare_file(
+    topic: str,
+    content_type: str,
+    length_param: int,
+    model: str,
+    temperature: float,
+    top_p: float,
+    max_tokens: int
+) -> Tuple[gr.update, gr.update]:
+    """Generates corpus/story/article content and prepares a file for download."""
+    temp_val = temperature if temperature > 0 else None
+    top_p_val = top_p if 0 < top_p <= 1 else None
+    max_tokens_val = max_tokens if max_tokens > 0 else None
+
+    # Use the global dictionary for error messages
+    label_for_error = content_type_labels.get(content_type, 'Length Param')
+    if not topic: return (gr.update(value="Error: Please enter a topic."), gr.update(value=None))
+    if not content_type: return (gr.update(value="Error: Please select a content type."), gr.update(value=None))
+    if length_param <= 0: return (gr.update(value=f"Error: Please enter a positive value for '{label_for_error}'."), gr.update(value=None))
+
+    print(f"Generating {content_type} about '{topic}'...")
+    output_str = f"Generating {content_type} about '{topic}' using model '{model}'...\n"
+    output_str += f"(Settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val})\n" + "="*40 + "\n\n"
+
+    generated_content = generate_corpus_content(
+        topic=topic, content_type=content_type, length_param=length_param, model=model,
+        temperature=temp_val, top_p=top_p_val, max_tokens=max_tokens_val
+    )
+    output_str += generated_content
+
+    file_path = None
+    if not generated_content.startswith("Error:"):
+        core_content = generated_content
+        if "\n\n" in generated_content: parts = generated_content.split("\n\n", 1); core_content = parts[1] if len(parts) > 1 else generated_content
+        if content_type == "Corpus Snippets":
+            snippets = [s.strip() for s in core_content.split('---') if s.strip()]
+            if not snippets: snippets = [s.strip() for s in core_content.split('\n\n') if s.strip()]
+            corpus_data = {"topic": topic, "snippets": snippets}
+            file_path = create_json_file(corpus_data, f"{topic}_corpus.json")
+        else:
+            file_path = create_text_file(core_content, f"{topic}_{content_type.replace(' ','_')}.txt")
+
+    return (gr.update(value=output_str), gr.update(value=file_path))
+
+# NEW function to update the length parameter label and default value
+def update_length_param_ui(content_type: str) -> gr.update:
+    """Updates the label and default value of the length parameter input."""
+    new_label = content_type_labels.get(content_type, "Length Param")
+    new_value = content_type_defaults.get(content_type, 5) # Default to 5 if type unknown
+    return gr.update(label=new_label, value=new_value)
+
+
+# --- Generation Wrappers ---
+# ... (generate_prompts_ui, run_generation_and_prepare_json, run_conversation_generation_and_prepare_json remain the same) ...
+
+# NEW UI Wrapper for generating TOPICS
+def generate_topics_ui(
+    num_topics: int,
+    model: str,
+    temperature: float,
+    top_p: float,
+    max_tokens: int
+) -> str:
+    """UI Wrapper to generate diverse topics using the AI."""
+    temp_val = temperature if temperature > 0 else None
+    top_p_val = top_p if 0 < top_p <= 1 else None
+    max_tokens_val = max_tokens if max_tokens > 0 else 150 # Limit token for topic list
+
+    if not model:
+        return "Error: Please select a model for topic generation."
+    if num_topics <= 0:
+        return "Error: Number of topics to generate must be positive."
+    if num_topics > 50: # Keep limit reasonable
+        return "Error: Cannot generate more than 50 topics at a time."
+
+    print(f"Generating {num_topics} topics with settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val}")
+
+    # Instruction focused on generating topics
+    instruction = (
+        f"Generate exactly {num_topics} diverse and interesting topics suitable for generating content like articles, stories, or corpus snippets. "
+        f"Each topic should be concise (a few words to a short phrase). "
+        f"Present each topic on a new line, with no other introductory or concluding text or numbering."
+        f"\n\nExamples:\n"
+        f"The future of renewable energy\n"
+        f"The history of the Silk Road\n"
+        f"The impact of social media on mental health"
+    )
+    system_msg = "You are an expert topic generator. Follow the user's instructions precisely."
+
+    try:
+        # Use the core text generation function
+        generated_text = generate_synthetic_text(
+            instruction,
+            model,
+            system_message=system_msg,
+            temperature=temp_val,
+            top_p=top_p_val,
+            max_tokens=max_tokens_val
+        )
+
+        if generated_text.startswith("Error:"):
+            raise ValueError(generated_text) # Propagate error
+
+        # Split into lines and clean up
+        topics_list = [t.strip() for t in generated_text.strip().split('\n') if t.strip()]
+
+        if not topics_list:
+            print(f"Warning: Failed to parse topics from generated text. Raw text:\n{generated_text}")
+            raise ValueError("AI failed to generate topics in the expected format.")
+
+        # Return newline-separated string for the Textbox
+        return "\n".join(topics_list[:num_topics]) # Truncate if needed
+
+    except ValueError as e:
+        return f"Error generating topics: {e}"
+    except Exception as e:
+        print(f"Unexpected error in generate_topics_ui: {e}")
+        return f"An unexpected error occurred: {e}"
+
+# Modified Wrapper for Bulk Corpus/Content Generation
+def run_bulk_content_generation_and_prepare_json(
+    topics_text: str, # Renamed from topic
+    content_type: str,
+    length_param: int,
+    model: str,
+    temperature: float,
+    top_p: float,
+    max_tokens: int
+) -> Tuple[gr.update, gr.update]:
+    """Generates content for multiple topics and prepares a JSON file."""
+    temp_val = temperature if temperature > 0 else None
+    top_p_val = top_p if 0 < top_p <= 1 else None
+    max_tokens_val = max_tokens if max_tokens > 0 else None
+
+    # --- Input Validation ---
+    if not topics_text:
+        return (gr.update(value="Error: Please enter or generate at least one topic."), gr.update(value=None))
+    if not content_type:
+        return (gr.update(value="Error: Please select a content type."), gr.update(value=None))
+
+    topics = [t.strip() for t in topics_text.strip().split('\n') if t.strip()]
+    if not topics:
+        return (gr.update(value="Error: No valid topics found in the input."), gr.update(value=None))
+
+    label_for_error = content_type_labels.get(content_type, 'Length Param')
+    if length_param <= 0:
+        return (gr.update(value=f"Error: Please enter a positive value for '{label_for_error}'."), gr.update(value=None))
+    # --- End Validation ---
+
+    output_str = f"Generating {content_type} for {len(topics)} topics using model '{model}'...\n"
+    output_str += f"(Settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val})\n" + "="*40 + "\n\n"
+
+    bulk_results = [] # Store results for JSON
+
+    # --- Loop through topics ---
+    for i, topic in enumerate(topics):
+        print(f"Generating {content_type} for topic {i+1}/{len(topics)}: '{topic}'...")
+        output_str += f"--- Topic {i+1}/{len(topics)}: '{topic}' ---\n"
+
+        generated_content_full = generate_corpus_content( # Returns string including title/error
+            topic=topic, content_type=content_type, length_param=length_param, model=model,
+            temperature=temp_val, top_p=top_p_val, max_tokens=max_tokens_val
+        )
+
+        output_str += generated_content_full + "\n\n" # Add full result to textbox
+
+        # --- Prepare structured result for JSON ---
+        result_entry = {"topic": topic, "content_type": content_type}
+        if generated_content_full.startswith("Error:"):
+            result_entry["status"] = "error"
+            result_entry["error_message"] = generated_content_full
+            result_entry["content"] = None
+        else:
+            result_entry["status"] = "success"
+            result_entry["error_message"] = None
+            # Extract core content (remove potential title added by backend)
+            core_content = generated_content_full
+            if "\n\n" in generated_content_full:
+                parts = generated_content_full.split("\n\n", 1)
+                core_content = parts[1] if len(parts) > 1 else generated_content_full
+
+            if content_type == "Corpus Snippets":
+                snippets = [s.strip() for s in core_content.split('---') if s.strip()]
+                if not snippets: snippets = [s.strip() for s in core_content.split('\n\n') if s.strip()]
+                result_entry["content"] = snippets # Store list for corpus
+            else:
+                result_entry["content"] = core_content # Store string for story/article
+
+        bulk_results.append(result_entry)
+        # --- End JSON preparation ---
+
+    # --- Finalize ---
+    output_str += "="*40 + f"\nBulk generation complete for {len(topics)} topics."
+    json_filepath = create_json_file(bulk_results, f"{content_type.replace(' ','_')}_bulk_results.json")
+
+    return (gr.update(value=output_str), gr.update(value=json_filepath))
+
+
 # --- Gradio Interface Definition ---
 with gr.Blocks() as demo:
     gr.Markdown("# Synthetic Data Generator using OpenRouter")
     gr.Markdown(
-        "Generate synthetic text samples or
+        "Generate synthetic text samples, conversations, or other content using various models"
     )
     # Removed the api_key_loaded check and warning Markdown
 
@@ -370,10 +597,77 @@ with gr.Blocks() as demo:
             outputs=[output_conv, download_file_conv] # Output to both Textbox and File
         )
 
+    # --- Content Generation Tab (Modified for Bulk) ---
+    with gr.TabItem("Bulk Content Generation"):
+        output_content = gr.Textbox(label="Generated Content (Log)", lines=15, show_copy_button=True)
+        # Output is now always JSON
+        download_file_content = gr.File(label="Download Results as JSON")
+
+        gr.Markdown("Enter one topic per line below, or use the 'Generate Topics' button.")
+        with gr.Row():
+            # Changed to multi-line Textbox
+            topic_input_content = gr.Textbox(
+                label="Topics (one per line)",
+                lines=5,
+                placeholder="Enter topics here, one per line...\ne.g., The future of renewable energy\nThe history of the Silk Road"
+            )
+
+        # --- Topic Generation ---
+        with gr.Accordion("Topic Generation Options", open=False):
+            with gr.Row():
+                num_topics_input = gr.Number(label="# Topics to Generate", value=5, minimum=1, maximum=50, step=1)
+                # Use shared model selector below and settings
+            generate_topics_button = gr.Button("Generate Topics using AI")
+
+        # --- Generation Settings ---
+        with gr.Row():
+            content_type_choices = list(content_type_labels.keys())
+            content_type_input = gr.Dropdown(
+                label="Content Type", choices=content_type_choices, value=content_type_choices[0]
+            )
+            default_length_label = content_type_labels[content_type_choices[0]]
+            default_length_value = content_type_defaults[content_type_choices[0]]
+            length_param_input = gr.Number(
+                label=default_length_label, value=default_length_value, minimum=1, step=1
+            )
+        with gr.Row():
+            model_input_content = gr.Dropdown(label="Model", choices=model_choices, value=default_model)
+
+        # Button to trigger bulk generation
+        generate_content_button = gr.Button("Generate Bulk Content")
+
+        # --- Event Listeners ---
+        # Listener to update length param UI
+        content_type_input.change(
+            fn=update_length_param_ui,
+            inputs=content_type_input,
+            outputs=length_param_input
+        )
+        # Listener for topic generation button
+        generate_topics_button.click(
+            fn=generate_topics_ui,
+            inputs=[ # Pass necessary inputs for topic generation
+                num_topics_input, model_input_content, # Use this tab's model selector
+                temperature_slider, top_p_slider, max_tokens_slider
+            ],
+            outputs=topic_input_content # Output generated topics to the textbox
+        )
+        # Listener for main generation button
+        generate_content_button.click(
+            fn=run_bulk_content_generation_and_prepare_json, # Use the new bulk wrapper
+            inputs=[
+                topic_input_content, content_type_input, length_param_input,
+                model_input_content,
+                temperature_slider, top_p_slider, max_tokens_slider
+            ],
+            outputs=[output_content, download_file_content]
+        )
+
 
 # Launch the Gradio app
 if __name__ == "__main__":
     print("Launching Gradio App...")
     print("Make sure the OPENROUTER_API_KEY environment variable is set.")
     # Use share=True for temporary public link if running locally and need to test
-    demo.launch() # share=True
+    demo.launch() # share=True
+
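
The bulk wrapper introduced in this commit writes one entry per topic (keys "topic", "content_type", "status", "error_message", "content") and hands the list to create_json_file for the download component. As a minimal sketch of how that exported JSON could be consumed afterwards (the file name below is hypothetical; the real path is whatever temporary file create_json_file returns):

import json

# Hypothetical file name; use the path of the downloaded results file.
with open("Article_bulk_results.json", encoding="utf-8") as fh:
    bulk_results = json.load(fh)

# Keep only entries the wrapper marked as successful.
successes = [entry for entry in bulk_results if entry["status"] == "success"]

for entry in successes:
    content = entry["content"]
    # "content" is a list of snippets for "Corpus Snippets", otherwise a single string.
    size = f"{len(content)} snippets" if isinstance(content, list) else f"{len(content)} characters"
    print(f"{entry['topic']}: {size}")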