ReallyFloppyPenguin committed on
Commit
b4174f8
·
verified ·
1 Parent(s): c429c3c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +297 -3
app.py CHANGED
@@ -8,7 +8,8 @@ from typing import Union, Optional, Dict, Tuple # Import Dict and Tuple
8
  from synthgen import (
9
  generate_synthetic_text,
10
  generate_prompts,
11
- generate_synthetic_conversation
 
12
  )
13
  # We no longer need to import api_key here or check it directly in app.py
14
 
@@ -27,6 +28,21 @@ def create_json_file(data: object, base_filename: str) -> Union[str, None]:
27
  print(f"Error creating JSON file {base_filename}: {e}")
28
  return None
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def parse_conversation_string(text: str) -> list[dict]:
31
  """Parses a multi-line conversation string into a list of message dictionaries."""
32
  messages = []
@@ -252,11 +268,222 @@ def run_conversation_generation_and_prepare_json(
252
  return (gr.update(value=output_str), gr.update(value=json_filepath))
253
 
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  # --- Gradio Interface Definition ---
256
  with gr.Blocks() as demo:
257
  gr.Markdown("# Synthetic Data Generator using OpenRouter")
258
  gr.Markdown(
259
- "Generate synthetic text samples or conversations using various models"
260
  )
261
  # Removed the api_key_loaded check and warning Markdown
262
 
@@ -370,10 +597,77 @@ with gr.Blocks() as demo:
370
  outputs=[output_conv, download_file_conv] # Output to both Textbox and File
371
  )
372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
 
374
  # Launch the Gradio app
375
  if __name__ == "__main__":
376
  print("Launching Gradio App...")
377
  print("Make sure the OPENROUTER_API_KEY environment variable is set.")
378
  # Use share=True for temporary public link if running locally and need to test
379
- demo.launch() # share=True
 
 
8
  from synthgen import (
9
  generate_synthetic_text,
10
  generate_prompts,
11
+ generate_synthetic_conversation,
12
+ generate_corpus_content # Import the new function
13
  )
14
  # We no longer need to import api_key here or check it directly in app.py
15
 
 
28
  print(f"Error creating JSON file {base_filename}: {e}")
29
  return None
30
 
31
+ # Add the missing function definition
32
def create_text_file(data: str, base_filename: str) -> Union[str, None]:
    """Write ``data`` to a new temporary UTF-8 ``.txt`` file and return its path.

    The file is created with ``delete=False`` so it still exists after this
    function returns; the caller is responsible for it from then on.

    Args:
        data: Text to write into the file.
        base_filename: Human-readable name for the file. Its stem (a trailing
            ``.txt`` removed, runs of characters other than letters, digits,
            ``_``, ``.`` and ``-`` replaced by ``_``) is used as the prefix of
            the temporary file name, so the resulting file is recognisable.

    Returns:
        The path of the created file, or ``None`` if it could not be created
        or written (the error is printed).
    """
    import os
    import re

    temp_path = None
    try:
        # Ensure the name used in messages ends with .txt
        if not base_filename.lower().endswith(".txt"):
            base_filename += ".txt"

        # Derive a filesystem-safe prefix from the requested name. A raw name
        # may contain path separators, which are not valid inside a prefix.
        stem = base_filename[:-len(".txt")]
        safe_stem = re.sub(r"[^\w.-]+", "_", stem).strip("._")[:50] or "content"

        with tempfile.NamedTemporaryFile(
            mode="w",
            prefix=f"{safe_stem}_",
            suffix=".txt",
            delete=False,
            encoding="utf-8",
        ) as temp_file:
            temp_path = temp_file.name
            temp_file.write(data)
        return temp_path
    except Exception as e:
        # Best-effort helper: callers expect None on failure, not an exception.
        print(f"Error creating text file {base_filename}: {e}")
        if temp_path is not None:
            # Do not leave a half-written file behind.
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return None
45
+
46
  def parse_conversation_string(text: str) -> list[dict]:
47
  """Parses a multi-line conversation string into a list of message dictionaries."""
48
  messages = []
 
268
  return (gr.update(value=output_str), gr.update(value=json_filepath))
269
 
270
 
271
+ # Define content_type_labels globally for use in UI and wrapper functions
272
# One row per content type: (content type name, label for the length input,
# default value for the length input). Row order is the key order of both
# dictionaries below.
_CONTENT_TYPE_TABLE = (
    ("Corpus Snippets", "# Snippets", 5),
    ("Short Story", "Approx Words", 1000),  # Match new backend default
    ("Article", "Approx Words", 1500),  # Match new backend default
)

# Content type -> label shown on the length parameter input.
content_type_labels = {
    name: label for name, label, _default in _CONTENT_TYPE_TABLE
}
# Content type -> default value of the length parameter input.
content_type_defaults = {
    name: default for name, _label, default in _CONTENT_TYPE_TABLE
}
282
+
283
+ # Wrapper for Corpus/Content Generation
284
def run_corpus_generation_and_prepare_file(
    topic: str,
    content_type: str,
    length_param: int,
    model: str,
    temperature: float,
    top_p: float,
    max_tokens: int
) -> Tuple[gr.update, gr.update]:
    """Generate corpus/story/article content for one topic and prepare a file.

    Args:
        topic: Subject to generate content about; surrounding whitespace is
            ignored and an empty topic is rejected.
        content_type: One of the keys of ``content_type_labels``.
        length_param: Number of snippets or approximate word count. Must be
            positive; ``None`` (cleared input) and non-integers are tolerated
            and coerced to ``int`` before being passed on.
        model: Model identifier forwarded to ``generate_corpus_content``.
        temperature: Forwarded only when > 0, otherwise ``None`` is sent.
        top_p: Forwarded only when in (0, 1], otherwise ``None`` is sent.
        max_tokens: Forwarded only when > 0, otherwise ``None`` is sent.

    Returns:
        A pair of ``gr.update`` objects: the text log for the output box and
        the path of the download file (``None`` on validation or generation
        error). "Corpus Snippets" produce a JSON file, everything else a
        text file.
    """
    def _failure(message: str):
        # Validation failures show a message and clear the download slot.
        return (gr.update(value=message), gr.update(value=None))

    temp_val = temperature if temperature is not None and temperature > 0 else None
    top_p_val = top_p if top_p is not None and 0 < top_p <= 1 else None
    max_tokens_val = max_tokens if max_tokens is not None and max_tokens > 0 else None

    # Use the global dictionary for error messages
    label_for_error = content_type_labels.get(content_type, 'Length Param')

    topic = (topic or "").strip()
    if not topic:
        return _failure("Error: Please enter a topic.")
    if not content_type:
        return _failure("Error: Please select a content type.")

    # A numeric input can arrive as None (cleared field) or as a float;
    # normalise to int once so the comparison cannot raise.
    try:
        length_value = int(length_param)
    except (TypeError, ValueError, OverflowError):
        length_value = 0
    if length_value <= 0:
        return _failure(f"Error: Please enter a positive value for '{label_for_error}'.")

    print(f"Generating {content_type} about '{topic}'...")
    output_str = f"Generating {content_type} about '{topic}' using model '{model}'...\n"
    output_str += f"(Settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val})\n" + "="*40 + "\n\n"

    generated_content = generate_corpus_content(
        topic=topic, content_type=content_type, length_param=length_value, model=model,
        temperature=temp_val, top_p=top_p_val, max_tokens=max_tokens_val
    )
    output_str += generated_content

    file_path = None
    if not generated_content.startswith("Error:"):
        # Everything before the first blank line is treated as a title and
        # dropped (presumably added by the backend -- verify against synthgen).
        _title, separator, remainder = generated_content.partition("\n\n")
        core_content = remainder if separator else generated_content

        # Keep path separators and other reserved characters out of file names.
        safe_topic = "".join(
            "_" if (ch in '\\/:*?"<>|' or ord(ch) < 32) else ch for ch in topic
        )

        if content_type == "Corpus Snippets":
            snippets = [s.strip() for s in core_content.split('---') if s.strip()]
            if not snippets:
                # Fall back to paragraph boundaries when no '---' separators exist.
                snippets = [s.strip() for s in core_content.split('\n\n') if s.strip()]
            corpus_data = {"topic": topic, "snippets": snippets}
            file_path = create_json_file(corpus_data, f"{safe_topic}_corpus.json")
        else:
            file_path = create_text_file(
                core_content, f"{safe_topic}_{content_type.replace(' ','_')}.txt"
            )

    return (gr.update(value=output_str), gr.update(value=file_path))
327
+
328
+ # NEW function to update the length parameter label and default value
329
def update_length_param_ui(content_type: str) -> gr.update:
    """Return an update carrying the length input's label and value for ``content_type``.

    Unknown content types fall back to the label "Length Param" and the
    value 5.
    """
    return gr.update(
        label=content_type_labels.get(content_type, "Length Param"),
        value=content_type_defaults.get(content_type, 5),
    )
334
+
335
+
336
+ # --- Generation Wrappers ---
337
+ # ... (generate_prompts_ui, run_generation_and_prepare_json, run_conversation_generation_and_prepare_json remain the same) ...
338
+
339
+ # NEW UI Wrapper for generating TOPICS
340
def generate_topics_ui(
    num_topics: int,
    model: str,
    temperature: float,
    top_p: float,
    max_tokens: int
) -> str:
    """Ask the model for a list of topics and return them one per line.

    Args:
        num_topics: How many topics to request (1-50). ``None`` and
            non-integer numbers are tolerated; the value is coerced to
            ``int`` before it is validated and used.
        model: Model identifier; must be non-empty.
        temperature: Forwarded only when > 0, otherwise ``None`` is sent.
        top_p: Forwarded only when in (0, 1], otherwise ``None`` is sent.
        max_tokens: Forwarded when > 0, otherwise capped at 150.

    Returns:
        Newline-separated topics (at most ``num_topics`` of them), or a
        message starting with "Error" / "An unexpected error occurred" when
        validation or generation fails. This function does not raise.
    """
    temp_val = temperature if temperature is not None and temperature > 0 else None
    top_p_val = top_p if top_p is not None and 0 < top_p <= 1 else None
    # Limit tokens for the topic list when no positive limit is supplied.
    max_tokens_val = max_tokens if max_tokens is not None and max_tokens > 0 else 150

    if not model:
        return "Error: Please select a model for topic generation."

    # A numeric input may deliver None or a float; slicing with a float
    # raises TypeError, so normalise to int once and use that everywhere.
    try:
        topic_count = int(num_topics)
    except (TypeError, ValueError, OverflowError):
        topic_count = 0
    if topic_count <= 0:
        return "Error: Number of topics to generate must be positive."
    if topic_count > 50:  # Keep limit reasonable
        return "Error: Cannot generate more than 50 topics at a time."

    print(f"Generating {topic_count} topics with settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val}")

    # Instruction focused on generating topics
    instruction = (
        f"Generate exactly {topic_count} diverse and interesting topics suitable for generating content like articles, stories, or corpus snippets. "
        f"Each topic should be concise (a few words to a short phrase). "
        f"Present each topic on a new line, with no other introductory or concluding text or numbering."
        f"\n\nExamples:\n"
        f"The future of renewable energy\n"
        f"The history of the Silk Road\n"
        f"The impact of social media on mental health"
    )
    system_msg = "You are an expert topic generator. Follow the user's instructions precisely."

    try:
        # Use the core text generation function
        generated_text = generate_synthetic_text(
            instruction,
            model,
            system_message=system_msg,
            temperature=temp_val,
            top_p=top_p_val,
            max_tokens=max_tokens_val
        )

        if generated_text.startswith("Error:"):
            raise ValueError(generated_text)  # Propagate error

        # Split into lines and drop blank ones
        topics_list = [t.strip() for t in generated_text.strip().split('\n') if t.strip()]

        if not topics_list:
            print(f"Warning: Failed to parse topics from generated text. Raw text:\n{generated_text}")
            raise ValueError("AI failed to generate topics in the expected format.")

        # Return newline-separated string for the Textbox, truncated if the
        # model produced more lines than requested.
        return "\n".join(topics_list[:topic_count])

    except ValueError as e:
        return f"Error generating topics: {e}"
    except Exception as e:
        # UI boundary: report the problem in the textbox instead of raising.
        print(f"Unexpected error in generate_topics_ui: {e}")
        return f"An unexpected error occurred: {e}"
402
+
403
+ # Modified Wrapper for Bulk Corpus/Content Generation
404
def run_bulk_content_generation_and_prepare_json(
    topics_text: str,  # Renamed from topic
    content_type: str,
    length_param: int,
    model: str,
    temperature: float,
    top_p: float,
    max_tokens: int
) -> Tuple[gr.update, gr.update]:
    """Generate content for every topic in ``topics_text`` and prepare a JSON file.

    Args:
        topics_text: Topics, one per line; blank lines are ignored.
        content_type: One of the keys of ``content_type_labels``.
        length_param: Number of snippets or approximate word count. Must be
            positive; ``None`` (cleared input) and non-integers are tolerated
            and coerced to ``int`` before being passed on.
        model: Model identifier forwarded to ``generate_corpus_content``.
        temperature: Forwarded only when > 0, otherwise ``None`` is sent.
        top_p: Forwarded only when in (0, 1], otherwise ``None`` is sent.
        max_tokens: Forwarded only when > 0, otherwise ``None`` is sent.

    Returns:
        A pair of ``gr.update`` objects: the text log for the output box and
        the path of a JSON file holding one entry per topic with the keys
        ``topic``, ``content_type``, ``status``, ``error_message`` and
        ``content`` (a list of snippets for "Corpus Snippets", a string
        otherwise, ``None`` on error). On validation failure the file value
        is ``None``.
    """
    def _failure(message: str):
        # Validation failures show a message and clear the download slot.
        return (gr.update(value=message), gr.update(value=None))

    temp_val = temperature if temperature is not None and temperature > 0 else None
    top_p_val = top_p if top_p is not None and 0 < top_p <= 1 else None
    max_tokens_val = max_tokens if max_tokens is not None and max_tokens > 0 else None

    # --- Input Validation ---
    if not topics_text:
        return _failure("Error: Please enter or generate at least one topic.")
    if not content_type:
        return _failure("Error: Please select a content type.")

    topics = [t.strip() for t in topics_text.strip().split('\n') if t.strip()]
    if not topics:
        return _failure("Error: No valid topics found in the input.")

    label_for_error = content_type_labels.get(content_type, 'Length Param')
    # A numeric input can arrive as None (cleared field) or as a float;
    # normalise to int once so the comparison cannot raise.
    try:
        length_value = int(length_param)
    except (TypeError, ValueError, OverflowError):
        length_value = 0
    if length_value <= 0:
        return _failure(f"Error: Please enter a positive value for '{label_for_error}'.")
    # --- End Validation ---

    total = len(topics)
    # Collect log fragments and join once at the end.
    log_parts = [
        f"Generating {content_type} for {total} topics using model '{model}'...\n",
        f"(Settings: Temp={temp_val}, Top-P={top_p_val}, MaxTokens={max_tokens_val})\n" + "="*40 + "\n\n",
    ]
    bulk_results = []  # Store results for JSON

    # --- Loop through topics ---
    for position, topic in enumerate(topics, start=1):
        print(f"Generating {content_type} for topic {position}/{total}: '{topic}'...")
        log_parts.append(f"--- Topic {position}/{total}: '{topic}' ---\n")

        # Returns a string including title/error
        generated_content_full = generate_corpus_content(
            topic=topic, content_type=content_type, length_param=length_value, model=model,
            temperature=temp_val, top_p=top_p_val, max_tokens=max_tokens_val
        )
        log_parts.append(generated_content_full + "\n\n")  # Full result goes to the textbox

        # --- Prepare structured result for JSON ---
        result_entry = {"topic": topic, "content_type": content_type}
        if generated_content_full.startswith("Error:"):
            result_entry["status"] = "error"
            result_entry["error_message"] = generated_content_full
            result_entry["content"] = None
        else:
            result_entry["status"] = "success"
            result_entry["error_message"] = None
            # Everything before the first blank line is treated as a title
            # and dropped (presumably added by the backend -- verify).
            _title, separator, remainder = generated_content_full.partition("\n\n")
            core_content = remainder if separator else generated_content_full

            if content_type == "Corpus Snippets":
                snippets = [s.strip() for s in core_content.split('---') if s.strip()]
                if not snippets:
                    # Fall back to paragraph boundaries when no '---' separators exist.
                    snippets = [s.strip() for s in core_content.split('\n\n') if s.strip()]
                result_entry["content"] = snippets  # Store list for corpus
            else:
                result_entry["content"] = core_content  # Store string for story/article

        bulk_results.append(result_entry)
        # --- End JSON preparation ---

    # --- Finalize ---
    log_parts.append("="*40 + f"\nBulk generation complete for {total} topics.")
    output_str = "".join(log_parts)
    json_filepath = create_json_file(bulk_results, f"{content_type.replace(' ','_')}_bulk_results.json")

    return (gr.update(value=output_str), gr.update(value=json_filepath))
480
+
481
+
482
  # --- Gradio Interface Definition ---
483
  with gr.Blocks() as demo:
484
  gr.Markdown("# Synthetic Data Generator using OpenRouter")
485
  gr.Markdown(
486
+ "Generate synthetic text samples, conversations, or other content using various models"
487
  )
488
  # Removed the api_key_loaded check and warning Markdown
489
 
 
597
  outputs=[output_conv, download_file_conv] # Output to both Textbox and File
598
  )
599
 
600
+ # --- Content Generation Tab (Modified for Bulk) ---
601
+ with gr.TabItem("Bulk Content Generation"):
602
+ output_content = gr.Textbox(label="Generated Content (Log)", lines=15, show_copy_button=True)
603
+ # Output is now always JSON
604
+ download_file_content = gr.File(label="Download Results as JSON")
605
+
606
+ gr.Markdown("Enter one topic per line below, or use the 'Generate Topics' button.")
607
+ with gr.Row():
608
+ # Changed to multi-line Textbox
609
+ topic_input_content = gr.Textbox(
610
+ label="Topics (one per line)",
611
+ lines=5,
612
+ placeholder="Enter topics here, one per line...\ne.g., The future of renewable energy\nThe history of the Silk Road"
613
+ )
614
+
615
+ # --- Topic Generation ---
616
+ with gr.Accordion("Topic Generation Options", open=False):
617
+ with gr.Row():
618
+ num_topics_input = gr.Number(label="# Topics to Generate", value=5, minimum=1, maximum=50, step=1)
619
+ # Use shared model selector below and settings
620
+ generate_topics_button = gr.Button("Generate Topics using AI")
621
+
622
+ # --- Generation Settings ---
623
+ with gr.Row():
624
+ content_type_choices = list(content_type_labels.keys())
625
+ content_type_input = gr.Dropdown(
626
+ label="Content Type", choices=content_type_choices, value=content_type_choices[0]
627
+ )
628
+ default_length_label = content_type_labels[content_type_choices[0]]
629
+ default_length_value = content_type_defaults[content_type_choices[0]]
630
+ length_param_input = gr.Number(
631
+ label=default_length_label, value=default_length_value, minimum=1, step=1
632
+ )
633
+ with gr.Row():
634
+ model_input_content = gr.Dropdown(label="Model", choices=model_choices, value=default_model)
635
+
636
+ # Button to trigger bulk generation
637
+ generate_content_button = gr.Button("Generate Bulk Content")
638
+
639
+ # --- Event Listeners ---
640
+ # Listener to update length param UI
641
+ content_type_input.change(
642
+ fn=update_length_param_ui,
643
+ inputs=content_type_input,
644
+ outputs=length_param_input
645
+ )
646
+ # Listener for topic generation button
647
+ generate_topics_button.click(
648
+ fn=generate_topics_ui,
649
+ inputs=[ # Pass necessary inputs for topic generation
650
+ num_topics_input, model_input_content, # Use this tab's model selector
651
+ temperature_slider, top_p_slider, max_tokens_slider
652
+ ],
653
+ outputs=topic_input_content # Output generated topics to the textbox
654
+ )
655
+ # Listener for main generation button
656
+ generate_content_button.click(
657
+ fn=run_bulk_content_generation_and_prepare_json, # Use the new bulk wrapper
658
+ inputs=[
659
+ topic_input_content, content_type_input, length_param_input,
660
+ model_input_content,
661
+ temperature_slider, top_p_slider, max_tokens_slider
662
+ ],
663
+ outputs=[output_content, download_file_content]
664
+ )
665
+
666
 
667
  # Launch the Gradio app
668
  if __name__ == "__main__":
669
  print("Launching Gradio App...")
670
  print("Make sure the OPENROUTER_API_KEY environment variable is set.")
671
  # Use share=True for temporary public link if running locally and need to test
672
+ demo.launch() # share=True
673
+