Spaces:

gauravgmi
/

story-generation

Sleeping

File size: 26,157 Bytes

# app5.py – Interactive Bedtime Stories with Poster 🖼️ and TTS 🔊
# ------------------------------------------------------------------
# After the final scene (Scene 3) a "Display Poster" button generates a
# DALL·E-3 poster summarising the whole story, and you can listen to the
# latest scene via OpenAI TTS (synchronous – no asyncio required).
# ------------------------------------------------------------------

from __future__ import annotations

import os
import re
import textwrap
import tempfile
from pathlib import Path
from typing import List
import base64
import hashlib

import openai
import gradio as gr
from openai import OpenAI, OpenAIError

# ---------- CONFIG ---------------------------------------------------------
# Configure the module-level `openai` client (used by `_chat` and
# `_generate_poster`). `os.getenv` yields None when the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Chat model shared by scene writing, the judge and the "learn something" helpers.
MODEL = "gpt-4o-mini"
TTS_MODEL = "tts-1"              # correct OpenAI TTS model
# Sampling temperature for scene generation in `_chat`; the judge and fact
# helpers pass their own values.
TEMPERATURE = 0.4

# Voice options for narration
# Maps the voice id sent to the TTS API -> label shown in the narrator dropdown.
VOICE_OPTIONS = {
    "fable": "👨 Dad (Default)",
    "shimmer": "👩 Mom", 
    "nova": "👧 Sister",
    "onyx": "👴 Grandad"
}
DEFAULT_VOICE = "fable"

# TTS narration instructions for bedtime story atmosphere
# NOTE(review): not referenced by any TTS call in the visible part of this
# file (those pass only model / voice / input / speed) — confirm whether it
# is meant to be sent to the API.
TTS_INSTRUCTIONS = (
    "Speak slowly and softly, like a bedtime storyteller. "
    "Put a tiny pause after each sentence. "
    "Smile in your voice; sound friendly and reassuring. "
    "Keep overall volume low so it won't startle a sleepy child."
)

# Seven core genres
# Offered verbatim in the category dropdown and inserted into the scene prompt.
CATEGORIES = [
    "Animal Adventures",
    "Fantasy & Magic",
    "Friendship & Emotional Growth",
    "Mystery & Problem-Solving",
    "Humor & Silly Situations",
    "Science & Space Exploration",
    "Values & Morals (Fables)",
]
# Pre-selected dropdown value: the first genre in the list.
DEFAULT_CATEGORY = CATEGORIES[0]

# ---------- PROMPT TEMPLATES ----------------------------------------------
# Prompt for writing one scene. Filled with `str.format` by `start_story` and
# `choose`; placeholders: {scene_no}, {category}, {idea}, {story_so_far},
# {last_choice} ("N/A" marks the opening scene).
SCENE_TEMPLATE = '''
You are a children's storyteller. Write **SCENE {scene_no}/3** of an
age-5-to-10 bedtime story (≈ 150 words).

**Category:** {category}
**Child's idea:** "{idea}"
👉 *Work the idea into the first two sentences.*

### Story-Arc Requirements
- **Scene 1** – introduce the main character and their WANT/PROBLEM.
- **Scene 2** – raise the stakes; a challenge appears.
- **Scene 3** – climax and satisfying resolution. No numbered choices.

### Style Rules
1. Use vivid language and **relevant emojis** (😀🐉🍪🌟🚀 …).
2. Keep sentences short and clear.
3. Leave a blank line between paragraphs.
4. **Scenes 1 & 2:** end with *exactly two* **bold** numbered choices ("1." & "2.").
5. **Scene 3:** wrap up the tale (no choices). Do **not** write "The end." before Scene 3.
6. Each scene should clearly advance the arc.

Story so far:
"""{story_so_far}"""

`last_choice` = "{last_choice}"
If `last_choice` == "N/A" this is the opening scene, otherwise nod to the child's choice in one friendly sentence before continuing.
'''

# Prompt for rewriting the current scene from user feedback. Filled by
# `apply_feedback`; placeholders: {scene_no}, {feedback}, {original_scene}.
# NOTE(review): the text refers to "the style rules", but this prompt is sent
# on its own as a single user message, so the rules from SCENE_TEMPLATE are
# not part of the request — confirm whether they should be repeated here.
REVISION_TEMPLATE = '''
You previously wrote SCENE {scene_no}/3 …

Rewrite the scene so it satisfies the feedback below. **Change at least two sentences visibly** and keep to the style rules (including **bold** choice text).

Feedback: "{feedback}"

Original scene:
"""{original_scene}"""
'''

# ---------- LLM CORE -------------------------------------------------------

def _chat(prompt: str) -> str:
    """Send *prompt* as a single user message and return the model's reply.

    The request goes through the module-level ``openai`` client using
    ``MODEL`` and ``TEMPERATURE`` and is capped at 600 completion tokens.

    Args:
        prompt: Fully formatted prompt text.

    Returns:
        The reply with surrounding whitespace removed, or ``""`` when the
        response carries no text content.

    Raises:
        Whatever the OpenAI SDK raises for a failed request; nothing is
        caught here.
    """
    resp = openai.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=TEMPERATURE,
        max_tokens=600,
    )
    # `message.content` is optional in the SDK's response model; guard so a
    # content-less reply yields "" instead of an AttributeError on `.strip()`.
    content = resp.choices[0].message.content
    return (content or "").strip()

# ---------- TTS (TEXT-TO-SPEECH) ------------------------------------------

# Explicit SDK client shared by the TTS, judge and "learn something" code paths.
# It is constructed at import time, before the `__main__` guard runs.
_client = OpenAI()                       # uses OPENAI_API_KEY env-var
# In-memory cache filled by `_generate_audio`; nothing in the visible code
# ever evicts entries.
_audio_cache: dict[str, str] = {}        # md5(clean_text) ➜ data-URL

def _clean_for_tts(raw: str) -> str:
    """Remove markdown markers and numbered options; truncate at 4096 chars."""
    no_md = re.sub(r"[*_`#🌟]", "", raw)
    no_opts = "\n".join(
        ln for ln in no_md.splitlines() if not ln.strip().startswith(("1.", "2."))
    )
    return no_opts[:4096]


def _generate_audio(text: str) -> str:
    """Return a base-64 data-URL (audio/mp3) for Gradio's <audio> component.

    The text is cleaned with ``_clean_for_tts`` and synthesised with
    ``DEFAULT_VOICE`` at speed 0.9. Results are memoised in ``_audio_cache``
    under the MD5 of the cleaned text, so repeated requests for the same text
    do not hit the API again.

    Args:
        text: Raw scene text (markdown allowed).

    Returns:
        A ``data:audio/mp3;base64,...`` URL, or ``""`` when the OpenAI call
        fails (the error is printed so the UI stays responsive).
    """
    clean = _clean_for_tts(text)
    # MD5 is used purely as a cache key, not for security.
    h = hashlib.md5(clean.encode()).hexdigest()
    cached = _audio_cache.get(h)
    if cached is not None:
        return cached

    # Reserve a path for the streamed mp3; the handle is closed immediately so
    # the SDK can write to the path itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp_path = Path(tmp.name)

    try:
        # 1. Stream TTS to the temporary mp3 file --------------------------
        with _client.audio.speech.with_streaming_response.create(
            model=TTS_MODEL,
            voice=DEFAULT_VOICE,  # Use default voice for the cached version
            input=clean,
            speed=0.9  # Slightly slower for bedtime stories
        ) as resp:
            resp.stream_to_file(tmp_path)

        # 2. Read the audio back -------------------------------------------
        mp3_bytes = tmp_path.read_bytes()
    except OpenAIError as e:
        print("[TTS] error:", e)
        return ""  # silent failure keeps UI responsive
    finally:
        # Always remove the temp file – previously it was left behind
        # whenever the TTS request raised.
        tmp_path.unlink(missing_ok=True)

    # 3. Encode and cache ---------------------------------------------------
    data_url = "data:audio/mp3;base64," + base64.b64encode(mp3_bytes).decode()
    _audio_cache[h] = data_url
    return data_url

# ---------- IMAGE GENERATION ----------------------------------------------

def _generate_poster(scenes: List[str]) -> str:
    """Request one DALL·E-3 image for the story and return its URL.

    All scenes are joined with spaces and shortened to at most 200 characters
    (ellipsis placeholder) to form the subject of the image prompt; the rest
    of the prompt fixes the medium/style and forbids any text in the image.
    """
    summary = textwrap.shorten(" ".join(scenes), width=200, placeholder="…")

    poster_prompt = "".join(
        (
            f"A fantasy scene with {summary}\n\n",
            "Medium: Digital painting\n",
            "Style: Soft, dreamlike, no text\n\n",
            "Rule: Image only. Zero text. No words. No letters. No writing. No labels. No captions. Visual only.",
        )
    )

    request = {
        "model": "dall-e-3",
        "prompt": poster_prompt,
        "size": "1024x1024",
        "quality": "standard",
        "n": 1,
        "response_format": "url",
    }
    result = openai.images.generate(**request)
    first_image = result.data[0]
    return first_image.url

# ---------- LLM JUDGE MODULE ----------------------------------------------

# Judge evaluation prompt
# User-message template for `judge_story`; the only placeholder is {story},
# which receives all scenes joined by blank lines.
JUDGE_PROMPT = """
You are an expert in children's literature and child development. Evaluate this bedtime story for ages 5-10.

Story to evaluate:
\"\"\"
{story}
\"\"\"

Please evaluate the story on these 3 key criteria (score 1-10 for each):

1. **Age Appropriateness**: Is the vocabulary, themes, and content suitable for ages 5-10?
2. **Ease of Reading**: How easy is it for children to follow and understand?
3. **Clarity of Moral/Takeaway**: Is there a clear, positive lesson or message?

For each criterion:
- Give a score (1-10)
- Provide a brief explanation (1-2 sentences)

End with:
- Overall Score (average of the 3 scores)
- Final Verdict (2-3 sentences summarizing the story's quality as a bedtime story)

Format your response clearly with scores and explanations.
"""

def judge_story(state: dict):
    """Grade the finished story with the chat model and show the report.

    Returns a single Gradio update for the judge output: a warning when fewer
    than three scenes exist, the formatted evaluation on success, or an error
    notice when anything goes wrong during the request.
    """
    scenes = state.get("scenes")
    if not scenes or len(scenes) < 3:
        return gr.update(value="⚠️ Please complete the story first!", visible=True)

    # Whole story, scenes separated by blank lines
    story_text = "\n\n".join(state["scenes"])

    try:
        judge_messages = [
            {"role": "system", "content": "You are an expert evaluator of children's bedtime stories."},
            {"role": "user", "content": JUDGE_PROMPT.format(story=story_text)},
        ]
        reply = _client.chat.completions.create(
            model=MODEL,
            messages=judge_messages,
            temperature=0.3,  # low temperature keeps the scoring consistent
            max_tokens=500,
        )
        evaluation = reply.choices[0].message.content.strip()

        # Wrap the raw evaluation in a small markdown report
        report = f"""## 📊 Story Evaluation Report

{evaluation}

---
*Evaluated for ages 5-10 bedtime story standards*
"""
        return gr.update(value=report, visible=True)
    except Exception as e:
        print(f"[Judge Error]: {e}")
        return gr.update(value="⚠️ Error evaluating story. Please try again.", visible=True)

# ---------- LEARN SOMETHING MODULE -----------------------------------------

import collections
import string

def extract_learning_term(story: str) -> str:
    """Pick one word from *story* that a child could learn something about.

    Heuristic pass (no network):
      * tokens are runs of 4+ letters/apostrophes;
      * words ending in "ly" or "ed" are skipped (adverbs / past tense);
      * words capitalised in more than half of their occurrences are skipped
        as likely proper nouns;
      * among the remaining words the rarest wins, ties broken by greater
        length and then alphabetically, so the same story always yields the
        same term.

    If no candidate survives, the chat model is asked for a single word.

    Args:
        story: Story text to mine.

    Returns:
        A lower-case word from the story, the model's suggestion, or
        ``"rainbow"`` when the model returns nothing usable.
    """
    # 1️⃣  Tokenise (letters/apostrophes only), keep case info
    tokens = re.findall(r"\b[A-Za-z']{4,}\b", story)

    # 2️⃣  Frequency tables built in one pass each (total and capitalised uses)
    counts = collections.Counter(t.lower() for t in tokens)
    cap_counts = collections.Counter(t.lower() for t in tokens if t[0].isupper())

    # 3️⃣  Candidate filter:
    #     - not an adverb (-ly) or past-tense (-ed)
    #     - capitalised in at most 50 % of uses (otherwise likely a name)
    cands = [
        tok
        for tok, total in counts.items()
        if not tok.endswith(("ly", "ed")) and cap_counts[tok] / total <= 0.5
    ]

    if cands:
        # Rarest first, then longest; the word itself is the final tie-breaker
        # so the result no longer depends on set/hash iteration order.
        return min(cands, key=lambda w: (counts[w], -len(w), w))

    # 4️⃣  Fallback mini-LLM: ask for ONE teachable term
    prompt = (
        "From the story below, name ONE interesting action, object, or animal "
        "that a 7-year-old could learn about (just the single word, no quotes):\n"
        f"\"\"\"\n{story[:1200]}\n\"\"\""
    )
    resp = _client.chat.completions.create(
        model=MODEL,
        temperature=0,
        max_tokens=3,
        messages=[{"role": "user", "content": prompt}],
    )
    # `content` may be None; treat that like an empty answer.
    content = resp.choices[0].message.content or ""
    term = content.strip(string.punctuation + " \"'")
    return term or "rainbow"

# Keep the old function name for compatibility
def extract_key_noun(story: str) -> str:
    """Backward-compatible alias: delegate to `extract_learning_term`.

    Takes the story text and returns whatever `extract_learning_term`
    returns for it, unchanged.
    """
    return extract_learning_term(story)

# Helper: Get child-friendly fact
# Prompt template; {term} is the word to explain.
_FACT_PROMPT = """Explain "{term}" to a 7-year-old in **three short lines**.
Use friendly language and finish with a question to make them curious."""

def fetch_child_fact(term: str) -> str:
    """Ask the chat model for a short, kid-friendly explanation of *term*.

    Args:
        term: Word to explain; inserted into ``_FACT_PROMPT``.

    Returns:
        The model's reply with surrounding whitespace removed, or ``""`` when
        the response carries no text content.

    Raises:
        Whatever the OpenAI SDK raises for a failed request; nothing is
        caught here.
    """
    prompt = _FACT_PROMPT.format(term=term)
    resp = _client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        max_tokens=120,
    )
    # `message.content` is optional in the SDK; avoid AttributeError on None.
    content = resp.choices[0].message.content
    return (content or "").strip()

# Callback for Learn Something button
def learn_something(state: dict):
    """Speak a fun fact about a word taken from the story so far.

    Steps: join all scenes, pick a term with ``extract_key_noun``, fetch a
    child-friendly fact with ``fetch_child_fact`` and synthesise it with the
    "shimmer" voice into a temporary mp3.

    Args:
        state: Session state; only ``state["scenes"]`` is read.

    Returns:
        A Gradio update for the fun-fact audio player: the mp3 path (visible)
        on success, or a hidden player when there is no story yet or any step
        fails (the error is printed).
    """
    scenes = state.get("scenes")
    if not scenes:
        return gr.update(visible=False)

    tmp_path = None
    try:
        # The chat calls are inside the try as well, so an API failure hides
        # the player instead of crashing the callback.
        story_text = "\n\n".join(scenes)
        term = extract_key_noun(story_text)
        fact = fetch_child_fact(term)

        # Generate TTS for the fact using Mom's voice
        response = _client.audio.speech.create(
            model=TTS_MODEL,
            voice="shimmer",  # Mom's voice for educational content
            input=fact,
            speed=0.95  # Slightly slower for clarity
        )

        # The file is created only once audio is available; delete=False
        # because Gradio serves the file after this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tmp_path = tmp.name
            for chunk in response.iter_bytes(1024):
                tmp.write(chunk)

        return gr.update(value=tmp_path, visible=True)
    except Exception as e:
        print(f"[Learn TTS Error]: {e}")
        if tmp_path is not None:
            # Do not leave a partial mp3 behind.
            Path(tmp_path).unlink(missing_ok=True)
        return gr.update(visible=False)

# ---------- UTILS ----------------------------------------------------------

def _strip_early_ending(text: str, scene_no: int) -> str:
    if scene_no < 3:
        text = re.sub(r"^\s*(The\s+end\.?)(\s*)$", "", text, flags=re.I | re.M)
    return text


def _extract_options(scene_text: str) -> List[str]:
    """Return exactly two **clean** numbered options, or fall back."""
    opts: List[str] = []
    for ln in scene_text.splitlines():
        stripped = ln.strip()
        clean = re.sub(r"[*_`]+", "", stripped)
        if clean.startswith("1.") or clean.startswith("2."):
            opts.append(clean)
    if len(opts) == 2:
        return opts
    return [
        "1. Continue bravely 🌟",
        "2. Take a quiet turn 💤",
    ]

# ---------- STATE STRUCTURE ------------------------------------------------
# state = {scene_no:int, scenes:List[str], idea:str, category:str}

# ---------- CALLBACKS ------------------------------------------------------

def start_story(idea: str, category: str, state: dict):
    """Start a new story from the child's idea and render Scene 1.

    Returns 15 values, in this order: updates for the story text, choice
    button 1, choice button 2, feedback box, feedback button, poster image,
    poster button, voice dropdown, narrate button, audio player, learn
    button, judge button, judge output and learn audio — followed by the
    state dict. The click wiring must list its outputs in the same order.

    With an empty idea only a reminder is shown and every other component is
    hidden; the state is left as it was.
    """
    idea = idea.strip()
    if not idea:
        hidden = gr.update(visible=False)
        return (
            gr.update(value="🌜 *Please type a story idea first!*"),
            *([hidden] * 11),
            gr.update(value="", visible=False),  # judge_output - clear
            gr.update(visible=False),  # learn_audio - hide
            state,
        )

    opening_prompt = SCENE_TEMPLATE.format(
        scene_no=1,
        story_so_far="",
        last_choice="N/A",
        idea=idea,
        category=category,
    )
    scene1 = _strip_early_ending(_chat(opening_prompt), 1)

    # Replace whatever story was in progress with the fresh one
    state.clear()
    state["scene_no"] = 1
    state["scenes"] = [scene1]
    state["idea"] = idea
    state["category"] = category

    first_choice, second_choice = _extract_options(scene1)
    # Insertion order below is the output order expected by the UI wiring.
    updates = {
        "story_md": gr.update(value=scene1),
        "btn1": gr.update(value=first_choice, visible=True),
        "btn2": gr.update(value=second_choice, visible=True),
        "fb_box": gr.update(visible=True),
        "fb_btn": gr.update(visible=True),
        "poster_img": gr.update(visible=False),
        "poster_btn": gr.update(visible=False),
        "voice_dropdown": gr.update(visible=True),
        "narrate_btn": gr.update(visible=True),
        "audio_player": gr.update(value=None, visible=False),  # clear and hide
        "learn_btn": gr.update(visible=True),
        "judge_btn": gr.update(visible=False),
        "judge_output": gr.update(value="", visible=False),  # clear
        "learn_audio": gr.update(visible=False),
    }
    return (*updates.values(), state)


def _choose_view(state: dict):
    """Build the 13 outputs (12 component updates + state) for the current story."""
    scenes = state["scenes"]
    story = "\n\n".join(scenes)
    finished = state["scene_no"] >= 3

    if finished:
        story += "\n\n🌟 **The End!** 🌟"
        choice_1 = gr.update(visible=False)
        choice_2 = gr.update(visible=False)
    else:
        opt1, opt2 = _extract_options(scenes[-1])
        choice_1 = gr.update(value=opt1, visible=True)
        choice_2 = gr.update(value=opt2, visible=True)

    return (
        gr.update(value=story),
        choice_1,
        choice_2,
        gr.update(visible=not finished),       # feedback box
        gr.update(visible=not finished),       # feedback btn
        gr.update(visible=False),              # poster img (hidden until click)
        gr.update(visible=finished),           # poster btn
        gr.update(visible=True),               # voice dropdown
        gr.update(visible=True),               # narrate btn
        gr.update(value=None, visible=False),  # audio - clear and hide
        gr.update(visible=True),               # learn btn
        gr.update(visible=finished),           # judge btn - only after the story ends
        state,
    )


def choose(option_text: str, state: dict):
    """Continue the story with the option the child picked.

    Generates the next scene (2 or 3) from ``SCENE_TEMPLATE``, appends it to
    ``state["scenes"]`` and advances ``state["scene_no"]``.

    Args:
        option_text: Label of the clicked choice button.
        state: Session state holding ``scene_no``, ``scenes``, ``idea`` and
            ``category``.

    Returns:
        13 values: updates for story text, both choice buttons, feedback
        box/button, poster image/button, voice dropdown, narrate button,
        audio player, learn button and judge button — followed by the state.
    """
    if not state.get("scenes"):
        # No story in progress (e.g. a stale click after reset): change nothing.
        return (*(gr.update() for _ in range(12)), state)
    if state.get("scene_no", 0) >= 3:
        # A repeated click after the final scene must not request "SCENE 4/3";
        # just re-render the finished story.
        return _choose_view(state)

    scene_no = state["scene_no"] + 1
    new_scene = _chat(
        SCENE_TEMPLATE.format(
            scene_no=scene_no,
            story_so_far="\n\n".join(state["scenes"]),
            last_choice=option_text,
            idea=state["idea"],
            category=state["category"],
        )
    )
    new_scene = _strip_early_ending(new_scene, scene_no)
    state["scenes"].append(new_scene)
    state["scene_no"] = scene_no

    return _choose_view(state)


def apply_feedback(feedback: str, state: dict):
    """Rewrite the current scene according to the user's feedback.

    The latest scene (index ``scene_no - 1``) is sent to the model with
    ``REVISION_TEMPLATE`` and replaced in ``state["scenes"]``.

    Args:
        feedback: Free-text change request from the feedback box.
        state: Session state holding ``scene_no`` and ``scenes``.

    Returns:
        13 values: updates for story text, both choice buttons, feedback
        box/button, poster image/button, voice dropdown, narrate button,
        audio player, learn button and judge button — followed by the state.
        With empty feedback only a warning is appended below the story and
        every other component is left exactly as it is.
    """
    feedback = feedback.strip()
    if not feedback:
        # Previously this replaced the story with the warning and hid every
        # control, stranding the user mid-story. Keep the story on screen and
        # send no-op updates for the remaining 11 components instead.
        warning = "⚠️ Please type feedback."
        story = "\n\n".join(state.get("scenes", []))
        return (
            gr.update(value=f"{story}\n\n{warning}" if story else warning),
            *(gr.update() for _ in range(11)),
            state,
        )

    idx = state["scene_no"] - 1
    revised = _chat(
        REVISION_TEMPLATE.format(
            scene_no=state["scene_no"],
            feedback=feedback,
            original_scene=state["scenes"][idx],
        )
    )
    revised = _strip_early_ending(revised, state["scene_no"])
    state["scenes"][idx] = revised

    display = "\n\n".join(state["scenes"])
    if state["scene_no"] < 3:
        opt1, opt2 = _extract_options(revised)
        return (
            gr.update(value=display),
            gr.update(value=opt1, visible=True),
            gr.update(value=opt2, visible=True),
            gr.update(value="", visible=True),     # feedback box - cleared for the next request
            gr.update(visible=True),               # feedback btn
            gr.update(visible=False),              # poster img
            gr.update(visible=False),              # poster btn
            gr.update(visible=True),               # voice dropdown
            gr.update(visible=True),               # narrate btn
            gr.update(value=None, visible=False),  # audio - clear and hide
            gr.update(visible=True),               # learn btn
            gr.update(visible=False),              # judge btn
            state,
        )

    ending = display + "\n\n🌟 **The End!** 🌟"
    return (
        gr.update(value=ending),
        gr.update(visible=False),              # choice button 1
        gr.update(visible=False),              # choice button 2
        gr.update(visible=False),              # feedback box
        gr.update(visible=False),              # feedback btn
        gr.update(visible=False),              # poster img
        gr.update(visible=True),               # poster btn
        gr.update(visible=True),               # voice dropdown
        gr.update(visible=True),               # narrate btn
        gr.update(value=None, visible=False),  # audio - clear and hide
        gr.update(visible=True),               # learn btn
        gr.update(visible=True),               # judge btn
        state,
    )


def generate_poster_clicked(state: dict):
    """Generate and show the story poster once all three scenes exist.

    Args:
        state: Session state; only ``state["scenes"]`` is read.

    Returns:
        Two Gradio updates ``(poster image, poster button)``. On success the
        image is shown with the generated URL and the button is hidden. If
        the story is incomplete or the image request fails, the image stays
        hidden and the button stays visible so the user can retry.
    """
    scenes = state.get("scenes")
    if not scenes or len(scenes) < 3:
        return gr.update(visible=False), gr.update(visible=True)

    try:
        poster_url = _generate_poster(scenes)
    except OpenAIError as e:
        # Same print-and-continue handling as the TTS helper: an API failure
        # should not surface as an unhandled callback error.
        print("[Poster] error:", e)
        return gr.update(visible=False), gr.update(visible=True)

    return gr.update(value=poster_url, visible=True), gr.update(visible=False)


def narrate_scene(voice_choice: str, state: dict):
    """Synthesise the latest scene with the chosen narrator voice.

    Args:
        voice_choice: Voice id from the narrator dropdown; falls back to
            ``DEFAULT_VOICE`` when empty.
        state: Session state; ``scenes`` and ``scene_no`` are read.

    Returns:
        A Gradio update for the audio player: the path of a temporary mp3
        (visible) on success, or a hidden player when there is nothing to
        narrate or the TTS request fails (the error is printed).
    """
    if not state.get("scenes") or state.get("scene_no", 0) <= 0:
        return gr.update(visible=False)

    # Only the most recent scene is narrated.
    clean_text = _clean_for_tts(state["scenes"][-1])

    # Use selected voice or default
    selected_voice = voice_choice if voice_choice else DEFAULT_VOICE

    # Set speed based on voice
    voice_speeds = {
        "fable": 0.9,     # Dad - slightly slower
        "shimmer": 0.9,   # Mom - slightly slower
        "nova": 1.1,      # Sister - slightly faster
        "onyx": 0.8       # Grandad - slowest
    }
    speed = voice_speeds.get(selected_voice, 0.9)

    tmp_path = None
    try:
        response = _client.audio.speech.create(
            model=TTS_MODEL,
            voice=selected_voice,
            input=clean_text,
            speed=speed
        )

        # The file is created only once audio is available; delete=False
        # because Gradio serves the file after this function returns.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            tmp_path = tmp.name
            for chunk in response.iter_bytes(1024):
                tmp.write(chunk)

        # Return file path instead of data URL
        return gr.update(value=tmp_path, visible=True)
    except Exception as e:
        print(f"[TTS] Error: {e}")
        if tmp_path is not None:
            # Do not leave a partial mp3 behind.
            Path(tmp_path).unlink(missing_ok=True)
        return gr.update(visible=False)


def reset(state: dict):
    """Discard the current story and return the UI to its initial look.

    Empties the state dict in place and returns 14 values: a blank story
    text, eleven "hide" updates for the remaining controls, a cleared and
    hidden judge output, and the state.
    """
    state.clear()
    hide_controls = [gr.update(visible=False) for _ in range(11)]
    blank_story = gr.update(value="")
    cleared_judge = gr.update(value="", visible=False)
    return (blank_story, *hide_controls, cleared_judge, state)

# ---------- CUSTOM CSS -----------------------------------------------------
# Stylesheet passed to `gr.Blocks(css=...)`. `.story-text` is the class given
# to the story Markdown component and `.choice-btn` the class given to the two
# choice buttons (via `elem_classes`).
CUSTOM_CSS = """
.story-text {
  font-size: 20px;
  line-height: 1.6;
}
.story-text strong {
  font-weight: 700;
}
.choice-btn {
  font-size: 18px !important;
  font-weight: 600;
}
"""

# ---------- UI -------------------------------------------------------------
with gr.Blocks(title="🌙 Interactive Bedtime Stories", css=CUSTOM_CSS) as demo:
    # Layout, top to bottom: intro, idea/category/start row, story text,
    # choice buttons, feedback, poster, narration, fun fact, judge, reset.
    gr.Markdown(
        "## 🌙 **Interactive Bedtime Stories**\n"
        "Describe an adventure, pick a category, then guide the tale – and see a poster of your story!"
    )

    # Per-session story state: {scene_no, scenes, idea, category}
    state = gr.State({})

    with gr.Row():
        idea_box = gr.Textbox(
            label="✨ Your story idea",
            placeholder="e.g. A friendly dragon who loves cookies…",
            lines=1,
        )
        cat_menu = gr.Dropdown(choices=CATEGORIES, value=DEFAULT_CATEGORY, label="📚 Category")
        start_btn = gr.Button("🚀 Start Story", variant="primary")

    story_md = gr.Markdown(elem_classes="story-text")

    with gr.Row():
        btn1 = gr.Button(elem_classes="choice-btn", visible=False)
        btn2 = gr.Button(elem_classes="choice-btn", visible=False)

    fb_box = gr.Textbox(label="📝 Request a change", lines=1, visible=False)
    fb_btn = gr.Button("🔄 Apply Feedback", visible=False)

    poster_img = gr.Image(label="🎨 Story Poster", visible=False)
    poster_btn = gr.Button("🎨 Display Poster", variant="primary", visible=False)

    with gr.Row():
        voice_dropdown = gr.Dropdown(
            # (label shown, value returned) pairs
            choices=[(label, voice) for voice, label in VOICE_OPTIONS.items()],
            value=DEFAULT_VOICE,
            label="Choose Narrator",
            visible=False,
            scale=1
        )
        narrate_btn = gr.Button("🔊 Listen to Scene", variant="secondary", visible=False, scale=2)

    audio_player = gr.Audio(label="Story Narration", visible=False, autoplay=True)

    # Learn Something feature
    with gr.Row():
        learn_btn = gr.Button("🐾 Learn Something", variant="secondary", visible=False)
        learn_audio = gr.Audio(label="Fun Fact", visible=False, autoplay=True)

    # Judge feature
    judge_btn = gr.Button("⚖️ Judge Story", variant="secondary", visible=False)
    judge_output = gr.Markdown(visible=False)

    reset_btn = gr.Button("🔄 New Story", visible=False)

    # ---------- Wiring ----------
    # The twelve components every story callback updates, in the order the
    # callbacks return them. Each listener appends its own trailing outputs.
    _scene_outputs = [
        story_md,
        btn1,
        btn2,
        fb_box,
        fb_btn,
        poster_img,
        poster_btn,
        voice_dropdown,
        narrate_btn,
        audio_player,
        learn_btn,
        judge_btn,
    ]

    # `start_story` returns 15 values (…, judge_output, learn_audio, state).
    # `learn_audio` was missing from this list, so the return values did not
    # line up with the outputs.
    start_btn.click(
        start_story,
        inputs=[idea_box, cat_menu, state],
        outputs=[*_scene_outputs, judge_output, learn_audio, state],
    )

    # Each choice button passes its own label as the chosen option.
    btn1.click(
        choose,
        inputs=[btn1, state],
        outputs=[*_scene_outputs, state],
    )
    btn2.click(
        choose,
        inputs=[btn2, state],
        outputs=[*_scene_outputs, state],
    )

    fb_btn.click(
        apply_feedback,
        inputs=[fb_box, state],
        outputs=[*_scene_outputs, state],
    )

    poster_btn.click(
        generate_poster_clicked,
        inputs=[state],
        outputs=[poster_img, poster_btn],
    )

    narrate_btn.click(
        narrate_scene,
        inputs=[voice_dropdown, state],
        outputs=[audio_player],
    )

    learn_btn.click(
        learn_something,
        inputs=[state],
        outputs=[learn_audio],
    )

    judge_btn.click(
        judge_story,
        inputs=[state],
        outputs=[judge_output],
    )

    reset_btn.click(
        reset,
        inputs=[state],
        outputs=[*_scene_outputs, judge_output, state],
    )
    # `reset` has no output slot for the fun-fact player, so clear and hide it
    # with a separate listener; otherwise the old fact stays on screen.
    reset_btn.click(
        lambda: gr.update(value=None, visible=False),
        outputs=[learn_audio],
    )

    # Show the "New Story" button only while there is story text on screen.
    story_md.change(
        lambda x: gr.update(visible=bool(x)),
        inputs=[story_md],
        outputs=[reset_btn],
    )

# --------------------------------------------------------------------------
if __name__ == "__main__":
    # Refuse to launch without an API key (unset or empty string).
    # NOTE(review): `_client = OpenAI()` runs at import time, before this
    # check; presumably the SDK raises its own error first when the variable
    # is entirely unset — confirm.
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY not set")

    # Start the Gradio server with default settings.
    demo.launch()