# khanpm / app.py
# morethanair — Fix: add Hugging Face Spaces config header (commit f32c9fc)
import streamlit as st
import os
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from typing import List, Dict
import re # For parsing timestamp and extracting video ID
import streamlit.components.v1 as components # For embedding HTML
from openai import OpenAI # Import OpenAI library
import logging
# Setup logging: module-level logger shared by all helpers below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# --- Helper Functions (Existing: parse_timestamp_to_seconds, get_youtube_video_id, add_timestamp_to_youtube_url, generate_youtube_embed_html) ---
def parse_timestamp_to_seconds(timestamp: str) -> int | None:
"""HH:MM:SS ๋˜๋Š” HH:MM:SS.ms ํ˜•์‹์˜ ํƒ€์ž„์Šคํƒฌํ”„๋ฅผ ์ดˆ ๋‹จ์œ„๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค."""
if not isinstance(timestamp, str):
return None
# Remove milliseconds part if present
timestamp_no_ms = timestamp.split('.')[0]
parts = timestamp_no_ms.split(':')
try:
if len(parts) == 3:
h, m, s = map(int, parts)
return h * 3600 + m * 60 + s
elif len(parts) == 2:
m, s = map(int, parts)
return m * 60 + s
elif len(parts) == 1:
return int(parts[0])
else:
return None
except ValueError:
return None
def get_youtube_video_id(url: str) -> str | None:
"""YouTube URL์—์„œ ๋น„๋””์˜ค ID๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค."""
if not isinstance(url, str):
return None
# Standard YouTube URLs (youtube.com/watch?v=...), shortened URLs (youtu.be/...), etc.
match = re.search(r"(?:v=|/|youtu\.be/|embed/|shorts/)([0-9A-Za-z_-]{11})", url)
return match.group(1) if match else None
def add_timestamp_to_youtube_url(youtube_url: str, timestamp: str) -> str:
    """Return youtube_url with a t=<seconds>s query parameter appended.

    The timestamp is parsed with parse_timestamp_to_seconds(); if it is
    invalid or the URL is empty, the URL is returned unchanged. Any
    existing t= parameter is stripped first so it is not duplicated.
    """
    seconds = parse_timestamp_to_seconds(timestamp)
    if seconds is None or not youtube_url:
        return youtube_url  # Return original URL if timestamp is invalid or URL is empty
    # Strip an existing t= parameter (e.g. "&t=30s") before appending the new one.
    cleaned_url = re.sub(r'[?&]t=\d+s?', '', youtube_url)
    # Fix: the separator is computed once, from the *cleaned* URL — the original
    # code computed it twice, with the first result immediately overwritten.
    separator = '&' if '?' in cleaned_url else '?'
    return f"{cleaned_url}{separator}t={seconds}s"
def generate_youtube_embed_html(youtube_url: str, timestamp: str) -> str | None:
    """Build responsive YouTube embed HTML (fixed 800px width, 16:9 aspect).

    Returns None when no video ID can be extracted from the URL. The start
    position is omitted entirely when the timestamp cannot be parsed.
    """
    video_id = get_youtube_video_id(youtube_url)
    if not video_id:
        logger.warning(f"Could not extract video ID from URL: {youtube_url}")
        return None  # Cannot generate embed code without video ID
    start_seconds = parse_timestamp_to_seconds(timestamp)
    # Fix: include the trailing '&' in start_param so an absent start time no
    # longer produces a dangling "?&autoplay=..." query string.
    start_param = f"start={start_seconds}&" if start_seconds is not None else ""
    # Use aspect-ratio approach with fixed width 800px.
    return f'''
    <div style="position: relative; width: 800px; padding-bottom: 450px; /* 800px * 9 / 16 = 450px */ height: 0; overflow: hidden;">
        <iframe
            src="https://www.youtube.com/embed/{video_id}?{start_param}autoplay=0&rel=0"
            frameborder="0"
            allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
            referrerpolicy="strict-origin-when-cross-origin"
            allowfullscreen
            style="position: absolute; top: 0; left: 0; width: 100%; height: 100%;">
        </iframe>
    </div>
    '''
# --- Configuration ---
# NOTE(security): API keys must come from environment variables only. The
# previous version committed real Pinecone and OpenAI keys as literals in this
# file — those keys are compromised and must be rotated immediately.
# Pinecone settings
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")  # required; validated in init_pinecone()
PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east-1")
INDEX_NAME = "video-embeddings"
EMBEDDING_MODEL = "jhgan/ko-sroberta-multitask"
# OpenAI settings
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # required; validated in init_openai_client()
# --- ๋ฆฌ์†Œ์Šค ๋กœ๋”ฉ (์บ์‹ฑ ํ™œ์šฉ) ---
@st.cache_resource
def init_pinecone():
    """Create (and cache) the Pinecone client; halt the Streamlit app on failure."""
    if not PINECONE_API_KEY:
        st.error("Pinecone API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ํ™˜๊ฒฝ ๋ณ€์ˆ˜๋ฅผ ํ™•์ธํ•˜์„ธ์š”.")
        st.stop()
    try:
        client = Pinecone(api_key=PINECONE_API_KEY)
        logger.info("Successfully connected to Pinecone.")
        return client
    except Exception as exc:
        st.error(f"Pinecone ์ดˆ๊ธฐํ™” ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {exc}")
        st.stop()
@st.cache_resource
def load_embedding_model():
    """Load and cache the Sentence Transformer embedding model.

    Fix: the original loaded a hard-coded local path "my_model" while the log
    message claimed EMBEDDING_MODEL was loaded; EMBEDDING_MODEL was otherwise
    unused. Load the configured model so code, config and logs agree.
    """
    try:
        model = SentenceTransformer(EMBEDDING_MODEL)
        logger.info(f"Successfully loaded embedding model: {EMBEDDING_MODEL}")
        return model
    except Exception as e:
        st.error(f"์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
        st.stop()
@st.cache_resource
def get_pinecone_index(_pc: Pinecone, index_name: str):
    """Return the named Pinecone index, verifying connectivity via a stats call."""
    try:
        idx = _pc.Index(index_name)
        # describe_index_stats doubles as a cheap connection check.
        stats = idx.describe_index_stats()
        logger.info(f"Successfully connected to Pinecone index '{index_name}'. Stats: {stats.get('total_vector_count', 'N/A')} vectors")
        return idx
    except Exception as exc:
        st.error(f"Pinecone ์ธ๋ฑ์Šค '{index_name}' ์—ฐ๊ฒฐ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {exc}. ์ธ๋ฑ์Šค๊ฐ€ ์กด์žฌํ•˜๊ณ  ํ™œ์„ฑ ์ƒํƒœ์ธ์ง€ ํ™•์ธํ•˜์„ธ์š”.")
        st.stop()
@st.cache_resource
def init_openai_client():
    """Create (and cache) the OpenAI client, smoke-testing credentials first."""
    if not OPENAI_API_KEY:
        st.error("OpenAI API ํ‚ค๊ฐ€ ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ํ™˜๊ฒฝ ๋ณ€์ˆ˜๋ฅผ ํ™•์ธํ•˜์„ธ์š”.")
        st.stop()
    try:
        client = OpenAI(api_key=OPENAI_API_KEY)
        client.models.list()  # connectivity / auth smoke test
    except Exception as exc:
        st.error(f"OpenAI ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™” ๋˜๋Š” ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {exc}")
        st.stop()
    else:
        logger.info("Successfully connected to OpenAI.")
        return client
# --- Search ---
def search(query: str, top_k: int = 5, _index=None, _model=None) -> List[Dict]:
    """Run a semantic search against the Pinecone index.

    Each result dict includes the title and original_text metadata so the
    caller can use them as LLM context. Returns [] on empty input, missing
    resources, or any search error.
    """
    if not query or _index is None or _model is None:
        return []
    try:
        embedding = _model.encode(query, convert_to_numpy=True).tolist()
        response = _index.query(vector=embedding, top_k=top_k, include_metadata=True)
        hits: List[Dict] = []
        for hit in response.get("matches", []):
            meta = hit.get("metadata", {})
            hits.append({
                "URL": meta.get("url", "N/A"),
                "ํƒ€์ž„์Šคํƒฌํ”„": meta.get("timestamp", "N/A"),
                "ํƒ€์ž…": meta.get("type", "N/A"),
                "์ œ๋ชฉ": meta.get("title", "N/A"),  # video title
                "์š”์•ฝ": meta.get("summary", "N/A"),
                "์›๋ณธํ…์ŠคํŠธ": meta.get("original_text", "N/A"),  # raw text used as answer context
                "์ ์ˆ˜": hit.get("score", 0.0),
            })
        logger.info(f"Pinecone search returned {len(hits)} results for query: '{query[:50]}...'")
        return hits
    except Exception as exc:
        st.error(f"Pinecone ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {exc}")
        logger.error(f"Error during Pinecone search: {exc}", exc_info=True)
        return []
# --- OpenAI answer generation ---
def generate_khan_answer(query: str, search_results: List[Dict], client: OpenAI) -> str:
    """Generate an answer in the "Khan" PM-mentor persona.

    Builds a context block from the Pinecone search results (including
    timestamped YouTube links the model is instructed to cite as Markdown
    links) and asks the chat model to answer. Returns a persona-consistent
    fallback string when there are no results or the API call fails.
    """
    if not search_results:
        # Return a persona-consistent message even when no results are found
        return "ํ˜„์žฌ ์งˆ๋ฌธ์— ๋Œ€ํ•ด ์ฐธ๊ณ ํ•  ๋งŒํ•œ ๊ด€๋ จ ์˜์ƒ์„ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์งˆ๋ฌธ์„ ์กฐ๊ธˆ ๋” ๋ช…ํ™•ํ•˜๊ฒŒ ํ•ด์ฃผ์‹œ๊ฑฐ๋‚˜ ๋‹ค๋ฅธ ๋ฐฉ์‹์œผ๋กœ ์งˆ๋ฌธํ•ด์ฃผ์‹œ๋ฉด ๋„์›€์ด ๋  ๊ฒƒ ๊ฐ™์Šต๋‹ˆ๋‹ค."
    # Build context string for OpenAI more robustly, including timestamped URL
    context_parts = []
    for i, r in enumerate(search_results):
        original_text_snippet = ""
        if r.get('์›๋ณธํ…์ŠคํŠธ'):
            # Only the first 200 characters of the transcript are sent as context.
            snippet = r['์›๋ณธํ…์ŠคํŠธ'][:200]
            original_text_snippet = f"\n(์›๋ณธ ๋‚ด์šฉ ์ผ๋ถ€: {snippet}...)"
        # Generate timestamped URL if possible
        timestamped_url_str = "N/A"
        url = r.get('URL', 'N/A')
        timestamp = r.get('ํƒ€์ž„์Šคํƒฌํ”„', 'N/A')
        is_youtube = url and isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
        has_valid_timestamp = timestamp and timestamp != 'N/A' and parse_timestamp_to_seconds(timestamp) is not None
        if is_youtube and has_valid_timestamp:
            try:
                timestamped_url_str = add_timestamp_to_youtube_url(url, timestamp)
            except Exception:
                timestamped_url_str = url  # Fallback to original URL on error
        elif url != "N/A":
            timestamped_url_str = url  # Use original URL if not YouTube/no timestamp
        context_parts.append(
            f"๊ด€๋ จ ์ •๋ณด {i+1}:\n"
            f"์ œ๋ชฉ: {r.get('์ œ๋ชฉ', 'N/A')}\n"
            f"์˜์ƒ URL (์›๋ณธ): {url}\n"
            f"ํƒ€์ž„์Šคํƒฌํ”„: {timestamp}\n"
            f"ํƒ€์ž„์Šคํƒฌํ”„ ์ ์šฉ URL: {timestamped_url_str}\n"  # Add the timestamped URL here
            f"๋‚ด์šฉ ํƒ€์ž…: {r.get('ํƒ€์ž…', 'N/A')}\n"
            f"์š”์•ฝ: {r.get('์š”์•ฝ', 'N/A')}"
            f"{original_text_snippet}"  # Append the snippet safely
        )
    context = "\n\n---\n\n".join(context_parts)  # Join the parts
    # System prompt (Korean, runtime text — do not alter): defines the Khan
    # persona, tone rules, and instructs Markdown links built from the
    # "ํƒ€์ž„์Šคํƒฌํ”„ ์ ์šฉ URL" field above.
    system_prompt = """๋„ˆ๋Š” ํ˜„์‹ค์ ์ธ ์กฐ์–ธ์„ ์ž˜ํ•˜๋Š” PM ๋ฉ˜ํ†  Khan์ด๋‹ค.
- ๋งํˆฌ๋Š” ๋‹จํ˜ธํ•˜์ง€๋งŒ ๊ณต๊ฐ๋ ฅ์ด ์žˆ๋‹ค. "~์ž…๋‹ˆ๋‹ค." ๋˜๋Š” "~์ฃ ."์™€ ๊ฐ™์ด ๋ช…ํ™•ํ•˜๊ฒŒ ๋๋งบ๋Š”๋‹ค. ์กด๋Œ“๋ง์„ ์‚ฌ์šฉํ•œ๋‹ค.
- ์™„๊ณกํ•œ ํ‘œํ˜„์„ ํ™œ์šฉํ•˜๋ฉฐ, ์ƒ๋Œ€๋ฐฉ์˜ ๊ฐ์ •์„ ํ•จ๋ถ€๋กœ ๋‹จ์ • ์ง“์ง€ ์•Š๋Š”๋‹ค. ์˜ˆ: "๊ทธ๋Ÿด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค", "์Œ, ๊ทธ๋ ‡๊ฒŒ ๋А๋‚„ ์ˆ˜ ์žˆ์ฃ " ๋“ฑ.
- ๋‹จ์ˆœํ•œ ์œ„๋กœ๋ณด๋‹ค๋Š” ๊ตฌ์กฐ์ ์ด๊ณ  ์‹ค์šฉ์ ์ธ ์ œ์•ˆ์„ ์šฐ์„ ํ•œ๋‹ค. ์งˆ๋ฌธ์ž๊ฐ€ ๋†“์นœ ๋งฅ๋ฝ์ด๋‚˜ ๊ตฌ์กฐ๋ฅผ ์งš์–ด์ฃผ๊ณ , ๋‹ค์Œ ๋‹จ๊ณ„ ๋˜๋Š” ์ „๋žต์  ์„ ํƒ์ง€๋ฅผ ์ œ์‹œํ•œ๋‹ค.
- ์งˆ๋ฌธ์ด ๋ง‰์—ฐํ•˜๊ฑฐ๋‚˜ ์ถ”์ƒ์ ์ด๋ฉด, ํ•ต์‹ฌ์„ ์ขํ˜€ ๋‹ค์‹œ ๋˜๋ฌผ์–ด๋ณธ๋‹ค. ์˜ˆ: "๊ทธ ์ƒํ™ฉ์—์„œ ๊ฐ€์žฅ ๋‹ต๋‹ตํ–ˆ๋˜ ์ˆœ๊ฐ„์€ ์–ธ์ œ์˜€๋‚˜์š”?"์™€ ๊ฐ™์ด ์งˆ๋ฌธ์„ ๊ตฌ์ฒดํ™”ํ•œ๋‹ค.
- ๊ธด ์„ค๋ช…๋ณด๋‹ค๋Š” ํ•ต์‹ฌ์„ ๋น ๋ฅด๊ฒŒ ์ „๋‹ฌํ•œ๋‹ค. ๋‹ค๋งŒ, ํ•„์š”ํ•œ ๊ฒฝ์šฐ ์งง์€ ๋น„์œ ๋‚˜ ์˜ˆ์‹œ๋กœ ์ง๊ด€์ ์ธ ์ดํ•ด๋ฅผ ๋•๋Š”๋‹ค.
- ๋‹ต๋ณ€ ์ค‘ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐธ์กฐํ•  ๋•Œ๋Š”, ๋ฐ˜๋“œ์‹œ 'ํƒ€์ž„์Šคํƒฌํ”„ ์ ์šฉ URL'์„ ์‚ฌ์šฉํ•˜์—ฌ ๋‹ค์Œ๊ณผ ๊ฐ™์€ Markdown ๋งํฌ ํ˜•์‹์œผ๋กœ ์ œ์‹œํ•ด์•ผ ํ•œ๋‹ค: `[์˜์ƒ ์ œ๋ชฉ](ํƒ€์ž„์Šคํƒฌํ”„_์ ์šฉ_URL)`. ์˜ˆ: "์ž์„ธํ•œ ๋‚ด์šฉ์€ [๋น„๊ฐœ๋ฐœ์ž๊ฐ€ ์—ฐ๋ด‰ 2์–ต์„ ๋ฐ›๋Š” ํ˜„์‹ค์ ์ธ ๋ฐฉ๋ฒ•](https://www.youtube.com/watch?v=VIDEO_ID&t=178s) ์˜์ƒ์„ ์ฐธ๊ณ ํ•˜์‹œ๋ฉด ๋„์›€์ด ๋  ๊ฒ๋‹ˆ๋‹ค."
- ์ด์ „ ๋Œ€ํ™” ๊ธฐ๋ก์€ ์—†์œผ๋ฏ€๋กœ, ๋ฐ˜๋ณต ์งˆ๋ฌธ์ด ๋“ค์–ด์˜ฌ ๊ฒฝ์šฐ์—๋Š” "์ด์ „์— ์œ ์‚ฌํ•œ ๋‚ด์šฉ์„ ์ฐพ์•„๋ดค์—ˆ์ฃ . ๋‹ค์‹œ ํ•œ๋ฒˆ ์‚ดํŽด๋ณด๋ฉด..."์ฒ˜๋Ÿผ ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ์ด์–ด๊ฐ„๋‹ค.
- ๋‹ต๋ณ€์€ ๋ฐ˜๋“œ์‹œ ํ•œ๊ตญ์–ด๋กœ ํ•œ๋‹ค.
Khan์€ ์ „๋žต์ ์œผ๋กœ ์‚ฌ๊ณ ํ•˜๋ฉฐ, ๋ณธ์งˆ๊ณผ ๋ฐฉํ–ฅ์„ ์ค‘์‹œํ•œ๋‹ค.
๋‹จ์ •์ ์œผ๋กœ ๋‹จ์–ธํ•˜๊ธฐ๋ณด๋‹ค๋Š” "~์ผ ์ˆ˜๋„ ์žˆ์Šต๋‹ˆ๋‹ค", "๊ทธ๋ ‡๊ฒŒ๋„ ๋ณผ ์ˆ˜ ์žˆ์ฃ "์™€ ๊ฐ™์ด ์—ฌ์ง€๋ฅผ ๋‚จ๊ธด๋‹ค.
์ƒ๋Œ€๋ฐฉ์ด ์Šค์Šค๋กœ ์„ ํƒ์ง€๋ฅผ ํŒ๋‹จํ•  ์ˆ˜ ์žˆ๋„๋ก ๋•๋Š” ๋ฐฉํ–ฅ์œผ๋กœ ์กฐ์–ธํ•œ๋‹ค.
์˜ˆ์‹œ์ฒ˜๋Ÿผ ๋งํˆฌ์™€ ์‚ฌ๊ณ  ํ๋ฆ„์„ ์œ ์ง€ํ•ด์•ผ ํ•œ๋‹ค:
---
Q: ์š”์ฆ˜ ํŒ€์›๊ณผ์˜ ๊ด€๊ณ„๊ฐ€ ์–ด๋ ค์šด๋ฐ, ์ œ๊ฐ€ ๋ญ˜ ๋†“์น˜๊ณ  ์žˆ๋Š” ๊ฑธ๊นŒ์š”?
A: ์Œ, ๊ทธ๋Ÿด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๊ด€๊ณ„๊ฐ€ ์–ด๋ ค์šธ ๋•Œ๋Š” ๊ฐ์ •๋ณด๋‹ค๋Š” ๊ธฐ๋Œ€๊ฐ€ ์—‡๊ฐˆ๋ ธ๋˜ ์ˆœ๊ฐ„์„ ๋จผ์ € ๋ด์•ผ ํ•˜์ฃ .
๊ทธ ํŒ€์›์ด ๋ฌด์–ธ๊ฐ€๋ฅผ ๊ธฐ๋Œ€ํ–ˆ๋Š”๋ฐ, ๋‚ด๊ฐ€ ๊ทธ๊ฑธ ๋†“์ณค์„ ๊ฐ€๋Šฅ์„ฑ์ด ์žˆ์Šต๋‹ˆ๋‹ค.
ํ˜น์‹œ ์ตœ๊ทผ์— ์„œ๋กœ ์˜คํ•ด๊ฐ€ ์ƒ๊ธด ์ˆœ๊ฐ„์ด ์žˆ์—ˆ๋Š”์ง€, ๋จผ์ € ์งš์–ด๋ณด๋Š” ๊ฒŒ ์ข‹๊ฒ ์Šต๋‹ˆ๋‹ค.
---
Q: ํšŒ์‚ฌ๋ฅผ ์˜ฎ๊ธฐ๊ณ  ์‹ถ์€๋ฐ, ์„ฑ๊ณผ ์—†์ด ํ‡ด์‚ฌํ•˜๋ฉด ์•ˆ ์ข‹์„๊นŒ์š”?
A: ๋‹จ๊ธฐ์ ์œผ๋กœ๋Š” ๋งž์Šต๋‹ˆ๋‹ค. ์„ฑ๊ณผ ์—†์ด ํ‡ด์‚ฌํ•˜๋ฉด ์ด๋ ฅ์„œ์— ๋‚จ์ฃ .
ํ•˜์ง€๋งŒ ์ง€๊ธˆ ์ƒํ™ฉ์—์„œ ๋ฐฐ์šธ ๊ฒŒ ์—†๋‹ค๋ฉด, ๊ทธ ์ž์ฒด๊ฐ€ ๋ฆฌ์Šคํฌ์ด๊ธฐ๋„ ํ•ฉ๋‹ˆ๋‹ค.
'๋‚ด๊ฐ€ ๋‚จ์•„์„œ ์–ป์„ ์ˆ˜ ์žˆ๋Š” ๊ฒŒ ๋ฌด์—‡์ธ๊ฐ€'์™€ '์ง€๊ธˆ ๋‚˜๊ฐ€์„œ ์‹œ์ž‘ํ•  ์ˆ˜ ์žˆ๋Š” ๊ฒŒ ๋ฌด์—‡์ธ๊ฐ€'๋ฅผ ๋‚˜๋ž€ํžˆ ๋‘๊ณ  ๋น„๊ตํ•ด ๋ณด์‹œ์ฃ .
---
์ด๋Ÿฐ ์‹์˜ ๋งํˆฌ์™€ ํ๋ฆ„์„ ๋ฐ”ํƒ•์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜์„ธ์š”."""
    # Use triple quotes for the multi-line f-string
    user_message = f"""์‚ฌ์šฉ์ž ์งˆ๋ฌธ: {query}
์•„๋ž˜ ๊ด€๋ จ ์ •๋ณด๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ Khan ๋ฉ˜ํ† ๋กœ์„œ ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”:
{context}"""
    try:
        logger.info("Calling OpenAI API...")
        completion = client.chat.completions.create(
            model="gpt-4o-mini",  # Use gpt-4 if available and preferred
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ],
            temperature=0.5,  # Slightly less creative, more focused on instructions
        )
        answer = completion.choices[0].message.content
        logger.info("Received response from OpenAI.")
        return answer.strip()
    except Exception as e:
        st.error(f"OpenAI ๋‹ต๋ณ€ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
        logger.error(f"Error during OpenAI API call: {e}", exc_info=True)
        return "๋‹ต๋ณ€์„ ์ƒ์„ฑํ•˜๋Š” ์ค‘์— ๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค. OpenAI API ํ‚ค ๋˜๋Š” ์„œ๋น„์Šค ์ƒํƒœ๋ฅผ ํ™•์ธํ•ด์ฃผ์„ธ์š”."
# --- Streamlit app UI (Khan mentor, single-loop structure) ---
st.set_page_config(page_title="Khan ๋ฉ˜ํ†  (PM ์˜์ƒ ๊ธฐ๋ฐ˜)", layout="wide")

# --- Sidebar menu: selects which of the two app modes runs below ---
menu = st.sidebar.radio(
    "๊ธฐ๋Šฅ ์„ ํƒ",
    ("Khan ๋ฉ˜ํ† ์—๊ฒŒ ์ƒ๋‹ดํ•˜๊ธฐ", "์ƒ์‚ฌ์—๊ฒŒ ์ž˜๋ณด์ด๊ธฐ")
)

# Survey link pinned at the bottom of the sidebar
st.sidebar.markdown('<hr style="margin:1em 0;">', unsafe_allow_html=True)
st.sidebar.markdown(
    '<a href="https://forms.gle/SUqrGBT3dktSB7v26" target="_blank" style="display:inline-block; background:#f9e79f; color:#1a237e; font-weight:bold; padding:0.5em 1.2em; border-radius:8px; text-decoration:none; font-size:1.1em; margin-bottom:16px;">๐Ÿ“ ์„œ๋น„์Šค ์–ด๋–ป๊ฒŒ ์ƒ๊ฐํ•˜์„ธ์š”?</a>',
    unsafe_allow_html=True
)

# OpenAI client is shared by both menu branches.
openai_client = init_openai_client()
if menu == "Khan ๋ฉ˜ํ† ์—๊ฒŒ ์ƒ๋‹ดํ•˜๊ธฐ":
    st.title("โœจ Khan ๋ฉ˜ํ† ๊ฐ€ 24์‹œ๊ฐ„ ๋‹ต๋ณ€์ค‘์ž…๋‹ˆ๋‹ค")
    # --- API key check and resource initialization (cached across reruns) ---
    pc = init_pinecone()
    model = load_embedding_model()
    index = get_pinecone_index(pc, INDEX_NAME)
    # --- Session-state management: seed every key this branch reads ---
    if 'user_question' not in st.session_state:
        st.session_state['user_question'] = ''
    if 'empathy_message' not in st.session_state:
        st.session_state['empathy_message'] = ''
    if 'khan_answer' not in st.session_state:
        st.session_state['khan_answer'] = ''
    if 'pinecone_results' not in st.session_state:
        st.session_state['pinecone_results'] = []
    if 'extra_questions' not in st.session_state:
        st.session_state['extra_questions'] = []
    if 'current_input' not in st.session_state:
        st.session_state['current_input'] = ''
    # --- Question input and answer generation ---
    st.markdown("#### ๋‹น์‹ ์˜ ๊ณ ๋ฏผ์„ ์•Œ๋ ค ์ฃผ์„ธ์š”!")
    user_q = st.text_input(
        "๋‚˜์˜ ๊ณ ๋ฏผ์€...",
        value=st.session_state['current_input'],
        key="main_input",
        placeholder="ํ”„๋กœ๋•ํŠธ ๋งค๋‹ˆ์ €๊ฐ€ ๊ฐ€์ ธ์•ผ ํ•  ์—ญ๋Ÿ‰์€ ์–ด๋–ค ๊ฒƒ์ด ์žˆ์„๊นŒ์š”?"
    )
    # Triggered by the button, or whenever the input differs from the last
    # answered question (so follow-up clicks also re-run the pipeline).
    if st.button("๊ณ ๋ฏผ ๋‚˜๋ˆ„๊ธฐ", key="main_ask") or (user_q and st.session_state['user_question'] != user_q):
        st.session_state['user_question'] = user_q
        st.session_state['current_input'] = user_q
        # 1. Empathetic assistant message shown above the mentor answer
        with st.spinner("์ƒ๊ฐ์ค‘..."):
            empathy_prompt = f"""
๋„ˆ๋Š” ๋”ฐ๋œปํ•˜๊ณ  ์นœ์ ˆํ•œ ๋น„์„œ์•ผ. ์•„๋ž˜ ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์„ ๋“ฃ๊ณ , ๊ฐ์ •์ ์œผ๋กœ ์ถฉ๋ถ„ํ•˜๊ฒŒ 1~2๋ฌธ์žฅ์œผ๋กœ ๊ณต๊ฐํ•ด์ฃผ๋˜ ์งˆ๋ฌธ์— ๋Œ€ํ•œ ๋‹ต๋ณ€์€ ํ•˜์ง€๋งˆ, ๋งˆ์ง€๋ง‰์— '์นธ ๋ฉ˜ํ† ์˜ ์ƒ๊ฐ์„ ๋“ค์–ด๋ณผ๊นŒ์š”?'๋ผ๊ณ  ์•ˆ๋‚ดํ•ด์ค˜. \n์งˆ๋ฌธ: "{user_q}"
"""
            try:
                empathy_response = openai_client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "system", "content": empathy_prompt}],
                    temperature=0.7,
                )
                st.session_state['empathy_message'] = empathy_response.choices[0].message.content.strip()
            except Exception as e:
                st.session_state['empathy_message'] = f"๊ณต๊ฐ ๋ฉ”์‹œ์ง€ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜: {e}"
        # 2. Pinecone search and Khan mentor answer
        with st.spinner("Khan ๋ฉ˜ํ† ๊ฐ€ ๋œธ์„ ๋“ค์ด๋ฉฐ..."):
            pinecone_results = search(user_q, top_k=5, _index=index, _model=model)
            st.session_state['pinecone_results'] = pinecone_results
            khan_answer = generate_khan_answer(user_q, pinecone_results, openai_client)
            st.session_state['khan_answer'] = khan_answer
        # 3. Follow-up question generation
        # NOTE(review): this block duplicates the background generation further
        # below (elif branch) — consider extracting a shared helper.
        with st.spinner("์ถ”๊ฐ€ ์งˆ๋ฌธ์„ ์ƒ์„ฑํ•˜๋Š” ์ค‘..."):
            extra_prompt = (
                f"์•„๋ž˜ ์งˆ๋ฌธ์—์„œ ์œ ์‚ฌํ•˜๊ฒŒ ๊ถ๊ธˆํ•  ์ˆ˜ ์žˆ๋Š” ์ถ”๊ฐ€ ์งˆ๋ฌธ 3~4๊ฐœ๋ฅผ ํ•œ๊ตญ์–ด๋กœ ๋งŒ๋“ค์–ด์ค˜. ์ง€๋‚˜์น˜๊ฒŒ ์„ธ๋ถ€์ ์ธ ํˆด์— ๋Œ€ํ•œ ์–˜๊ธฐ๋ณด๋‹ค๋Š” ํ”„๋กœ๋•ํŠธ, ํ”„๋กœ์ ํŠธ, ๋ฆฌ๋”์‹ญ์— ๋Œ€ํ•œ ์ผ๋ฐ˜์ ์ธ ์งˆ๋ฌธ์œผ๋กœ ๋งŒ๋“ค์–ด. ์งˆ๋ฌธ: \"{st.session_state['user_question']}"
            )
            try:
                extra_response = openai_client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "system", "content": extra_prompt}],
                    temperature=0.5
                )
                import re
                raw = extra_response.choices[0].message.content.strip()
                # Prefer numbered-list items; fall back to splitting on lines.
                questions = re.findall(r'\d+\.\s*(.+)', raw)
                if not questions:
                    questions = [q.strip('-โ€ข ').strip() for q in raw.split('\n') if q.strip()]
                st.session_state['extra_questions'] = questions[:4]
                st.rerun()
            except Exception as e:
                st.session_state['extra_questions'] = [f"์ถ”๊ฐ€ ์งˆ๋ฌธ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜: {e}"]
                st.rerun()
    # --- Answer and follow-up question UI ---
    if st.session_state['user_question']:
        st.info(st.session_state['empathy_message'])
        st.subheader("๐Ÿ’ก Khan ๋ฉ˜ํ† ์˜ ๋‹ต๋ณ€")
        st.markdown(st.session_state['khan_answer'])
        # Show the reference videos the answer was based on
        pinecone_results = st.session_state['pinecone_results']
        if pinecone_results:
            with st.expander("๋‹ต๋ณ€์— ์ฐธ๊ณ ํ•œ ์˜์ƒ ์ •๋ณด ๋ณด๊ธฐ", expanded=True):
                displayed_urls = set()  # de-duplicate results pointing at the same video
                for i, r in enumerate(pinecone_results):
                    url = r.get('URL', 'N/A')
                    if url in displayed_urls or url == 'N/A':
                        continue
                    displayed_urls.add(url)
                    st.markdown(f"--- **์ฐธ๊ณ  ์ž๋ฃŒ {len(displayed_urls)} (์œ ์‚ฌ๋„: {r['์ ์ˆ˜']:.4f})** ---")
                    st.markdown(f"**์ œ๋ชฉ:** {r.get('์ œ๋ชฉ', 'N/A')}")
                    st.markdown(f"**์š”์•ฝ:** {r.get('์š”์•ฝ', 'N/A')}")
                    timestamp = r.get('ํƒ€์ž„์Šคํƒฌํ”„', 'N/A')
                    is_youtube = url and isinstance(url, str) and ('youtube.com' in url or 'youtu.be' in url)
                    start_seconds = None
                    if is_youtube and timestamp and timestamp != 'N/A':
                        start_seconds = parse_timestamp_to_seconds(timestamp)
                    # Link rendering: timestamped YouTube link, plain http link, or raw text.
                    if is_youtube and start_seconds is not None:
                        try:
                            timestamped_link_url = add_timestamp_to_youtube_url(url, timestamp)
                            st.markdown(f"**์˜์ƒ ๋งํฌ (ํƒ€์ž„์Šคํƒฌํ”„ ํฌํ•จ):** [{timestamped_link_url}]({timestamped_link_url})")
                        except Exception as e:
                            logger.error(f"Error creating timestamped URL for link: {e}")
                            st.markdown(f"**์˜์ƒ ๋งํฌ (์›๋ณธ):** [{url}]({url})")
                    elif url != "N/A" and isinstance(url, str) and url.startswith("http"):
                        st.markdown(f"**URL:** [{url}]({url})")
                    else:
                        st.markdown(f"**URL:** {url}")
                    # Inline player (half width via columns; col2 intentionally unused).
                    if is_youtube and url != "N/A":
                        col1, col2 = st.columns(2)
                        with col1:
                            try:
                                st.video(url, start_time=start_seconds or 0)
                            except Exception as e:
                                st.error(f"๋น„๋””์˜ค({url}) ์žฌ์ƒ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
                                st.markdown(f"[YouTube์—์„œ ๋ณด๊ธฐ]({url})")
                    elif url != "N/A":
                        col1, col2 = st.columns(2)
                        with col1:
                            try:
                                st.video(url)
                            except Exception as e:
                                logger.warning(f"st.video failed for non-YouTube URL {url}: {e}")
                    st.markdown("---")
        # --- Control when follow-up questions are generated ---
        if 'extra_questions_ready' not in st.session_state or not st.session_state['extra_questions_ready']:
            # First pass after rendering the answer: flag readiness, then rerun
            # so generation happens only after the answer is on screen.
            st.session_state['extra_questions_ready'] = True
            st.rerun()
        elif not st.session_state['extra_questions']:
            # Generate follow-up questions in the background (no spinner).
            extra_prompt = (
                f"์•„๋ž˜ ์งˆ๋ฌธ์—์„œ ์œ ์‚ฌํ•˜๊ฒŒ ๊ถ๊ธˆํ•  ์ˆ˜ ์žˆ๋Š” ์ถ”๊ฐ€ ์งˆ๋ฌธ 3~4๊ฐœ๋ฅผ ํ•œ๊ตญ์–ด๋กœ ๋งŒ๋“ค์–ด์ค˜. ์ง€๋‚˜์น˜๊ฒŒ ์„ธ๋ถ€์ ์ธ ํˆด์— ๋Œ€ํ•œ ์–˜๊ธฐ๋ณด๋‹ค๋Š” ํ”„๋กœ๋•ํŠธ, ํ”„๋กœ์ ํŠธ, ๋ฆฌ๋”์‹ญ์— ๋Œ€ํ•œ ์ผ๋ฐ˜์ ์ธ ์งˆ๋ฌธ์œผ๋กœ ๋งŒ๋“ค์–ด. ์งˆ๋ฌธ: \"{st.session_state['user_question']}"
            )
            try:
                extra_response = openai_client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "system", "content": extra_prompt}],
                    temperature=0.5
                )
                import re
                raw = extra_response.choices[0].message.content.strip()
                questions = re.findall(r'\d+\.\s*(.+)', raw)
                if not questions:
                    questions = [q.strip('-โ€ข ').strip() for q in raw.split('\n') if q.strip()]
                st.session_state['extra_questions'] = questions[:4]
                st.rerun()
            except Exception as e:
                st.session_state['extra_questions'] = [f"์ถ”๊ฐ€ ์งˆ๋ฌธ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜: {e}"]
        else:
            # Render the generated follow-up questions as clickable buttons.
            st.markdown("#### ์ถ”๊ฐ€๋กœ ๊ถ๊ธˆํ•œ ์ ์ด ์žˆ์œผ์‹ ๊ฐ€์š”? ์•„๋ž˜ ์˜ˆ์‹œ ์งˆ๋ฌธ์„ ํด๋ฆญํ•˜๊ฑฐ๋‚˜ ์ง์ ‘ ์ž…๋ ฅํ•ด๋ณด์„ธ์š”!")
            cols = st.columns(len(st.session_state['extra_questions']))
            for i, q in enumerate(st.session_state['extra_questions']):
                if cols[i].button(q, key=f"extra_{i}"):
                    # Clicking feeds the question back through the main input loop.
                    st.session_state['current_input'] = q
                    st.session_state['user_question'] = ''
                    st.rerun()
            user_extra = st.text_input("์ง์ ‘ ์ถ”๊ฐ€ ์งˆ๋ฌธ ์ž…๋ ฅ", value="", key="extra_input")
            if st.button("์ถ”๊ฐ€ ์งˆ๋ฌธํ•˜๊ธฐ", key="extra_btn"):
                st.session_state['current_input'] = user_extra
                st.session_state['user_question'] = ''
                st.rerun()
    st.markdown("---")
    st.caption("Powered by Pinecone, Sentence Transformers, and OpenAI")
    if st.button("๋‹ค๋ฅธ ๊ณ ๋ฏผ ์ƒ๋‹ดํ•˜๊ธฐ"):
        # NOTE(review): list-typed keys are reset to '' rather than [] here;
        # downstream checks only test truthiness, so this appears harmless.
        for k in ['user_question','empathy_message','khan_answer','pinecone_results','extra_questions','current_input','extra_questions_ready']:
            st.session_state[k] = ''
        st.rerun()
else:
    # --- Second mode: rewrite a status report to suit the boss's MBTI type ---
    st.title("๐Ÿ‘” ์ƒ์‚ฌ์—๊ฒŒ ์ž˜๋ณด์ด๊ธฐ: ๋งž์ถค ๋ณด๊ณ ๋ฌธ ๋งŒ๋“ค๊ธฐ")
    st.markdown("์ƒ์‚ฌ์˜ MBTI ์„ฑํ–ฅ์— ๋งž๊ฒŒ ๋ณด๊ณ ๋ฌธ์„ ์ž๋™์œผ๋กœ ๋‹ค๋“ฌ์–ด๋“œ๋ฆฝ๋‹ˆ๋‹ค.")
    # All 16 MBTI types for the selectbox.
    mbti_types = [
        "ISTJ", "ISFJ", "INFJ", "INTJ",
        "ISTP", "ISFP", "INFP", "INTP",
        "ESTP", "ESFP", "ENFP", "ENTP",
        "ESTJ", "ESFJ", "ENFJ", "ENTJ"
    ]
    mbti = st.selectbox("์ƒ์‚ฌ์˜ MBTI๋ฅผ ์„ ํƒํ•˜์„ธ์š”", mbti_types)
    user_report = st.text_area("์ƒ์‚ฌ์—๊ฒŒ ๋ณด๊ณ ํ•  ๋‚ด์šฉ์„ ์ž…๋ ฅํ•˜์„ธ์š” (300์ž ์ด๋‚ด)", max_chars=300)
    if st.button("MBTI ๋งž์ถค ๋ณด๊ณ ๋ฌธ ์ƒ์„ฑ"):
        if not user_report.strip():
            st.warning("๋ณด๊ณ ๋ฌธ์„ ์ž…๋ ฅํ•ด ์ฃผ์„ธ์š”.")
        else:
            with st.spinner("์ƒ์‚ฌ์˜ ์„ฑํ–ฅ์— ๋งž๊ฒŒ ๋ณด๊ณ ๋ฌธ์„ ๋‹ค๋“ฌ๋Š” ์ค‘..."):
                # Prompt asks the model to answer in a fixed two-section format
                # ("์ˆ˜์ •๋œ ๋ณด๊ณ ๋ฌธ: ..." then "์ด์œ : ...") that is parsed below.
                prompt = f"""
์ƒ์‚ฌ์˜ MBTI๊ฐ€ {mbti}์ผ ๋•Œ, ์•„๋ž˜ ๋ณด๊ณ ๋ฌธ์„ ๊ทธ ์„ฑํ–ฅ์— ๋งž๊ฒŒ ์ˆ˜์ •ํ•ด์ค˜.\n ์ด ์œ ํ˜•์˜ ์ƒ์‚ฌ๊ฐ€ ์ค‘์š”ํ•˜๊ฒŒ ์ƒ๊ฐํ•˜๋Š” ๊ฒƒ์ด ๋ณด๊ณ ์„œ์— ๋น ์ ธ์žˆ๋‹ค๋ฉด ์–ด๋–ค ๋ถ€๋ถ„์„ ๋ณด์™„ํ•ด์•ผ ํ•˜๋Š”์ง€ ์ƒ์„ธํžˆ ์„ค๋ช…ํ•ด ์ค˜\n๊ทธ๋ฆฌ๊ณ  ์™œ ๊ทธ๋ ‡๊ฒŒ ์ˆ˜์ •ํ–ˆ๋Š”์ง€ ์ด์œ ๋„ ์„ค๋ช…ํ•ด์ค˜.\n๋ณด๊ณ ๋ฌธ: "{user_report}"
\n์•„๋ž˜ ํ˜•์‹์œผ๋กœ ๋‹ต๋ณ€ ํ•ด.\n์ˆ˜์ •๋œ ๋ณด๊ณ ๋ฌธ: ...\n์ด์œ : ...
"""
                try:
                    response = openai_client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[{"role": "system", "content": prompt}],
                        temperature=0.5,
                    )
                    answer = response.choices[0].message.content.strip()
                    # Simple parsing: split the "์ˆ˜์ •๋œ ๋ณด๊ณ ๋ฌธ:" and "์ด์œ :" sections apart.
                    import re
                    mod_match = re.search(r"์ˆ˜์ •๋œ ๋ณด๊ณ ๋ฌธ[:\n]*([\s\S]+?)์ด์œ [:\n]", answer)
                    reason_match = re.search(r"์ด์œ [:\n]*([\s\S]+)", answer)
                    if mod_match:
                        st.markdown(f"**์ˆ˜์ •๋œ ๋ณด๊ณ ๋ฌธ**\n\n{mod_match.group(1).strip()}")
                        logger.info(f"[MBTI ๋ณด๊ณ ๋ฌธ] ์ˆ˜์ •๋œ ๋ณด๊ณ ๋ฌธ: {mod_match.group(1).strip()}")
                    else:
                        # Fallback: show the whole answer when the format was not followed.
                        st.markdown(f"**์ˆ˜์ •๋œ ๋ณด๊ณ ๋ฌธ**\n\n{answer}")
                        logger.info(f"[MBTI ๋ณด๊ณ ๋ฌธ] ์ˆ˜์ •๋œ ๋ณด๊ณ ๋ฌธ: {answer}")
                    if reason_match:
                        st.markdown(f"**์ด์œ  ์„ค๋ช…**\n\n{reason_match.group(1).strip()}")
                        logger.info(f"[MBTI ๋ณด๊ณ ๋ฌธ] ์ด์œ  ์„ค๋ช…: {reason_match.group(1).strip()}")
                except Exception as e:
                    st.error(f"GPT ํ˜ธ์ถœ ์ค‘ ์˜ค๋ฅ˜: {e}")