openfree committed on
Commit
42dfc01
·
verified ·
1 Parent(s): adf8558

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -825
app.py CHANGED
@@ -814,831 +814,7 @@ class UnifiedAudioConverter:
814
 
815
  conversation_text = "\n".join(
816
  f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
817
- for i, turn in enumerate(import spaces
818
- import gradio as gr
819
- import os
820
- import asyncio
821
- import torch
822
- import io
823
- import json
824
- import re
825
- import httpx
826
- import tempfile
827
- import wave
828
- import base64
829
- import numpy as np
830
- import soundfile as sf
831
- import subprocess
832
- import shutil
833
- import requests
834
- import logging
835
- from datetime import datetime, timedelta
836
- from typing import List, Tuple, Dict, Optional
837
- from pathlib import Path
838
- from threading import Thread
839
- from dotenv import load_dotenv
840
-
841
- # PDF processing imports
842
- from langchain_community.document_loaders import PyPDFLoader
843
-
844
- # Edge TTS imports
845
- import edge_tts
846
- from pydub import AudioSegment
847
-
848
- # OpenAI imports
849
- from openai import OpenAI
850
-
851
- # Transformers imports (for legacy local mode)
852
- from transformers import (
853
- AutoModelForCausalLM,
854
- AutoTokenizer,
855
- TextIteratorStreamer,
856
- BitsAndBytesConfig,
857
- )
858
-
859
- # Llama CPP imports (for new local mode)
860
- try:
861
- from llama_cpp import Llama
862
- from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
863
- from llama_cpp_agent.providers import LlamaCppPythonProvider
864
- from llama_cpp_agent.chat_history import BasicChatHistory
865
- from llama_cpp_agent.chat_history.messages import Roles
866
- from huggingface_hub import hf_hub_download
867
- LLAMA_CPP_AVAILABLE = True
868
- except ImportError:
869
- LLAMA_CPP_AVAILABLE = False
870
-
871
- # Spark TTS imports
872
- try:
873
- from huggingface_hub import snapshot_download
874
- SPARK_AVAILABLE = True
875
- except:
876
- SPARK_AVAILABLE = False
877
-
878
- # MeloTTS imports (for local mode)
879
- try:
880
- # unidic 다운로드를 조건부로 처리
881
- if not os.path.exists("/usr/local/lib/python3.10/site-packages/unidic"):
882
- try:
883
- os.system("python -m unidic download")
884
- except:
885
- pass
886
- from melo.api import TTS as MeloTTS
887
- MELO_AVAILABLE = True
888
- except:
889
- MELO_AVAILABLE = False
890
-
891
- # Import config and prompts
892
- from config_prompts import (
893
- ConversationConfig,
894
- PromptBuilder,
895
- DefaultConversations,
896
- EDGE_TTS_ONLY_LANGUAGES,
897
- EDGE_TTS_VOICES
898
- )
899
-
900
- load_dotenv()
901
-
902
- # Brave Search API 설정
903
- BRAVE_KEY = os.getenv("BSEARCH_API")
904
- BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
905
-
906
-
907
- def brave_search(query: str, count: int = 8, freshness_days: int | None = None):
908
- """Brave Search API를 사용하여 최신 정보 검색"""
909
- if not BRAVE_KEY:
910
- return []
911
- params = {"q": query, "count": str(count)}
912
- if freshness_days:
913
- dt_from = (datetime.utcnow() - timedelta(days=freshness_days)).strftime("%Y-%m-%d")
914
- params["freshness"] = dt_from
915
- try:
916
- r = requests.get(
917
- BRAVE_ENDPOINT,
918
- headers={"Accept": "application/json", "X-Subscription-Token": BRAVE_KEY},
919
- params=params,
920
- timeout=15
921
- )
922
- raw = r.json().get("web", {}).get("results") or []
923
- return [{
924
- "title": r.get("title", ""),
925
- "url": r.get("url", r.get("link", "")),
926
- "snippet": r.get("description", r.get("text", "")),
927
- "host": re.sub(r"https?://(www\.)?", "", r.get("url", "")).split("/")[0]
928
- } for r in raw[:count]]
929
- except Exception as e:
930
- logging.error(f"Brave search error: {e}")
931
- return []
932
-
933
-
934
- def format_search_results(query: str, for_keyword: bool = False) -> str:
935
- """검색 결과를 포맷팅하여 반환"""
936
- # 키워드 검색의 경우 더 많은 결과 사용
937
- count = 5 if for_keyword else 3
938
- rows = brave_search(query, count, freshness_days=7 if not for_keyword else None)
939
- if not rows:
940
- return ""
941
-
942
- results = []
943
- # 키워드 검색의 경우 더 상세한 정보 포함
944
- max_results = 4 if for_keyword else 2
945
- for r in rows[:max_results]:
946
- if for_keyword:
947
- # 키워드 검색은 더 긴 스니펫 사용
948
- snippet = r['snippet'][:200] + "..." if len(r['snippet']) > 200 else r['snippet']
949
- results.append(f"**{r['title']}**\n{snippet}\nSource: {r['host']}")
950
- else:
951
- # 일반 검색은 짧은 스니펫
952
- snippet = r['snippet'][:100] + "..." if len(r['snippet']) > 100 else r['snippet']
953
- results.append(f"- {r['title']}: {snippet}")
954
-
955
- return "\n\n".join(results) + "\n"
956
-
957
-
958
- def extract_keywords_for_search(text: str, language: str = "English") -> List[str]:
959
- """텍스트에서 검색할 키워드 추출 (개선)"""
960
- # 텍스트 앞부분만 사용 (너무 많은 텍스트 처리 방지)
961
- text_sample = text[:500]
962
-
963
- if language == "Korean":
964
- import re
965
- # 한국어 명사 추출 (2글자 이상)
966
- keywords = re.findall(r'[가-힣]{2,}', text_sample)
967
- # 중복 제거하고 가장 긴 단어 1개만 선택
968
- unique_keywords = list(dict.fromkeys(keywords))
969
- # 길이 순으로 정렬하고 가장 의미있을 것 같은 단어 선택
970
- unique_keywords.sort(key=len, reverse=True)
971
- return unique_keywords[:1] # 1개만 반환
972
- else:
973
- # 영어는 대문자로 시작하는 단어 중 가장 긴 것 1개
974
- words = text_sample.split()
975
- keywords = [word.strip('.,!?;:') for word in words
976
- if len(word) > 4 and word[0].isupper()]
977
- if keywords:
978
- return [max(keywords, key=len)] # 가장 긴 단어 1개
979
- return []
980
-
981
-
982
- def search_and_compile_content(keyword: str, language: str = "English") -> str:
983
- """키워드로 검색하여 충분한 콘텐츠 컴파일"""
984
- if not BRAVE_KEY:
985
- # API 없을 때도 기본 콘텐츠 생성
986
- if language == "Korean":
987
- return f"""
988
- '{keyword}'에 대한 종합적인 정보:
989
-
990
- {keyword}는 현대 사회에서 매우 중요한 주제입니다.
991
- 이 주제는 다양한 측면에서 우리의 삶에 영향을 미치고 있으며,
992
- 최근 들어 더욱 주목받고 있습니다.
993
-
994
- 주요 특징:
995
- 1. 기술적 발전과 혁신
996
- 2. 사회적 영향과 변화
997
- 3. 미래 전망과 가능성
998
- 4. 실용적 활용 방안
999
- 5. 글로벌 트렌드와 동향
1000
-
1001
- 전문가들은 {keyword}가 앞으로 더욱 중요해질 것으로 예상하고 있으며,
1002
- 이에 대한 깊이 있는 이해가 필요한 시점입니다.
1003
- """
1004
- else:
1005
- return f"""
1006
- Comprehensive information about '{keyword}':
1007
-
1008
- {keyword} is a significant topic in modern society.
1009
- This subject impacts our lives in various ways and has been
1010
- gaining increasing attention recently.
1011
-
1012
- Key aspects:
1013
- 1. Technological advancement and innovation
1014
- 2. Social impact and changes
1015
- 3. Future prospects and possibilities
1016
- 4. Practical applications
1017
- 5. Global trends and developments
1018
-
1019
- Experts predict that {keyword} will become even more important,
1020
- and it's crucial to develop a deep understanding of this topic.
1021
- """
1022
-
1023
- # 언어에 따른 다양한 검색 쿼리
1024
- if language == "Korean":
1025
- queries = [
1026
- f"{keyword} 최신 뉴스 2024",
1027
- f"{keyword} 정보 설명",
1028
- f"{keyword} 트렌드 전망",
1029
- f"{keyword} 장점 단점",
1030
- f"{keyword} 활용 방법",
1031
- f"{keyword} 전문가 의견"
1032
- ]
1033
- else:
1034
- queries = [
1035
- f"{keyword} latest news 2024",
1036
- f"{keyword} explained comprehensive",
1037
- f"{keyword} trends forecast",
1038
- f"{keyword} advantages disadvantages",
1039
- f"{keyword} how to use",
1040
- f"{keyword} expert opinions"
1041
- ]
1042
-
1043
- all_content = []
1044
- total_content_length = 0
1045
-
1046
- for query in queries:
1047
- results = brave_search(query, count=5) # 더 많은 결과 가져오기
1048
- for r in results[:3]: # 각 쿼리당 상위 3개
1049
- content = f"**{r['title']}**\n{r['snippet']}\nSource: {r['host']}\n"
1050
- all_content.append(content)
1051
- total_content_length += len(r['snippet'])
1052
-
1053
- # 콘텐츠가 부족하면 추가 생성
1054
- if total_content_length < 1000: # 최소 1000자 확보
1055
- if language == "Korean":
1056
- additional_content = f"""
1057
- 추가 정보:
1058
- {keyword}와 관련된 최근 동향을 살펴보면, 이 분야는 빠르게 발전하고 있습니다.
1059
- 많은 전문가들이 이 주제에 대해 활발히 연구하고 있으며,
1060
- 실생활에서의 응용 가능성도 계속 확대되고 있습니다.
1061
-
1062
- 특히 주목할 점은:
1063
- - 기술 혁신의 가속화
1064
- - 사용자 경험의 개선
1065
- - 접근성의 향상
1066
- - 비용 효율성 증대
1067
- - 글로벌 시장의 성장
1068
-
1069
- 이러한 요소들이 {keyword}의 미래를 더욱 밝게 만들고 있습니다.
1070
- """
1071
- else:
1072
- additional_content = f"""
1073
- Additional insights:
1074
- Recent developments in {keyword} show rapid advancement in this field.
1075
- Many experts are actively researching this topic, and its practical
1076
- applications continue to expand.
1077
-
1078
- Key points to note:
1079
- - Accelerating technological innovation
1080
- - Improving user experience
1081
- - Enhanced accessibility
1082
- - Increased cost efficiency
1083
- - Growing global market
1084
-
1085
- These factors are making the future of {keyword} increasingly promising.
1086
- """
1087
- all_content.append(additional_content)
1088
-
1089
- # 컴파일된 콘텐츠 반환
1090
- compiled = "\n\n".join(all_content)
1091
-
1092
- # 키워드 기반 소개
1093
- if language == "Korean":
1094
- intro = f"### '{keyword}'에 대한 종합적인 정보와 최신 동향:\n\n"
1095
- else:
1096
- intro = f"### Comprehensive information and latest trends about '{keyword}':\n\n"
1097
-
1098
- return intro + compiled
1099
-
1100
-
1101
- class UnifiedAudioConverter:
1102
- def __init__(self, config: ConversationConfig):
1103
- self.config = config
1104
- self.llm_client = None
1105
- self.legacy_local_model = None
1106
- self.legacy_tokenizer = None
1107
- # 새로운 로컬 LLM 관련
1108
- self.local_llm = None
1109
- self.local_llm_model = None
1110
- self.melo_models = None
1111
- self.spark_model_dir = None
1112
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
1113
- # 프롬프트 빌더 추가
1114
- self.prompt_builder = PromptBuilder()
1115
-
1116
- def initialize_api_mode(self, api_key: str):
1117
- """Initialize API mode with Together API"""
1118
- self.llm_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")
1119
-
1120
- @spaces.GPU(duration=120)
1121
- def initialize_local_mode(self):
1122
- """Initialize new local mode with Llama CPP"""
1123
- if not LLAMA_CPP_AVAILABLE:
1124
- raise RuntimeError("Llama CPP dependencies not available. Please install llama-cpp-python and llama-cpp-agent.")
1125
-
1126
- if self.local_llm is None or self.local_llm_model != self.config.local_model_name:
1127
- try:
1128
- # 모델 다운로드
1129
- model_path = hf_hub_download(
1130
- repo_id=self.config.local_model_repo,
1131
- filename=self.config.local_model_name,
1132
- local_dir="./models"
1133
- )
1134
-
1135
- model_path_local = os.path.join("./models", self.config.local_model_name)
1136
-
1137
- if not os.path.exists(model_path_local):
1138
- raise RuntimeError(f"Model file not found at {model_path_local}")
1139
-
1140
- # Llama 모델 초기화
1141
- self.local_llm = Llama(
1142
- model_path=model_path_local,
1143
- flash_attn=True,
1144
- n_gpu_layers=81 if torch.cuda.is_available() else 0,
1145
- n_batch=1024,
1146
- n_ctx=16384,
1147
- )
1148
- self.local_llm_model = self.config.local_model_name
1149
- print(f"Local LLM initialized: {model_path_local}")
1150
-
1151
- except Exception as e:
1152
- print(f"Failed to initialize local LLM: {e}")
1153
- raise RuntimeError(f"Failed to initialize local LLM: {e}")
1154
-
1155
- @spaces.GPU(duration=60)
1156
- def initialize_legacy_local_mode(self):
1157
- """Initialize legacy local mode with Hugging Face model (fallback)"""
1158
- if self.legacy_local_model is None:
1159
- quantization_config = BitsAndBytesConfig(
1160
- load_in_4bit=True,
1161
- bnb_4bit_compute_dtype=torch.float16
1162
- )
1163
- self.legacy_local_model = AutoModelForCausalLM.from_pretrained(
1164
- self.config.legacy_local_model_name,
1165
- quantization_config=quantization_config
1166
- )
1167
- self.legacy_tokenizer = AutoTokenizer.from_pretrained(
1168
- self.config.legacy_local_model_name,
1169
- revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
1170
- )
1171
-
1172
- def initialize_spark_tts(self):
1173
- """Initialize Spark TTS model by downloading if needed"""
1174
- if not SPARK_AVAILABLE:
1175
- raise RuntimeError("Spark TTS dependencies not available")
1176
-
1177
- model_dir = "pretrained_models/Spark-TTS-0.5B"
1178
-
1179
- # Check if model exists, if not download it
1180
- if not os.path.exists(model_dir):
1181
- print("Downloading Spark-TTS model...")
1182
- try:
1183
- os.makedirs("pretrained_models", exist_ok=True)
1184
- snapshot_download(
1185
- "SparkAudio/Spark-TTS-0.5B",
1186
- local_dir=model_dir
1187
- )
1188
- print("Spark-TTS model downloaded successfully")
1189
- except Exception as e:
1190
- raise RuntimeError(f"Failed to download Spark-TTS model: {e}")
1191
-
1192
- self.spark_model_dir = model_dir
1193
-
1194
- # Check if we have the CLI inference script
1195
- if not os.path.exists("cli/inference.py"):
1196
- print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")
1197
-
1198
- @spaces.GPU(duration=60)
1199
- def initialize_melo_tts(self):
1200
- """Initialize MeloTTS models"""
1201
- if MELO_AVAILABLE and self.melo_models is None:
1202
- self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
1203
-
1204
- def fetch_text(self, url: str) -> str:
1205
- """Fetch text content from URL"""
1206
- if not url:
1207
- raise ValueError("URL cannot be empty")
1208
-
1209
- if not url.startswith("http://") and not url.startswith("https://"):
1210
- raise ValueError("URL must start with 'http://' or 'https://'")
1211
-
1212
- full_url = f"{self.config.prefix_url}{url}"
1213
- try:
1214
- response = httpx.get(full_url, timeout=60.0)
1215
- response.raise_for_status()
1216
- return response.text
1217
- except httpx.HTTPError as e:
1218
- raise RuntimeError(f"Failed to fetch URL: {e}")
1219
-
1220
- def extract_text_from_pdf(self, pdf_file) -> str:
1221
- """Extract text content from PDF file"""
1222
- try:
1223
- # Gradio returns file path, not file object
1224
- if isinstance(pdf_file, str):
1225
- pdf_path = pdf_file
1226
- else:
1227
- # If it's a file object (shouldn't happen with Gradio)
1228
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
1229
- tmp_file.write(pdf_file.read())
1230
- pdf_path = tmp_file.name
1231
-
1232
- # PDF 로드 및 텍스트 추출
1233
- loader = PyPDFLoader(pdf_path)
1234
- pages = loader.load()
1235
-
1236
- # 모든 페이지의 텍스트를 결합
1237
- text = "\n".join([page.page_content for page in pages])
1238
-
1239
- # 임시 파일인 경우 삭제
1240
- if not isinstance(pdf_file, str) and os.path.exists(pdf_path):
1241
- os.unlink(pdf_path)
1242
-
1243
- return text
1244
- except Exception as e:
1245
- raise RuntimeError(f"Failed to extract text from PDF: {e}")
1246
-
1247
- def _get_messages_formatter_type(self, model_name):
1248
- """Get appropriate message formatter for the model"""
1249
- if "Mistral" in model_name or "BitSix" in model_name:
1250
- return MessagesFormatterType.CHATML
1251
- else:
1252
- return MessagesFormatterType.LLAMA_3
1253
-
1254
- @spaces.GPU(duration=120)
1255
- def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
1256
- """Extract conversation using new local LLM with enhanced professional style"""
1257
- try:
1258
- # 검색 컨텍스트 생성 (키워드 기반이 아닌 경우)
1259
- search_context = ""
1260
- if BRAVE_KEY and not text.startswith("Keyword-based content:"):
1261
- try:
1262
- keywords = extract_keywords_for_search(text, language)
1263
- if keywords:
1264
- search_query = keywords[0] if language == "Korean" else f"{keywords[0]} latest news"
1265
- search_context = format_search_results(search_query)
1266
- print(f"Search context added for: {search_query}")
1267
- except Exception as e:
1268
- print(f"Search failed, continuing without context: {e}")
1269
-
1270
- # 먼저 새로운 로컬 LLM 시도
1271
- self.initialize_local_mode()
1272
-
1273
- chat_template = self._get_messages_formatter_type(self.config.local_model_name)
1274
- provider = LlamaCppPythonProvider(self.local_llm)
1275
-
1276
- # 언어별 시스템 메시지
1277
- system_messages = {
1278
- "Korean": (
1279
- "당신은 한국의 유명 팟캐스트 전문 작가입니다. "
1280
- "청취자들이 깊이 있는 전문 지식을 얻을 수 있는 고품질 대담을 한국어로 만듭니다. "
1281
- "반드시 서로 존댓말을 사용하며, 12회의 대화 교환으로 구성하세요. "
1282
- "모든 대화는 반드시 한국어로 작성하고 JSON 형식으로만 응답하세요."
1283
- ),
1284
- "Japanese": (
1285
- "あなたは日本の有名なポッドキャスト専門作家です。"
1286
- "聴衆が深い専門知識を得られる高品質な対談を日本語で作成します。"
1287
- "必ずお互いに丁寧語を使用し、12回の対話交換で構成してください。"
1288
- "すべての対話は必ず日本語で作成し、JSON形式でのみ回答してください。"
1289
- ),
1290
- "French": (
1291
- "Vous êtes un célèbre scénariste de podcast professionnel français. "
1292
- "Créez des discussions de haute qualité en français qui donnent au public "
1293
- "des connaissances professionnelles approfondies. "
1294
- "Créez exactement 12 échanges de conversation et répondez uniquement en format JSON."
1295
- ),
1296
- "German": (
1297
- "Sie sind ein berühmter professioneller Podcast-Drehbuchautor aus Deutschland. "
1298
- "Erstellen Sie hochwertige Diskussionen auf Deutsch, die dem Publikum "
1299
- "tiefgreifendes Fachwissen vermitteln. "
1300
- "Erstellen Sie genau 12 Gesprächsaustausche und antworten Sie nur im JSON-Format."
1301
- ),
1302
- "Spanish": (
1303
- "Eres un famoso guionista de podcast profesional español. "
1304
- "Crea discusiones de alta calidad en español que brinden al público "
1305
- "conocimientos profesionales profundos. "
1306
- "Crea exactamente 12 intercambios de conversación y responde solo en formato JSON."
1307
- ),
1308
- "Chinese": (
1309
- "您是中国著名的专业播客编剧。"
1310
- "创建高质量的中文讨论,为观众提供深入的专业知识。"
1311
- "创建恰好12次对话交换,仅以JSON格式回答。"
1312
- ),
1313
- "Russian": (
1314
- "Вы известный профессиональный сценарист подкастов из России. "
1315
- "Создавайте высококачественные дискуссии на русском языке, которые дают аудитории "
1316
- "глубокие профессиональные знания. "
1317
- "Создайте ровно 12 обменов разговором и отвечайте только в формате JSON."
1318
- )
1319
- }
1320
-
1321
- system_message = system_messages.get(language,
1322
- f"You are a professional podcast scriptwriter creating high-quality, "
1323
- f"insightful discussions in {language}. Create exactly 12 conversation exchanges "
1324
- f"with professional expertise. All dialogue must be in {language}. "
1325
- f"Respond only in JSON format."
1326
- )
1327
-
1328
- agent = LlamaCppAgent(
1329
- provider,
1330
- system_prompt=system_message,
1331
- predefined_messages_formatter_type=chat_template,
1332
- debug_output=False
1333
- )
1334
-
1335
- settings = provider.get_provider_default_settings()
1336
- settings.temperature = 0.75
1337
- settings.top_k = 40
1338
- settings.top_p = 0.95
1339
- settings.max_tokens = self.config.max_tokens
1340
- settings.repeat_penalty = 1.1
1341
- settings.stream = False
1342
-
1343
- messages = BasicChatHistory()
1344
-
1345
- prompt = self.prompt_builder.build_prompt(text, language, search_context)
1346
- response = agent.get_chat_response(
1347
- prompt,
1348
- llm_sampling_settings=settings,
1349
- chat_history=messages,
1350
- returns_streaming_generator=False,
1351
- print_output=False
1352
- )
1353
-
1354
- # JSON 파싱
1355
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
1356
- json_match = re.search(pattern, response)
1357
-
1358
- if json_match:
1359
- conversation_data = json.loads(json_match.group())
1360
- return conversation_data
1361
- else:
1362
- raise ValueError("No valid JSON found in local LLM response")
1363
-
1364
- except Exception as e:
1365
- print(f"Local LLM failed: {e}, falling back to legacy local method")
1366
- return self.extract_conversation_legacy_local(text, language, progress, search_context)
1367
-
1368
- @spaces.GPU(duration=120)
1369
- def extract_conversation_legacy_local(self, text: str, language: str = "English", progress=None, search_context: str = "") -> Dict:
1370
- """Extract conversation using legacy local model"""
1371
- try:
1372
- self.initialize_legacy_local_mode()
1373
-
1374
- # 언어별 시스템 메시지는 config_prompts에서 가져옴
1375
- messages = self.prompt_builder.build_messages_for_local(text, language, search_context)
1376
-
1377
- terminators = [
1378
- self.legacy_tokenizer.eos_token_id,
1379
- self.legacy_tokenizer.convert_tokens_to_ids("<|eot_id|>")
1380
- ]
1381
-
1382
- chat_messages = self.legacy_tokenizer.apply_chat_template(
1383
- messages, tokenize=False, add_generation_prompt=True
1384
- )
1385
- model_inputs = self.legacy_tokenizer([chat_messages], return_tensors="pt").to(self.device)
1386
-
1387
- streamer = TextIteratorStreamer(
1388
- self.legacy_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
1389
- )
1390
-
1391
- generate_kwargs = dict(
1392
- model_inputs,
1393
- streamer=streamer,
1394
- max_new_tokens=self.config.max_new_tokens,
1395
- do_sample=True,
1396
- temperature=0.75,
1397
- eos_token_id=terminators,
1398
- )
1399
-
1400
- t = Thread(target=self.legacy_local_model.generate, kwargs=generate_kwargs)
1401
- t.start()
1402
-
1403
- partial_text = ""
1404
- for new_text in streamer:
1405
- partial_text += new_text
1406
-
1407
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
1408
- json_match = re.search(pattern, partial_text)
1409
-
1410
- if json_match:
1411
- return json.loads(json_match.group())
1412
- else:
1413
- raise ValueError("No valid JSON found in legacy local response")
1414
-
1415
- except Exception as e:
1416
- print(f"Legacy local model also failed: {e}")
1417
- return DefaultConversations.get_conversation(language)
1418
-
1419
- def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
1420
- """Extract conversation using API"""
1421
- if not self.llm_client:
1422
- raise RuntimeError("API mode not initialized")
1423
-
1424
- try:
1425
- # 검색 컨텍스트 생성
1426
- search_context = ""
1427
- if BRAVE_KEY and not text.startswith("Keyword-based content:"):
1428
- try:
1429
- keywords = extract_keywords_for_search(text, language)
1430
- if keywords:
1431
- search_query = keywords[0] if language == "Korean" else f"{keywords[0]} latest news"
1432
- search_context = format_search_results(search_query)
1433
- print(f"Search context added for: {search_query}")
1434
- except Exception as e:
1435
- print(f"Search failed, continuing without context: {e}")
1436
-
1437
- # 메시지 빌드
1438
- messages = self.prompt_builder.build_messages_for_local(text, language, search_context)
1439
-
1440
- chat_completion = self.llm_client.chat.completions.create(
1441
- messages=messages,
1442
- model=self.config.api_model_name,
1443
- temperature=0.75,
1444
- )
1445
-
1446
- pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
1447
- json_match = re.search(pattern, chat_completion.choices[0].message.content)
1448
-
1449
- if not json_match:
1450
- raise ValueError("No valid JSON found in response")
1451
-
1452
- return json.loads(json_match.group())
1453
- except Exception as e:
1454
- raise RuntimeError(f"Failed to extract conversation: {e}")
1455
-
1456
- def parse_conversation_text(self, conversation_text: str) -> Dict:
1457
- """Parse conversation text back to JSON format"""
1458
- lines = conversation_text.strip().split('\n')
1459
- conversation_data = {"conversation": []}
1460
-
1461
- for line in lines:
1462
- if ':' in line:
1463
- speaker, text = line.split(':', 1)
1464
- conversation_data["conversation"].append({
1465
- "speaker": speaker.strip(),
1466
- "text": text.strip()
1467
- })
1468
-
1469
- return conversation_data
1470
-
1471
- async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
1472
- """Convert text to speech using Edge TTS"""
1473
- output_dir = Path(self._create_output_directory())
1474
- filenames = []
1475
-
1476
- try:
1477
- # 언어별 음성 설정
1478
- voices = EDGE_TTS_VOICES.get(language, EDGE_TTS_VOICES["English"])
1479
-
1480
- for i, turn in enumerate(conversation_json["conversation"]):
1481
- filename = output_dir / f"output_{i}.wav"
1482
- voice = voices[i % len(voices)]
1483
-
1484
- tmp_path = await self._generate_audio_edge(turn["text"], voice)
1485
- os.rename(tmp_path, filename)
1486
- filenames.append(str(filename))
1487
-
1488
- # Combine audio files
1489
- final_output = os.path.join(output_dir, "combined_output.wav")
1490
- self._combine_audio_files(filenames, final_output)
1491
-
1492
- # Generate conversation text
1493
- conversation_text = "\n".join(
1494
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
1495
- for i, turn in enumerate(conversation_json["conversation"])
1496
- )
1497
-
1498
- return final_output, conversation_text
1499
- except Exception as e:
1500
- raise RuntimeError(f"Failed to convert text to speech: {e}")
1501
-
1502
- async def _generate_audio_edge(self, text: str, voice: str) -> str:
1503
- """Generate audio using Edge TTS"""
1504
- if not text.strip():
1505
- raise ValueError("Text cannot be empty")
1506
-
1507
- voice_short_name = voice.split(" - ")[0] if " - " in voice else voice
1508
- communicate = edge_tts.Communicate(text, voice_short_name)
1509
-
1510
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
1511
- tmp_path = tmp_file.name
1512
- await communicate.save(tmp_path)
1513
-
1514
- return tmp_path
1515
-
1516
- @spaces.GPU(duration=60)
1517
- def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
1518
- """Convert text to speech using Spark TTS CLI"""
1519
- if not SPARK_AVAILABLE or not self.spark_model_dir:
1520
- raise RuntimeError("Spark TTS not available")
1521
-
1522
- try:
1523
- output_dir = self._create_output_directory()
1524
- audio_files = []
1525
-
1526
- # Create different voice characteristics for different speakers
1527
- speaker1, speaker2 = self.prompt_builder.get_speaker_names(language)
1528
-
1529
- if language == "Korean":
1530
- voice_configs = [
1531
- {"prompt_text": f"안녕하세요, 오늘 팟캐스트 진행을 맡은 {speaker1}입니다.", "gender": "male"},
1532
- {"prompt_text": f"안녕하세요, 저는 오늘 이 주제에 대해 설명드릴 {speaker2}입니다.", "gender": "male"}
1533
- ]
1534
- else:
1535
- voice_configs = [
1536
- {"prompt_text": f"Hello everyone, I'm {speaker1}, your host for today's podcast.", "gender": "male"},
1537
- {"prompt_text": f"Hi, I'm {speaker2}. I'm excited to share my insights with you.", "gender": "male"}
1538
- ]
1539
-
1540
- for i, turn in enumerate(conversation_json["conversation"]):
1541
- text = turn["text"]
1542
- if not text.strip():
1543
- continue
1544
-
1545
- voice_config = voice_configs[i % len(voice_configs)]
1546
- output_file = os.path.join(output_dir, f"spark_output_{i}.wav")
1547
-
1548
- cmd = [
1549
- "python", "-m", "cli.inference",
1550
- "--text", text,
1551
- "--device", "0" if torch.cuda.is_available() else "cpu",
1552
- "--save_dir", output_dir,
1553
- "--model_dir", self.spark_model_dir,
1554
- "--prompt_text", voice_config["prompt_text"],
1555
- "--output_name", f"spark_output_{i}.wav"
1556
- ]
1557
-
1558
- try:
1559
- result = subprocess.run(
1560
- cmd,
1561
- capture_output=True,
1562
- text=True,
1563
- timeout=60,
1564
- cwd="."
1565
- )
1566
-
1567
- if result.returncode == 0:
1568
- audio_files.append(output_file)
1569
- else:
1570
- print(f"Spark TTS error for turn {i}: {result.stderr}")
1571
- silence = np.zeros(int(22050 * 1.0))
1572
- sf.write(output_file, silence, 22050)
1573
- audio_files.append(output_file)
1574
-
1575
- except subprocess.TimeoutExpired:
1576
- print(f"Spark TTS timeout for turn {i}")
1577
- silence = np.zeros(int(22050 * 1.0))
1578
- sf.write(output_file, silence, 22050)
1579
- audio_files.append(output_file)
1580
- except Exception as e:
1581
- print(f"Error running Spark TTS for turn {i}: {e}")
1582
- silence = np.zeros(int(22050 * 1.0))
1583
- sf.write(output_file, silence, 22050)
1584
- audio_files.append(output_file)
1585
-
1586
- # Combine all audio files
1587
- if audio_files:
1588
- final_output = os.path.join(output_dir, "spark_combined.wav")
1589
- self._combine_audio_files(audio_files, final_output)
1590
- else:
1591
- raise RuntimeError("No audio files generated")
1592
-
1593
- conversation_text = "\n".join(
1594
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
1595
- for i, turn in enumerate(conversation_json["conversation"])
1596
- )
1597
-
1598
- return final_output, conversation_text
1599
-
1600
- except Exception as e:
1601
- raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
1602
-
1603
- @spaces.GPU(duration=60)
1604
- def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
1605
- """Convert text to speech using MeloTTS"""
1606
- if not MELO_AVAILABLE or not self.melo_models:
1607
- raise RuntimeError("MeloTTS not available")
1608
-
1609
- speakers = ["EN-Default", "EN-US"]
1610
- combined_audio = AudioSegment.empty()
1611
-
1612
- for i, turn in enumerate(conversation_json["conversation"]):
1613
- bio = io.BytesIO()
1614
- text = turn["text"]
1615
- speaker = speakers[i % 2]
1616
- speaker_id = self.melo_models["EN"].hps.data.spk2id[speaker]
1617
-
1618
- self.melo_models["EN"].tts_to_file(
1619
- text, speaker_id, bio, speed=1.0,
1620
- pbar=progress.tqdm if progress else None,
1621
- format="wav"
1622
- )
1623
-
1624
- bio.seek(0)
1625
- audio_segment = AudioSegment.from_file(bio, format="wav")
1626
- combined_audio += audio_segment
1627
-
1628
- final_audio_path = "melo_podcast.mp3"
1629
- combined_audio.export(final_audio_path, format="mp3")
1630
-
1631
- conversation_text = "\n".join(
1632
- f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
1633
- for i, turn in enumerate(
1634
-
1635
-
1636
-
1637
-
1638
-
1639
-
1640
-
1641
- conversation_json["conversation"])
1642
  )
1643
 
1644
  return final_audio_path, conversation_text
 
814
 
815
  conversation_text = "\n".join(
816
  f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
817
+ for i, turn in enumerate(conversation_json["conversation"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
818
  )
819
 
820
  return final_audio_path, conversation_text