# "Spaces: Running" — Hugging Face Spaces status-banner text captured when this
# file was copied from the Spaces UI; not part of the program. Kept here as a
# comment so the module parses.
import json
import logging
import os
from typing import Any, Dict, List, Optional

from dotenv import load_dotenv
from openai import OpenAI

logger = logging.getLogger(__name__)

# Load variables from a local .env file into the environment
# (no-op if the file is absent).
load_dotenv()

# SECURITY: report only the *presence* of the key — never echo the secret
# itself to stdout or logs.
if os.getenv("OPENAI_API_KEY"):
    logger.info("OPENAI_API_KEY found in environment")
else:
    logger.warning("OPENAI_API_KEY not set; OpenAI calls will fail")
class LLMService:
    """Service for interacting with OpenAI LLM to process and consolidate scraped data"""

    def __init__(self, model_name: str = "gpt-4o"):
        """
        Initialize LLM service

        Args:
            model_name: Name of the OpenAI model to use (default: gpt-4o)
        """
        # the newest OpenAI model is "gpt-4o" which was released May 13, 2024.
        # do not change this unless explicitly requested by the user
        self.model_name = model_name
        self.api_key = os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            logger.warning("OpenAI API key not found in environment variables")
        self.client = OpenAI(api_key=self.api_key)

    # This method will be implemented in api/horoscope_routes.py
    def consolidate_horoscopes(self, horoscope_data):
        """Placeholder method for consolidating horoscopes"""
        return {"error": "Method not implemented"}

    def _chat_json(self, system_prompt: str, user_prompt: str,
                   temperature: float) -> Dict[str, Any]:
        """
        Send one system+user exchange in JSON mode and parse the reply.

        Shared by consolidate_data() and extract_key_information(), which
        previously duplicated this call/parse logic verbatim.

        Args:
            system_prompt: Content for the "system" role message
            user_prompt: Content for the "user" role message
            temperature: Sampling temperature for the completion

        Returns:
            The parsed JSON object, or {"error": ...} on an empty reply.

        Raises:
            Propagates API and json.loads errors to the caller, which wraps
            them in its own error dict.
        """
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            response_format={"type": "json_object"},
            temperature=temperature,
        )
        content = response.choices[0].message.content
        if content:
            return json.loads(content)
        return {"error": "Empty response from LLM"}

    def consolidate_data(self, scraped_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Consolidate data from multiple sources using LLM

        Args:
            scraped_data: List of scraped data from different sources

        Returns:
            Consolidated information as a dictionary
        """
        if not scraped_data:
            return {"error": "No data provided for consolidation"}
        try:
            # Prepare data for LLM; cap each source at 2000 chars to bound
            # the prompt size, marking truncation with "..." only when it
            # actually occurred.
            sources_text = ""
            for i, data in enumerate(scraped_data, 1):
                source_type = data.get("type", "unknown")
                title = data.get("title", "Unknown Title")
                source = data.get("source", "Unknown Source")
                text = data.get("text_content", "No content available")
                suffix = "..." if len(text) > 2000 else ""
                sources_text += f"SOURCE {i} ({source_type} from {source}):\n"
                sources_text += f"Title: {title}\n"
                sources_text += f"Content: {text[:2000]}{suffix}\n\n"

            # Create prompt for consolidation
            prompt = f"""
            Please analyze and consolidate the following information from multiple sources.

            {sources_text}

            Provide a comprehensive consolidation of this information in JSON format with the following structure:
            {{
                "main_topics": [list of main topics covered],
                "key_points": [list of key factual points from all sources],
                "summary": "A 2-3 paragraph summary that synthesizes the information",
                "analysis": "Brief analysis of the information and any discrepancies between sources",
                "sources": [list of sources used]
            }}

            Only include factual information present in the sources. Do not add any speculative or additional information.
            """

            return self._chat_json(
                "You are a data analysis expert specializing in consolidating information from multiple sources.",
                prompt,
                temperature=0.2,
            )
        except Exception as e:
            logger.error(f"Error consolidating data with LLM: {str(e)}")
            return {"error": f"Failed to consolidate data: {str(e)}"}

    def summarize_content(self, text: str, max_length: int = 500) -> str:
        """
        Summarize a single piece of content

        Args:
            text: Text to summarize
            max_length: Maximum length of summary in characters

        Returns:
            Summarized text (never None; an error string on failure)
        """
        if not text:
            return "No content to summarize"
        try:
            # Input capped at 10000 chars to bound the prompt size.
            prompt = f"""
            Please summarize the following text concisely in no more than {max_length} characters,
            while maintaining all key information:

            {text[:10000]}
            """
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "You are a summarization expert."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,
                max_tokens=max_length // 2,  # Approximate token count
            )
            # message.content is Optional in the SDK; honor the declared
            # -> str return type instead of leaking None to callers.
            content = response.choices[0].message.content
            if not content:
                return "Empty response from LLM"
            return content
        except Exception as e:
            logger.error(f"Error summarizing content with LLM: {str(e)}")
            return f"Failed to summarize content: {str(e)}"

    def extract_key_information(self, text: str, info_type: Optional[str] = None) -> Dict[str, Any]:
        """
        Extract specific type of information from content

        Args:
            text: Text to extract information from
            info_type: Type of information to extract (e.g., "news", "product", "research")

        Returns:
            Extracted information as dictionary
        """
        if not text:
            return {"error": "No content provided"}
        try:
            type_instruction = ""
            if info_type:
                type_instruction = f"This is {info_type} content. "
            # Input capped at 8000 chars to bound the prompt size.
            prompt = f"""
            {type_instruction}Please extract key structured information from the following text.
            Return the result as a JSON object with appropriate fields based on the content type.

            {text[:8000]}
            """
            return self._chat_json(
                "You are a data extraction expert.",
                prompt,
                temperature=0.1,
            )
        except Exception as e:
            logger.error(f"Error extracting information with LLM: {str(e)}")
            return {"error": f"Failed to extract information: {str(e)}"}
# Create a singleton instance
# NOTE(review): this runs at import time and constructs an OpenAI client
# immediately, so OPENAI_API_KEY must already be in the environment (it is
# loaded via load_dotenv() earlier in this module) — confirm import order
# if this module is ever imported before the .env is in place.
llm_service = LLMService()