from openai import OpenAI
import os
import time

# Use your existing Hugging Face endpoint
client = OpenAI(
    base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    api_key=os.getenv("HF_TOKEN"),
)


def analyze_with_model(prompt):
    """Send the prompt to the model and yield response text as it streams in."""
    try:
        # Use the Hugging Face Inference API with streaming enabled
        response = client.chat.completions.create(
            model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
            messages=[{"role": "user", "content": prompt}],
            stream=True,       # Enable streaming for real-time responses
            temperature=0.7,
            max_tokens=8192,   # Increased token limit
            timeout=120,       # Increased timeout for longer responses
        )

        # Stream the response chunks
        for chunk in response:
            if not chunk.choices:  # Guard against keep-alive/metadata chunks with no choices
                continue
            content = chunk.choices[0].delta.content
            if content:
                yield content
                time.sleep(0.01)  # Smooth out the stream
    except Exception as e:
        error_msg = str(e)
        # Enhanced error detection for common Hugging Face issues
        if "503" in error_msg:
            yield f"Error during analysis: Service temporarily unavailable (503). The model server is likely initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "timeout" in error_msg.lower():
            yield f"Error during analysis: Request timed out. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "connection" in error_msg.lower():
            yield f"Error during analysis: Connection error. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "limit" in error_msg.lower():
            yield f"Error during analysis: Rate limit exceeded. Please wait a moment and try again. Details: {error_msg}"
        else:
            yield f"Error during analysis: {error_msg}"
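
A minimal usage sketch for the generator above. The `__main__` guard and the sample prompt are illustrative additions, not part of the original; it assumes `HF_TOKEN` is set in the environment and the endpoint is running:

# Minimal usage sketch (illustrative): consume the generator and print
# text chunks as they arrive, without buffering the whole response.
if __name__ == "__main__":
    for piece in analyze_with_model("Summarize the benefits of streaming LLM responses."):
        print(piece, end="", flush=True)
    print()

Because `analyze_with_model` yields error messages through the same generator instead of raising, the caller loop above handles both success and failure paths uniformly.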