from openai import OpenAI
import os
import time

# Use your existing Hugging Face endpoint
client = OpenAI(
    base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    api_key=os.getenv("HF_TOKEN"),
)


def analyze_with_model(prompt):
    """Send the prompt to the model and yield response text as it streams in."""
    try:
        # Use the Hugging Face Inference API with streaming enabled
        response = client.chat.completions.create(
            model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
            messages=[{"role": "user", "content": prompt}],
            stream=True,       # Enable streaming for real-time responses
            temperature=0.7,
            max_tokens=8192,   # Increased token limit
            timeout=120,       # Increased timeout for longer responses
        )

        # Stream the response chunks
        for chunk in response:
            if not chunk.choices:  # Guard against keep-alive/metadata chunks with no choices
                continue
            content = chunk.choices[0].delta.content
            if content:
                yield content
                time.sleep(0.01)  # Smooth out the stream
    except Exception as e:
        error_msg = str(e)
        # Enhanced error detection for common Hugging Face issues
        if "503" in error_msg:
            yield f"Error during analysis: Service temporarily unavailable (503). The model server is likely initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "timeout" in error_msg.lower():
            yield f"Error during analysis: Request timed out. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "connection" in error_msg.lower():
            yield f"Error during analysis: Connection error. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "limit" in error_msg.lower():
            yield f"Error during analysis: Rate limit exceeded. Please wait a moment and try again. Details: {error_msg}"
        else:
            yield f"Error during analysis: {error_msg}"
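
A minimal usage sketch for the generator above. The `__main__` guard and the sample prompt are illustrative additions, not part of the original; it assumes `HF_TOKEN` is set in the environment and the endpoint is running:

# Minimal usage sketch (illustrative): consume the generator and print
# text chunks as they arrive, without buffering the whole response.
if __name__ == "__main__":
    for piece in analyze_with_model("Summarize the benefits of streaming LLM responses."):
        print(piece, end="", flush=True)
    print()

Because `analyze_with_model` yields error messages through the same generator instead of raising, the caller loop above handles both success and failure paths uniformly.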