# modules/analyzer.py
from openai import OpenAI
import os
import time
# Use your existing Hugging Face endpoint
client = OpenAI(
    base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    api_key=os.getenv("HF_TOKEN"),
)
def analyze_with_model(prompt):
    """Analyze a prompt with the LLM, yielding response chunks for streaming."""
    try:
        # Use the Hugging Face Inference API with proper streaming
        response = client.chat.completions.create(
            model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
            messages=[{"role": "user", "content": prompt}],
            stream=True,  # Enable streaming for real-time responses
            temperature=0.7,
            max_tokens=8192,  # Increased token limit
            timeout=120,  # Increased timeout for longer responses
        )
        # Stream the response chunks as they arrive
        for chunk in response:
            # Some streams include keep-alive chunks with no choices; skip them
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                yield content
                time.sleep(0.01)  # Smooth out the stream
    except Exception as e:
        error_msg = str(e)
        # Enhanced error detection for common Hugging Face issues
        if "503" in error_msg:
            yield f"Error during analysis: Service temporarily unavailable (503). The model server is likely initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "timeout" in error_msg.lower():
            yield f"Error during analysis: Request timed out. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "connection" in error_msg.lower():
            yield f"Error during analysis: Connection error. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
        elif "limit" in error_msg.lower():
            yield f"Error during analysis: Rate limit exceeded. Please wait a moment and try again. Details: {error_msg}"
        else:
            yield f"Error during analysis: {error_msg}"