# Spaces: Running
import os
import time

from openai import OpenAI

# Shared client for the existing Hugging Face endpoint, reached through the
# OpenAI client class. The token is read from the HF_TOKEN environment
# variable; os.getenv returns None when it is unset.
# NOTE(review): confirm how the OpenAI client behaves when api_key is None.
client = OpenAI(
    base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
    api_key=os.getenv("HF_TOKEN"),
)


def analyze_with_model(prompt):
    """Stream the model's reply to *prompt* as a generator of text fragments.

    Sends ``prompt`` as a single user message through the module-level
    ``client`` with streaming enabled and yields every non-empty content
    delta as it arrives. Exceptions are not propagated to the caller: any
    ``Exception`` raised while creating or reading the stream is turned into
    one yielded string that starts with ``"Error during analysis:"``.

    Args:
        prompt: Text used as the ``content`` of the user message.

    Yields:
        str: Successive fragments of generated text, or a single
        human-readable error message if the request fails.
    """
    response = None
    try:
        response = client.chat.completions.create(
            model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
            messages=[{"role": "user", "content": prompt}],
            stream=True,  # Enable streaming for real-time responses
            temperature=0.7,
            max_tokens=8192,
            timeout=120,  # Seconds; long generations need a generous limit
        )
        for chunk in response:
            # A chunk may arrive with an empty ``choices`` list; indexing it
            # would raise IndexError and abort an otherwise healthy stream.
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content:
                yield content
                time.sleep(0.01)  # Smooth out the stream
    except Exception as e:
        error_msg = str(e)
        lowered = error_msg.lower()
        # Exceptions that carry an HTTP status expose it as ``status_code``;
        # prefer it over substring matching when it is available.
        status = getattr(e, "status_code", None)

        if status == 503 or "503" in error_msg:
            yield (
                "Error during analysis: Service temporarily unavailable (503). "
                "The model server is likely initializing. "
                f"Please wait 5 minutes and try again. Details: {error_msg}"
            )
        elif "timeout" in lowered or "timed out" in lowered:
            # Timeout errors may be worded "Request timed out.", which the
            # bare "timeout" check does not match.
            yield (
                "Error during analysis: Request timed out. "
                "The model server may be initializing. "
                f"Please wait 5 minutes and try again. Details: {error_msg}"
            )
        elif "connection" in lowered:
            yield (
                "Error during analysis: Connection error. "
                "The model server may be initializing. "
                f"Please wait 5 minutes and try again. Details: {error_msg}"
            )
        elif status == 429 or "limit" in lowered:
            yield (
                "Error during analysis: Rate limit exceeded. "
                f"Please wait a moment and try again. Details: {error_msg}"
            )
        else:
            yield f"Error during analysis: {error_msg}"
    finally:
        # Release the underlying connection even when the consumer stops
        # iterating early (generator closed) or an error interrupted the read.
        close = getattr(response, "close", None)
        if callable(close):
            try:
                close()
            except Exception:
                # Best-effort cleanup: a failure to close must not replace
                # the result or error already delivered to the caller.
                pass