from fastapi import FastAPI, Request, HTTPException
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken # For estimating token count
import logging # Import the logging module
from pydantic import BaseModel, Field
# === Configure Logging ===
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
if not logger.handlers:
logger.addHandler(handler)
app = FastAPI(
title="Data Analysis & News AI API",
description="API for efficient news summarization and keyword extraction using local LLMs.",
version="1.0.0"
)
# === Model Config ===
# Recommended Model for 16GB RAM CPU: Mistral-7B-Instruct-v0.2 Q4_K_M
# It offers a great balance of quality, speed, and memory footprint for your hardware.
# You can uncomment other models to test them out.
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
# Alternative: OpenHermes 2.5 Mistral 7B (also excellent instruction following)
# REPO_ID = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF"
# FILENAME = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"
# Alternative: Phi-3-mini (if you need extreme efficiency and speed, with good quality for its size)
# REPO_ID = "microsoft/Phi-3-mini-4k-instruct-GGUF"
# FILENAME = "phi-3-mini-4k-instruct-q4.gguf" # Often the standard Q4 for Phi-3
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
logger.info(f"β¬οΈ Downloading {FILENAME} from Hugging Face...")
try:
model_path = hf_hub_download(
repo_id=REPO_ID,
filename=FILENAME,
cache_dir=MODEL_DIR,
local_dir=MODEL_DIR,
local_dir_use_symlinks=False
)
logger.info(f"β
Model downloaded to: {model_path}")
except Exception as e:
logger.error(f"β Error downloading model: {e}")
# Re-raise the exception or exit, as the app cannot function without the model
raise RuntimeError(f"Failed to download model: {e}")
else:
logger.info(f"β
Model already available at: {MODEL_PATH}")
model_path = MODEL_PATH
# === Optimal thread usage ===
# For llama.cpp on CPU, using physical cores is generally more efficient than logical cores (hyperthreading).
# psutil.cpu_count(logical=False) can return None on some platforms, so fall back to 1.
physical_cores = psutil.cpu_count(logical=False) or 1
recommended_threads = max(1, physical_cores)  # Ensure at least 1 thread
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {psutil.cpu_count(logical=True)}")
logger.info(f"Using n_threads: {recommended_threads}")
# === Load the model ===
# Load parameters are kept as named constants so they can be reported accurately by /get_sys.
N_CTX = 4096            # Context window; 4096 is a good balance for 7B models on 16GB RAM.
                        # Try 8192 if you often process very long articles.
N_BATCH = 512           # Max batch size for prompt processing; larger can be faster for long prompts.
USE_MLOCK = True        # Lock the model in RAM for faster access, reducing disk I/O.
N_GPU_LAYERS = 0        # CPU only, as specified.
CHAT_FORMAT = "chatml"  # Works for many instruct models; "mistral-instruct" matches Mistral's native template more closely.
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_threads=recommended_threads,
        n_batch=N_BATCH,
        use_mlock=USE_MLOCK,
        n_gpu_layers=N_GPU_LAYERS,
        chat_format=CHAT_FORMAT,
        verbose=False  # Keep llama.cpp's internal verbose logging off.
    )
    logger.info("🚀 Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}")
    raise RuntimeError(f"Failed to load Llama model: {e}")
# Initialize tiktoken encoder for token counting
try:
encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
logger.warning("β οΈ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
encoding = None
def count_tokens_in_text(text):
"""Estimates tokens in a given text using tiktoken or simple char count."""
if encoding:
return len(encoding.encode(text))
else:
return len(text) // 4 # Rough estimate: 1 token ~ 4 characters
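# Illustrative note (not used by the API): the fallback estimate is deliberately coarse.
# For example, a 57-character headline gives 57 // 4 = 14 "tokens", whereas tiktoken
# would typically report roughly one token per word for plain English text.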
# === Pydantic Models for API Request Bodies ===
class NewsArticle(BaseModel):
article: str = Field(..., min_length=50, description="The full news article text to summarize.")
num_sentences: int = Field(3, ge=1, le=10, description="Number of sentences for the summary (1-10).")
max_tokens: int = Field(200, ge=50, le=500, description="Maximum tokens for the generated summary.")
class TextForKeywords(BaseModel):
text: str = Field(..., min_length=20, description="The text from which to extract keywords.")
num_keywords: int = Field(5, ge=1, le=15, description="Number of keywords to extract (1-15).")
max_tokens: int = Field(100, ge=30, le=200, description="Maximum tokens for the keyword output.")
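# Example request bodies for the two POST endpoints (illustrative only; field names,
# defaults, and bounds come from the Pydantic models above):
#
#   POST /summarize_news
#   {"article": "<full article text, at least 50 characters>", "num_sentences": 3, "max_tokens": 200}
#
#   POST /extract_keywords
#   {"text": "<text of at least 20 characters>", "num_keywords": 5, "max_tokens": 100}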
# === Shared System Prompt ===
# Defined once at module level and reused by both endpoints so every LLM call behaves identically.
SYSTEM_PROMPT_CONTENT = (
    "You are a highly efficient, objective, and precise Data and News analysis API. "
    "Your sole function is to process the provided text (data or news) and instructions, "
    "then output ONLY the requested analysis in the exact specified format. "
    "**Crucially, do NOT include any conversational text, greetings, introductions "
    "(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, "
    "or any remarks about being an AI. Respond directly with the content.** "
    "Adhere strictly to all formatting requirements given in the user's prompt "
    "(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). "
    "Focus exclusively on data insights, statistics, trends, influencing factors, "
    "and actionable recommendations if requested. Be concise, professional, and factual. "
    "If a request cannot be fulfilled due to data limitations or model capabilities, "
    "respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' "
    "No other text should be included."
)
# === API Endpoints ===
@app.get("/")
def root():
logger.info("Root endpoint accessed.")
return {"message": "β
Data Analysis AI API is live and optimized for speed and accuracy!"}
@app.get("/get_sys")
def get_sys_specs():
"""Returns system specifications including CPU, RAM, and OS details."""
logger.info("System specs endpoint accessed.")
memory = psutil.virtual_memory()
return {
"CPU": {
"physical_cores": physical_cores,
"logical_cores": psutil.cpu_count(logical=True),
"max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
"cpu_usage_percent": psutil.cpu_percent(interval=1)
},
"RAM": {
"total_GB": round(memory.total / (1024 ** 3), 2),
"available_GB": round(memory.available / (1024 ** 3), 2),
"usage_percent": memory.percent
},
"System": {
"platform": platform.platform(),
"architecture": platform.machine(),
"python_version": platform.python_version()
},
"Model_Config": {
"model_name": FILENAME,
"n_ctx": llm.n_ctx(),
"n_threads": llm.n_threads(),
"n_batch": llm.n_batch(),
"use_mlock": llm.use_mlock(),
"chat_format": llm.chat_format,
"n_gpu_layers": llm.n_gpu_layers()
}
}
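# Quick check (assuming the app is served locally on port 8000; adjust host/port to your deployment):
#   curl http://localhost:8000/get_sys
# Returns CPU, RAM, OS, and model-load configuration as JSON.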
@app.get("/process_list")
def process_list():
"""Returns a list of processes consuming significant CPU."""
logger.info("Process list endpoint accessed.")
time.sleep(1)
processes = []
for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
try:
cpu = proc.cpu_percent()
mem = proc.memory_percent()
if cpu > 5 or mem > 2:
processes.append({
"pid": proc.pid,
"name": proc.name(),
"cpu_percent": round(cpu, 2),
"memory_percent": round(mem, 2)
})
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
pass
processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
return {"heavy_processes": processes}
@app.post("/summarize_news")
async def summarize_news(request_body: NewsArticle):
"""
Summarizes a given news article.
"""
logger.info("β‘οΈ /summarize_news endpoint received a request.")
    # Use the shared system prompt defined at module level (consistent for all LLM interactions).
    system_prompt_content = SYSTEM_PROMPT_CONTENT
prompt = (
f"Summarize the following news article in {request_body.num_sentences} "
"concise sentences, focusing on the main event, key actors, and outcome. "
"Do not include any introductory phrases or conversational elements. "
f"Article: {request_body.article}"
)
messages_for_llm = [
{"role": "system", "content": system_prompt_content},
{"role": "user", "content": prompt}
]
prompt_tokens = count_tokens_in_text(prompt)
logger.info(f"π§Ύ Prompt received (first 100 chars): {prompt[:100]}...")
logger.info(f"Tokens in prompt: {prompt_tokens}")
try:
response = llm.create_chat_completion(
messages=messages_for_llm,
max_tokens=request_body.max_tokens,
temperature=0.7,
stop=["</s>", "<|im_end|>", "\n\n---"],
top_p=0.9,
top_k=40,
repeat_penalty=1.1
)
ai_response_content = response["choices"][0]["message"]["content"].strip()
response_token_count = count_tokens_in_text(ai_response_content)
logger.info("β
Response generated successfully.")
return {
"response": ai_response_content,
"prompt_tokens": prompt_tokens,
"response_token_count": response_token_count
}
except Exception as e:
logger.error(f"β Error during generation: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
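# Example call (assuming a local server on port 8000; adjust host/port to your deployment):
#   curl -X POST http://localhost:8000/summarize_news \
#        -H "Content-Type: application/json" \
#        -d '{"article": "<news article text>", "num_sentences": 3, "max_tokens": 200}'
# The JSON response contains the summary plus prompt and response token estimates.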
@app.post("/extract_keywords")
async def extract_keywords(request_body: TextForKeywords):
"""
Extracts keywords from a given text.
"""
logger.info("β‘οΈ /extract_keywords endpoint received a request.")
    # Use the shared system prompt defined at module level (consistent for all LLM interactions).
    system_prompt_content = SYSTEM_PROMPT_CONTENT
# Modified prompt for clearer keyword extraction instruction
prompt = (
f"Extract exactly {request_body.num_keywords} most important keywords from the following text. "
"Your output should be ONLY the comma-separated list of keywords, nothing else. "
"For example, if the keywords are 'apple', 'banana', 'cherry', your output should be: 'apple, banana, cherry'. "
f"Text: {request_body.text}"
)
messages_for_llm = [
{"role": "system", "content": system_prompt_content},
{"role": "user", "content": prompt}
]
prompt_tokens = count_tokens_in_text(prompt)
logger.info(f"π§Ύ Prompt received (first 100 chars): {prompt[:100]}...")
logger.info(f"Tokens in prompt: {prompt_tokens}")
try:
response = llm.create_chat_completion(
messages=messages_for_llm,
max_tokens=request_body.max_tokens,
temperature=0.7,
stop=["</s>", "<|im_end|>", "\n\n---"],
top_p=0.9,
top_k=40,
repeat_penalty=1.1
)
ai_response_content = response["choices"][0]["message"]["content"].strip()
response_token_count = count_tokens_in_text(ai_response_content)
logger.info("β
Response generated successfully.")
return {
"response": ai_response_content,
"prompt_tokens": prompt_tokens,
"response_token_count": response_token_count
}
except Exception as e:
logger.error(f"β Error during generation: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
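# Example call (assuming a local server on port 8000; adjust host/port to your deployment):
#   curl -X POST http://localhost:8000/extract_keywords \
#        -H "Content-Type: application/json" \
#        -d '{"text": "<text to analyze>", "num_keywords": 5, "max_tokens": 100}'

# Local run helper: a minimal sketch for development only. It assumes uvicorn is installed;
# on Hugging Face Spaces the server is normally started by the Space's own entrypoint
# (e.g. a Dockerfile CMD), so this block is just a convenience for running the file directly.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)  # 7860 is the port Spaces expects by default.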