from fastapi import FastAPI, Request, HTTPException from llama_cpp import Llama from huggingface_hub import hf_hub_download import os import platform import psutil import time import tiktoken # For estimating token count import logging # Import the logging module from pydantic import BaseModel, Field # === Configure Logging === logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) handler = logging.StreamHandler() formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) if not logger.handlers: logger.addHandler(handler) app = FastAPI( title="Data Analysis & News AI API", description="API for efficient news summarization and keyword extraction using local LLMs.", version="1.0.0" ) # === Model Config === # Recommended Model for 16GB RAM CPU: Mistral-7B-Instruct-v0.2 Q4_K_M # It offers a great balance of quality, speed, and memory footprint for your hardware. # You can uncomment other models to test them out. REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf" # Alternative: OpenHermes 2.5 Mistral 7B (also excellent instruction following) # REPO_ID = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF" # FILENAME = "openhermes-2.5-mistral-7b.Q4_K_M.gguf" # Alternative: Phi-3-mini (if you need extreme efficiency and speed, with good quality for its size) # REPO_ID = "microsoft/Phi-3-mini-4k-instruct-GGUF" # FILENAME = "phi-3-mini-4k-instruct-q4.gguf" # Often the standard Q4 for Phi-3 MODEL_DIR = "models" MODEL_PATH = os.path.join(MODEL_DIR, FILENAME) # === Download if model not available === if not os.path.exists(MODEL_PATH): logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...") try: model_path = hf_hub_download( repo_id=REPO_ID, filename=FILENAME, cache_dir=MODEL_DIR, local_dir=MODEL_DIR, local_dir_use_symlinks=False ) logger.info(f"✅ Model downloaded to: {model_path}") except Exception as e: logger.error(f"❌ Error downloading model: {e}") # Re-raise the exception or exit, as the app cannot function without the model raise RuntimeError(f"Failed to download model: {e}") else: logger.info(f"✅ Model already available at: {MODEL_PATH}") model_path = MODEL_PATH # === Optimal thread usage === # For llama.cpp on CPU, using physical cores is generally more efficient than logical cores (hyperthreading). physical_cores = psutil.cpu_count(logical=False) recommended_threads = max(1, physical_cores) # Ensure at least 1 thread logger.info(f"Detected physical cores: {physical_cores}, logical cores: {psutil.cpu_count(logical=True)}") logger.info(f"Using n_threads: {recommended_threads}") # === Load the model === try: llm = Llama( model_path=model_path, n_ctx=4096, # Increased context window for better summarization of news articles # 4096 is a good balance for 7B models on 16GB RAM. # Test with 8192 if you often process very long articles. n_threads=recommended_threads, n_batch=512, # Max batch size for prompt processing. Larger can be faster for long prompts. use_mlock=True, # Lock model in RAM for faster access, reducing disk I/O. n_gpu_layers=0, # CPU only, as specified. chat_format="chatml", # This works for many instruct models, including Mistral. verbose=False # Keep llama.cpp's internal verbose logging off ) logger.info("🚀 Llama model loaded successfully!") except Exception as e: logger.error(f"❌ Error loading Llama model: {e}") raise RuntimeError(f"Failed to load Llama model: {e}") # Initialize tiktoken encoder for token counting try: encoding = tiktoken.get_encoding("cl100k_base") except Exception: logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.") encoding = None def count_tokens_in_text(text): """Estimates tokens in a given text using tiktoken or simple char count.""" if encoding: return len(encoding.encode(text)) else: return len(text) // 4 # Rough estimate: 1 token ~ 4 characters # === Pydantic Models for API Request Bodies === class NewsArticle(BaseModel): article: str = Field(..., min_length=50, description="The full news article text to summarize.") num_sentences: int = Field(3, ge=1, le=10, description="Number of sentences for the summary (1-10).") max_tokens: int = Field(200, ge=50, le=500, description="Maximum tokens for the generated summary.") class TextForKeywords(BaseModel): text: str = Field(..., min_length=20, description="The text from which to extract keywords.") num_keywords: int = Field(5, ge=1, le=15, description="Number of keywords to extract (1-15).") max_tokens: int = Field(100, ge=30, le=200, description="Maximum tokens for the keyword output.") # === API Endpoints === @app.get("/") def root(): logger.info("Root endpoint accessed.") return {"message": "✅ Data Analysis AI API is live and optimized for speed and accuracy!"} @app.get("/get_sys") def get_sys_specs(): """Returns system specifications including CPU, RAM, and OS details.""" logger.info("System specs endpoint accessed.") memory = psutil.virtual_memory() return { "CPU": { "physical_cores": physical_cores, "logical_cores": psutil.cpu_count(logical=True), "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A", "cpu_usage_percent": psutil.cpu_percent(interval=1) }, "RAM": { "total_GB": round(memory.total / (1024 ** 3), 2), "available_GB": round(memory.available / (1024 ** 3), 2), "usage_percent": memory.percent }, "System": { "platform": platform.platform(), "architecture": platform.machine(), "python_version": platform.python_version() }, "Model_Config": { "model_name": FILENAME, "n_ctx": llm.n_ctx(), "n_threads": llm.n_threads(), "n_batch": llm.n_batch(), "use_mlock": llm.use_mlock(), "chat_format": llm.chat_format, "n_gpu_layers": llm.n_gpu_layers() } } @app.get("/process_list") def process_list(): """Returns a list of processes consuming significant CPU.""" logger.info("Process list endpoint accessed.") time.sleep(1) processes = [] for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']): try: cpu = proc.cpu_percent() mem = proc.memory_percent() if cpu > 5 or mem > 2: processes.append({ "pid": proc.pid, "name": proc.name(), "cpu_percent": round(cpu, 2), "memory_percent": round(mem, 2) }) except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): pass processes.sort(key=lambda x: x['cpu_percent'], reverse=True) return {"heavy_processes": processes} @app.post("/summarize_news") async def summarize_news(request_body: NewsArticle): """ Summarizes a given news article. """ logger.info("➡️ /summarize_news endpoint received a request.") # Define the system prompt - consistent for all LLM interactions system_prompt_content = ( "You are a highly efficient, objective, and precise Data and News analysis API. " "Your sole function is to process the provided text (data or news) and instructions, " "then output ONLY the requested analysis in the exact specified format. " "**Crucially, do NOT include any conversational text, greetings, introductions " "(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, " "or any remarks about being an AI. Respond directly with the content.** " "Adhere strictly to all formatting requirements given in the user's prompt " "(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). " "Focus exclusively on data insights, statistics, trends, influencing factors, " "and actionable recommendations if requested. Be concise, professional, and factual. " "If a request cannot be fulfilled due to data limitations or model capabilities, " "respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' " "No other text should be included." ) prompt = ( f"Summarize the following news article in {request_body.num_sentences} " "concise sentences, focusing on the main event, key actors, and outcome. " "Do not include any introductory phrases or conversational elements. " f"Article: {request_body.article}" ) messages_for_llm = [ {"role": "system", "content": system_prompt_content}, {"role": "user", "content": prompt} ] prompt_tokens = count_tokens_in_text(prompt) logger.info(f"🧾 Prompt received (first 100 chars): {prompt[:100]}...") logger.info(f"Tokens in prompt: {prompt_tokens}") try: response = llm.create_chat_completion( messages=messages_for_llm, max_tokens=request_body.max_tokens, temperature=0.7, stop=["", "<|im_end|>", "\n\n---"], top_p=0.9, top_k=40, repeat_penalty=1.1 ) ai_response_content = response["choices"][0]["message"]["content"].strip() response_token_count = count_tokens_in_text(ai_response_content) logger.info("✅ Response generated successfully.") return { "response": ai_response_content, "prompt_tokens": prompt_tokens, "response_token_count": response_token_count } except Exception as e: logger.error(f"❌ Error during generation: {e}", exc_info=True) raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.") @app.post("/extract_keywords") async def extract_keywords(request_body: TextForKeywords): """ Extracts keywords from a given text. """ logger.info("➡️ /extract_keywords endpoint received a request.") # Define the system prompt - consistent for all LLM interactions system_prompt_content = ( "You are a highly efficient, objective, and precise Data and News analysis API. " "Your sole function is to process the provided text (data or news) and instructions, " "then output ONLY the requested analysis in the exact specified format. " "**Crucially, do NOT include any conversational text, greetings, introductions " "(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, " "or any remarks about being an AI. Respond directly with the content.** " "Adhere strictly to all formatting requirements given in the user's prompt " "(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). " "Focus exclusively on data insights, statistics, trends, influencing factors, " "and actionable recommendations if requested. Be concise, professional, and factual. " "If a request cannot be fulfilled due to data limitations or model capabilities, " "respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' " "No other text should be included." ) # Modified prompt for clearer keyword extraction instruction prompt = ( f"Extract exactly {request_body.num_keywords} most important keywords from the following text. " "Your output should be ONLY the comma-separated list of keywords, nothing else. " "For example, if the keywords are 'apple', 'banana', 'cherry', your output should be: 'apple, banana, cherry'. " f"Text: {request_body.text}" ) messages_for_llm = [ {"role": "system", "content": system_prompt_content}, {"role": "user", "content": prompt} ] prompt_tokens = count_tokens_in_text(prompt) logger.info(f"🧾 Prompt received (first 100 chars): {prompt[:100]}...") logger.info(f"Tokens in prompt: {prompt_tokens}") try: response = llm.create_chat_completion( messages=messages_for_llm, max_tokens=request_body.max_tokens, temperature=0.7, stop=["", "<|im_end|>", "\n\n---"], top_p=0.9, top_k=40, repeat_penalty=1.1 ) ai_response_content = response["choices"][0]["message"]["content"].strip() response_token_count = count_tokens_in_text(ai_response_content) logger.info("✅ Response generated successfully.") return { "response": ai_response_content, "prompt_tokens": prompt_tokens, "response_token_count": response_token_count } except Exception as e: logger.error(f"❌ Error during generation: {e}", exc_info=True) raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")