from fastapi import FastAPI, Request, HTTPException
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken # For estimating token count
import logging # Import the logging module
from pydantic import BaseModel, Field
# === Configure Logging ===
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
if not logger.handlers:
logger.addHandler(handler)
app = FastAPI(
title="Data Analysis & News AI API",
description="API for efficient news summarization and keyword extraction using local LLMs.",
version="1.0.0"
)
# === Model Config ===
# Recommended Model for 16GB RAM CPU: Mistral-7B-Instruct-v0.2 Q4_K_M
# It offers a great balance of quality, speed, and memory footprint for your hardware.
# You can uncomment other models to test them out.
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
# Alternative: OpenHermes 2.5 Mistral 7B (also excellent instruction following)
# REPO_ID = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF"
# FILENAME = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"
# Alternative: Phi-3-mini (if you need extreme efficiency and speed, with good quality for its size)
# REPO_ID = "microsoft/Phi-3-mini-4k-instruct-GGUF"
# FILENAME = "phi-3-mini-4k-instruct-q4.gguf" # Often the standard Q4 for Phi-3
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
logger.info(f"β¬οΈ Downloading {FILENAME} from Hugging Face...")
try:
model_path = hf_hub_download(
repo_id=REPO_ID,
filename=FILENAME,
cache_dir=MODEL_DIR,
local_dir=MODEL_DIR,
local_dir_use_symlinks=False
)
logger.info(f"β
Model downloaded to: {model_path}")
except Exception as e:
logger.error(f"β Error downloading model: {e}")
# Re-raise the exception or exit, as the app cannot function without the model
raise RuntimeError(f"Failed to download model: {e}")
else:
logger.info(f"β
Model already available at: {MODEL_PATH}")
model_path = MODEL_PATH
# === Optimal thread usage ===
# For llama.cpp on CPU, using physical cores is generally more efficient than logical cores (hyperthreading).
# psutil.cpu_count(logical=False) can return None on some platforms, so fall back to 1.
physical_cores = psutil.cpu_count(logical=False) or 1
recommended_threads = max(1, physical_cores)  # Ensure at least 1 thread
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {psutil.cpu_count(logical=True)}")
logger.info(f"Using n_threads: {recommended_threads}")
# === Load the model ===
# Load parameters are kept as named constants so they can be reported accurately by /get_sys.
N_CTX = 4096            # Context window; 4096 is a good balance for 7B models on 16GB RAM.
                        # Try 8192 if you often process very long articles.
N_BATCH = 512           # Max batch size for prompt processing; larger can be faster for long prompts.
USE_MLOCK = True        # Lock the model in RAM for faster access, reducing disk I/O.
N_GPU_LAYERS = 0        # CPU only, as specified.
CHAT_FORMAT = "chatml"  # Works for many instruct models; "mistral-instruct" matches Mistral's native template more closely.
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_threads=recommended_threads,
        n_batch=N_BATCH,
        use_mlock=USE_MLOCK,
        n_gpu_layers=N_GPU_LAYERS,
        chat_format=CHAT_FORMAT,
        verbose=False  # Keep llama.cpp's internal verbose logging off.
    )
    logger.info("🚀 Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}")
    raise RuntimeError(f"Failed to load Llama model: {e}")
# Initialize tiktoken encoder for token counting
try:
encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
logger.warning("β οΈ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
encoding = None
def count_tokens_in_text(text):
"""Estimates tokens in a given text using tiktoken or simple char count."""
if encoding:
return len(encoding.encode(text))
else:
return len(text) // 4 # Rough estimate: 1 token ~ 4 characters
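# Illustrative note (not used by the API): the fallback estimate is deliberately coarse.
# For example, a 57-character headline gives 57 // 4 = 14 "tokens", whereas tiktoken
# would typically report roughly one token per word for plain English text.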
# === Pydantic Models for API Request Bodies ===
class NewsArticle(BaseModel):
article: str = Field(..., min_length=50, description="The full news article text to summarize.")
num_sentences: int = Field(3, ge=1, le=10, description="Number of sentences for the summary (1-10).")
max_tokens: int = Field(200, ge=50, le=500, description="Maximum tokens for the generated summary.")
class TextForKeywords(BaseModel):
text: str = Field(..., min_length=20, description="The text from which to extract keywords.")
num_keywords: int = Field(5, ge=1, le=15, description="Number of keywords to extract (1-15).")
max_tokens: int = Field(100, ge=30, le=200, description="Maximum tokens for the keyword output.")
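# Example request bodies for the two POST endpoints (illustrative only; field names,
# defaults, and bounds come from the Pydantic models above):
#
#   POST /summarize_news
#   {"article": "<full article text, at least 50 characters>", "num_sentences": 3, "max_tokens": 200}
#
#   POST /extract_keywords
#   {"text": "<text of at least 20 characters>", "num_keywords": 5, "max_tokens": 100}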
# === Shared System Prompt ===
# Defined once at module level and reused by both endpoints so every LLM call behaves identically.
SYSTEM_PROMPT_CONTENT = (
    "You are a highly efficient, objective, and precise Data and News analysis API. "
    "Your sole function is to process the provided text (data or news) and instructions, "
    "then output ONLY the requested analysis in the exact specified format. "
    "**Crucially, do NOT include any conversational text, greetings, introductions "
    "(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, "
    "or any remarks about being an AI. Respond directly with the content.** "
    "Adhere strictly to all formatting requirements given in the user's prompt "
    "(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). "
    "Focus exclusively on data insights, statistics, trends, influencing factors, "
    "and actionable recommendations if requested. Be concise, professional, and factual. "
    "If a request cannot be fulfilled due to data limitations or model capabilities, "
    "respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' "
    "No other text should be included."
)
# === API Endpoints ===
@app.get("/")
def root():
logger.info("Root endpoint accessed.")
return {"message": "β
Data Analysis AI API is live and optimized for speed and accuracy!"}
@app.get("/get_sys")
def get_sys_specs():
"""Returns system specifications including CPU, RAM, and OS details."""
logger.info("System specs endpoint accessed.")
memory = psutil.virtual_memory()
return {
"CPU": {
"physical_cores": physical_cores,
"logical_cores": psutil.cpu_count(logical=True),
"max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
"cpu_usage_percent": psutil.cpu_percent(interval=1)
},
"RAM": {
"total_GB": round(memory.total / (1024 ** 3), 2),
"available_GB": round(memory.available / (1024 ** 3), 2),
"usage_percent": memory.percent
},
"System": {
"platform": platform.platform(),
"architecture": platform.machine(),
"python_version": platform.python_version()
},
"Model_Config": {
"model_name": FILENAME,
"n_ctx": llm.n_ctx(),
"n_threads": llm.n_threads(),
"n_batch": llm.n_batch(),
"use_mlock": llm.use_mlock(),
"chat_format": llm.chat_format,
"n_gpu_layers": llm.n_gpu_layers()
}
}
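# Quick check (assuming the app is served locally on port 8000; adjust host/port to your deployment):
#   curl http://localhost:8000/get_sys
# Returns CPU, RAM, OS, and model-load configuration as JSON.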
@app.get("/process_list")
def process_list():
"""Returns a list of processes consuming significant CPU."""
logger.info("Process list endpoint accessed.")
time.sleep(1)
processes = []
for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
try:
cpu = proc.cpu_percent()
mem = proc.memory_percent()
if cpu > 5 or mem > 2:
processes.append({
"pid": proc.pid,
"name": proc.name(),
"cpu_percent": round(cpu, 2),
"memory_percent": round(mem, 2)
})
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
pass
processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
return {"heavy_processes": processes}
@app.post("/summarize_news")
async def summarize_news(request_body: NewsArticle):
"""
Summarizes a given news article.
"""
logger.info("β‘οΈ /summarize_news endpoint received a request.")
    # Use the shared system prompt defined at module level (consistent for all LLM interactions).
    system_prompt_content = SYSTEM_PROMPT_CONTENT
prompt = (
f"Summarize the following news article in {request_body.num_sentences} "
"concise sentences, focusing on the main event, key actors, and outcome. "
"Do not include any introductory phrases or conversational elements. "
f"Article: {request_body.article}"
)
messages_for_llm = [
{"role": "system", "content": system_prompt_content},
{"role": "user", "content": prompt}
]
prompt_tokens = count_tokens_in_text(prompt)
logger.info(f"π§Ύ Prompt received (first 100 chars): {prompt[:100]}...")
logger.info(f"Tokens in prompt: {prompt_tokens}")
try:
response = llm.create_chat_completion(
messages=messages_for_llm,
max_tokens=request_body.max_tokens,
temperature=0.7,
stop=["</s>", "<|im_end|>", "\n\n---"],
top_p=0.9,
top_k=40,
repeat_penalty=1.1
)
ai_response_content = response["choices"][0]["message"]["content"].strip()
response_token_count = count_tokens_in_text(ai_response_content)
logger.info("β
Response generated successfully.")
return {
"response": ai_response_content,
"prompt_tokens": prompt_tokens,
"response_token_count": response_token_count
}
except Exception as e:
logger.error(f"β Error during generation: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
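# Example call (assuming a local server on port 8000; adjust host/port to your deployment):
#   curl -X POST http://localhost:8000/summarize_news \
#        -H "Content-Type: application/json" \
#        -d '{"article": "<news article text>", "num_sentences": 3, "max_tokens": 200}'
# The JSON response contains the summary plus prompt and response token estimates.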
@app.post("/extract_keywords")
async def extract_keywords(request_body: TextForKeywords):
"""
Extracts keywords from a given text.
"""
logger.info("β‘οΈ /extract_keywords endpoint received a request.")
    # Use the shared system prompt defined at module level (consistent for all LLM interactions).
    system_prompt_content = SYSTEM_PROMPT_CONTENT
# Modified prompt for clearer keyword extraction instruction
prompt = (
f"Extract exactly {request_body.num_keywords} most important keywords from the following text. "
"Your output should be ONLY the comma-separated list of keywords, nothing else. "
"For example, if the keywords are 'apple', 'banana', 'cherry', your output should be: 'apple, banana, cherry'. "
f"Text: {request_body.text}"
)
messages_for_llm = [
{"role": "system", "content": system_prompt_content},
{"role": "user", "content": prompt}
]
prompt_tokens = count_tokens_in_text(prompt)
logger.info(f"π§Ύ Prompt received (first 100 chars): {prompt[:100]}...")
logger.info(f"Tokens in prompt: {prompt_tokens}")
try:
response = llm.create_chat_completion(
messages=messages_for_llm,
max_tokens=request_body.max_tokens,
temperature=0.7,
stop=["</s>", "<|im_end|>", "\n\n---"],
top_p=0.9,
top_k=40,
repeat_penalty=1.1
)
ai_response_content = response["choices"][0]["message"]["content"].strip()
response_token_count = count_tokens_in_text(ai_response_content)
logger.info("β
Response generated successfully.")
return {
"response": ai_response_content,
"prompt_tokens": prompt_tokens,
"response_token_count": response_token_count
}
except Exception as e:
logger.error(f"β Error during generation: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
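# Example call (assuming a local server on port 8000; adjust host/port to your deployment):
#   curl -X POST http://localhost:8000/extract_keywords \
#        -H "Content-Type: application/json" \
#        -d '{"text": "<text to analyze>", "num_keywords": 5, "max_tokens": 100}'

# Local run helper: a minimal sketch for development only. It assumes uvicorn is installed;
# on Hugging Face Spaces the server is normally started by the Space's own entrypoint
# (e.g. a Dockerfile CMD), so this block is just a convenience for running the file directly.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)  # 7860 is the port Spaces expects by default.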