from fastapi import FastAPI, Request, HTTPException
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken # For estimating token count
import logging  # Application logging is configured below
from pydantic import BaseModel, Field

# === Configure Logging ===
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
if not logger.handlers:
    logger.addHandler(handler)

app = FastAPI(
    title="Data Analysis & News AI API",
    description="API for efficient news summarization and keyword extraction using local LLMs.",
    version="1.0.0"
)

# === Model Config ===
# Recommended Model for 16GB RAM CPU: Mistral-7B-Instruct-v0.2 Q4_K_M
# It offers a great balance of quality, speed, and memory footprint for your hardware.
# You can uncomment other models to test them out.
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Alternative: OpenHermes 2.5 Mistral 7B (also excellent instruction following)
# REPO_ID = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF"
# FILENAME = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"

# Alternative: Phi-3-mini (if you need extreme efficiency and speed, with good quality for its size)
# REPO_ID = "microsoft/Phi-3-mini-4k-instruct-GGUF"
# FILENAME = "phi-3-mini-4k-instruct-q4.gguf" # Often the standard Q4 for Phi-3

MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False  # deprecated and ignored by newer huggingface_hub releases; harmless to keep
        )
        logger.info(f"βœ… Model downloaded to: {model_path}")
    except Exception as e:
        logger.error(f"❌ Error downloading model: {e}")
        # Re-raise the exception or exit, as the app cannot function without the model
        raise RuntimeError(f"Failed to download model: {e}")
else:
    logger.info(f"βœ… Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Optimal thread usage ===
# For llama.cpp on CPU, using physical cores is generally more efficient than logical cores (hyperthreading).
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = max(1, physical_cores or 1)  # psutil may return None; ensure at least 1 thread
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {psutil.cpu_count(logical=True)}")
logger.info(f"Using n_threads: {recommended_threads}")

# === Load the model ===
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=4096,  # Increased context window for better summarization of news articles
                     # 4096 is a good balance for 7B models on 16GB RAM.
                     # Test with 8192 if you often process very long articles.
        n_threads=recommended_threads,
        n_batch=512, # Max batch size for prompt processing. Larger can be faster for long prompts.
        use_mlock=True,  # Lock model in RAM for faster access, reducing disk I/O.
        n_gpu_layers=0,  # CPU only, as specified.
        chat_format="chatml", # This works for many instruct models, including Mistral.
        verbose=False # Keep llama.cpp's internal verbose logging off
    )
    logger.info("πŸš€ Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}")
    raise RuntimeError(f"Failed to load Llama model: {e}")

# Initialize tiktoken encoder for token counting
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
    encoding = None

def count_tokens_in_text(text):
    """Estimates tokens in a given text using tiktoken or simple char count."""
    if encoding:
        return len(encoding.encode(text))
    else:
        return len(text) // 4 # Rough estimate: 1 token ~ 4 characters
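
# Note: cl100k_base is OpenAI's tokenizer and only approximates the tokenizer of the GGUF
# model loaded above, so these counts are estimates intended for logging, not exact budgeting.
# Illustrative call (the returned value is approximate and tokenizer-dependent):
#   count_tokens_in_text("Summarize this article in three sentences.")  # -> a small integer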

# === Pydantic Models for API Request Bodies ===
class NewsArticle(BaseModel):
    article: str = Field(..., min_length=50, description="The full news article text to summarize.")
    num_sentences: int = Field(3, ge=1, le=10, description="Number of sentences for the summary (1-10).")
    max_tokens: int = Field(200, ge=50, le=500, description="Maximum tokens for the generated summary.")

class TextForKeywords(BaseModel):
    text: str = Field(..., min_length=20, description="The text from which to extract keywords.")
    num_keywords: int = Field(5, ge=1, le=15, description="Number of keywords to extract (1-15).")
    max_tokens: int = Field(100, ge=30, le=200, description="Maximum tokens for the keyword output.")
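
# Illustrative request bodies for the two POST endpoints (values are examples only;
# field constraints are enforced by the Pydantic models above):
#   POST /summarize_news    {"article": "<full article text>", "num_sentences": 3, "max_tokens": 200}
#   POST /extract_keywords  {"text": "<text to analyse>", "num_keywords": 5, "max_tokens": 100}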

# === API Endpoints ===

@app.get("/")
def root():
    logger.info("Root endpoint accessed.")
    return {"message": "βœ… Data Analysis AI API is live and optimized for speed and accuracy!"}

@app.get("/get_sys")
def get_sys_specs():
    """Returns system specifications including CPU, RAM, and OS details."""
    logger.info("System specs endpoint accessed.")
    memory = psutil.virtual_memory()
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": psutil.cpu_count(logical=True),
            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1)
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
        "Model_Config": {
            "model_name": FILENAME,
            "n_ctx": llm.n_ctx(),
            "n_threads": llm.n_threads(),
            "n_batch": llm.n_batch(),
            "use_mlock": llm.use_mlock(),
            "chat_format": llm.chat_format,
            "n_gpu_layers": llm.n_gpu_layers()
        }
    }

@app.get("/process_list")
def process_list():
    """Returns a list of processes consuming significant CPU."""
    logger.info("Process list endpoint accessed.")
    time.sleep(1)
    processes = []
    for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
        try:
            cpu = proc.cpu_percent()
            mem = proc.memory_percent()
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}

@app.post("/summarize_news")
async def summarize_news(request_body: NewsArticle):
    """
    Summarizes a given news article.
    """
    logger.info("➑️ /summarize_news endpoint received a request.")

    # Define the system prompt - consistent for all LLM interactions
    system_prompt_content = (
        "You are a highly efficient, objective, and precise Data and News analysis API. "
        "Your sole function is to process the provided text (data or news) and instructions, "
        "then output ONLY the requested analysis in the exact specified format. "
        "**Crucially, do NOT include any conversational text, greetings, introductions "
        "(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, "
        "or any remarks about being an AI. Respond directly with the content.** "
        "Adhere strictly to all formatting requirements given in the user's prompt "
        "(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). "
        "Focus exclusively on data insights, statistics, trends, influencing factors, "
        "and actionable recommendations if requested. Be concise, professional, and factual. "
        "If a request cannot be fulfilled due to data limitations or model capabilities, "
        "respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' "
        "No other text should be included."
    )

    prompt = (
        f"Summarize the following news article in {request_body.num_sentences} "
        "concise sentences, focusing on the main event, key actors, and outcome. "
        "Do not include any introductory phrases or conversational elements. "
        f"Article: {request_body.article}"
    )

    messages_for_llm = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": prompt}
    ]

    prompt_tokens = count_tokens_in_text(prompt)
    logger.info(f"🧾 Prompt received (first 100 chars): {prompt[:100]}...")
    logger.info(f"Tokens in prompt: {prompt_tokens}")

    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=request_body.max_tokens,
            temperature=0.7,
            stop=["</s>", "<|im_end|>", "\n\n---"],
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        response_token_count = count_tokens_in_text(ai_response_content)

        logger.info("βœ… Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"❌ Error during generation: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")

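# Example invocation (sketch; assumes the API is served locally on port 8000 and the
# JSON payload satisfies the NewsArticle model above):
#   curl -X POST http://localhost:8000/summarize_news \
#        -H "Content-Type: application/json" \
#        -d '{"article": "<article text of at least 50 characters>", "num_sentences": 3, "max_tokens": 200}'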

@app.post("/extract_keywords")
async def extract_keywords(request_body: TextForKeywords):
    """
    Extracts keywords from a given text.
    """
    logger.info("➑️ /extract_keywords endpoint received a request.")

    # Define the system prompt - consistent for all LLM interactions
    system_prompt_content = (
        "You are a highly efficient, objective, and precise Data and News analysis API. "
        "Your sole function is to process the provided text (data or news) and instructions, "
        "then output ONLY the requested analysis in the exact specified format. "
        "**Crucially, do NOT include any conversational text, greetings, introductions "
        "(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, "
        "or any remarks about being an AI. Respond directly with the content.** "
        "Adhere strictly to all formatting requirements given in the user's prompt "
        "(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). "
        "Focus exclusively on data insights, statistics, trends, influencing factors, "
        "and actionable recommendations if requested. Be concise, professional, and factual. "
        "If a request cannot be fulfilled due to data limitations or model capabilities, "
        "respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' "
        "No other text should be included."
    )

    # Modified prompt for clearer keyword extraction instruction
    prompt = (
        f"Extract exactly {request_body.num_keywords} most important keywords from the following text. "
        "Your output should be ONLY the comma-separated list of keywords, nothing else. "
        "For example, if the keywords are 'apple', 'banana', 'cherry', your output should be: 'apple, banana, cherry'. "
        f"Text: {request_body.text}"
    )

    messages_for_llm = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": prompt}
    ]

    prompt_tokens = count_tokens_in_text(prompt)
    logger.info(f"🧾 Prompt received (first 100 chars): {prompt[:100]}...")
    logger.info(f"Tokens in prompt: {prompt_tokens}")

    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=request_body.max_tokens,
            temperature=0.7,
            stop=["</s>", "<|im_end|>", "\n\n---"],
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        response_token_count = count_tokens_in_text(ai_response_content)

        logger.info("βœ… Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"❌ Error during generation: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
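
# Example invocation (sketch; assumes the API is served locally on port 8000):
#   curl -X POST http://localhost:8000/extract_keywords \
#        -H "Content-Type: application/json" \
#        -d '{"text": "<text of at least 20 characters>", "num_keywords": 5, "max_tokens": 100}'

# Convenience entry point for running this file directly. A minimal sketch that assumes
# uvicorn is installed and that this module is executed as a script; in deployment the app
# may instead be launched with `uvicorn <module_name>:app --host 0.0.0.0 --port 8000`.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)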