# news/api.py
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import concurrent.futures  # ✅ enables parallel scraping of articles

# ✅ Load a faster summarization model (DistilBART is roughly 3x faster than bart-large-cnn)
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")

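# A minimal sketch of calling the pipeline directly, useful as a smoke test.
# The sample text below is a made-up assumption, not part of the original module;
# uncomment to try it:
# sample = "The quick brown fox jumped over the lazy dog. " * 20
# print(summarizer(sample, max_length=60, min_length=25, do_sample=False)[0]["summary_text"])
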
# ✅ GNews API configuration (despite the NEWS_API_* names, the endpoint is GNews, not NewsAPI)
NEWS_API_KEY = "c272443116c025bc14b1e6bb62d7a1d8"
NEWS_API_URL = "https://gnews.io/api/v4/search"

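# A safer pattern than a hardcoded key would be reading it from the environment,
# assuming a GNEWS_API_KEY variable is exported (the variable name here is a
# hypothetical choice, not from the original code):
# import os
# NEWS_API_KEY = os.getenv("GNEWS_API_KEY", NEWS_API_KEY)
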
# ✅ Scrape the full article (limits text for speed)
def scrape_full_article(url):
    """Scrapes only the first 3 paragraphs of an article to reduce processing time."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            # ✅ Fetch only the first 3 paragraphs (reduces processing time)
            paragraphs = soup.find_all("p")[:3]
            full_text = " ".join(para.get_text() for para in paragraphs)
            return full_text if len(full_text) > 50 else "Full article unavailable."
        return "Could not retrieve article."
    except Exception as e:
        return f"Error fetching article: {e}"
# ✅ Summarize text (dynamically adjusts length)
def summarize_text(text):
    """Summarizes text efficiently with adaptive length settings."""
    if not text.strip():  # ✅ nothing to summarize
        return "No content available to summarize."

    max_input_length = 400  # ✅ cap input size for speed
    words = text.split()
    # ✅ Truncate to the first 400 words for faster summarization
    if len(words) > max_input_length:
        text = " ".join(words[:max_input_length])

    # ✅ Dynamically scale the summary length to the input length.
    # Note: for very short inputs, max_length can exceed the input size; the
    # transformers pipeline will emit a warning but still return a summary.
    input_length = len(text.split())
    max_summary_length = max(50, int(input_length * 0.5))  # ~50% of input
    min_summary_length = max(25, int(input_length * 0.2))  # ~20% of input

    return summarizer(
        text,
        max_length=max_summary_length,
        min_length=min_summary_length,
        do_sample=False,
    )[0]["summary_text"]
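# Hedged usage sketch; the repeated sample string is an assumption for
# illustration only:
# print(summarize_text("Some long article text about quarterly earnings. " * 50))
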
# ✅ Fetch news (parallel scraping for speed)
def get_news(company, query=None):
    """Fetches news articles from the GNews API and scrapes the full content in parallel."""
    params = {
        "q": f"{company} {query}" if query else company,
        "token": NEWS_API_KEY,
        "lang": "en",
        "sortby": "publishedAt",
        "max": 5,  # ✅ limit to 5 results for faster responses
    }
    response = requests.get(NEWS_API_URL, params=params)
    if response.status_code == 200:
        articles = response.json().get("articles", [])
        results = []
        # ✅ Scrape articles in parallel with 5 threads. Note that
        # as_completed yields futures in completion order, so results may
        # not match the API's publishedAt ordering.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_article = {executor.submit(scrape_full_article, art["url"]): art for art in articles}
            for future in concurrent.futures.as_completed(future_to_article):
                art = future_to_article[future]
                try:
                    full_text = future.result()  # ✅ scraped text, or an error sentinel
                    # ✅ Only summarize successfully scraped content; skip all
                    # of scrape_full_article's failure returns, not just one
                    summarized_text = (
                        summarize_text(full_text)
                        if full_text
                        and full_text not in ("Full article unavailable.", "Could not retrieve article.")
                        and not full_text.startswith("Error fetching article")
                        else "Summary unavailable."
                    )
                    results.append({
                        "title": art.get("title", ""),
                        "content": summarized_text,
                        "url": art.get("url", ""),
                        "image": art.get("image", ""),
                        "publishedAt": art.get("publishedAt", ""),
                        "source": art.get("source", {}).get("name", ""),
                    })
                except Exception as e:
                    print(f"Error processing article: {e}")
        return results
    return []
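
# A minimal demo, assuming a valid GNews key is configured above. "Tesla" and
# "earnings" are arbitrary example inputs, not part of the original module.
if __name__ == "__main__":
    import json
    for item in get_news("Tesla", query="earnings"):
        print(json.dumps(item, indent=2))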