import concurrent.futures

import nltk
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Ensure the NLTK "punkt" tokenizer data is available.
nltk.download("punkt")

# DistilBART is a lightweight summarization model, so inference stays reasonably fast on CPU.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")

# GNews API key and search endpoint.
NEWS_API_KEY = "c272443116c025bc14b1e6bb62d7a1d8"
NEWS_API_URL = "https://gnews.io/api/v4/search"


def scrape_full_article(url):
    """Scrapes only the first 3 paragraphs of an article to reduce processing time."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=5)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            # Only the first three <p> tags are used, which keeps summarization fast.
            paragraphs = soup.find_all("p")[:3]
            full_text = " ".join(para.get_text() for para in paragraphs)

            # Very short extracts are usually boilerplate rather than article text.
            return full_text if len(full_text) > 50 else "Full article unavailable."
        return "Could not retrieve article."
    except Exception as e:
        return f"Error fetching article: {str(e)}"


def summarize_text(text):
    """Summarizes text efficiently with adaptive length settings."""
    if not text.strip():
        return "No content available to summarize."

    # Truncate long inputs so the text stays well within the model's context window.
    max_input_length = 400
    words = text.split()
    if len(words) > max_input_length:
        text = " ".join(words[:max_input_length])

    # Scale the summary to roughly 20-50% of the input length.
    input_length = len(text.split())
    max_summary_length = max(50, int(input_length * 0.5))
    min_summary_length = max(25, int(input_length * 0.2))

    return summarizer(
        text,
        max_length=max_summary_length,
        min_length=min_summary_length,
        do_sample=False,
    )[0]["summary_text"]


def get_news(company, query=None):
    """Fetches news articles from GNews API and scrapes full content in parallel."""
    params = {
        "q": f"{company} {query}" if query else company,
        "token": NEWS_API_KEY,
        "lang": "en",
        "sortby": "publishedAt",
        "max": 5,
    }

    response = requests.get(NEWS_API_URL, params=params, timeout=10)

    if response.status_code == 200:
        articles = response.json().get("articles", [])
        results = []

        # Scraping is I/O-bound, so a thread pool fetches all article pages concurrently.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_article = {
                executor.submit(scrape_full_article, art["url"]): art for art in articles
            }
            for future in concurrent.futures.as_completed(future_to_article):
                art = future_to_article[future]
                try:
                    full_text = future.result()

                    # Only summarize text that was actually scraped, not error placeholders.
                    scrape_failed = (
                        not full_text
                        or full_text in ("Full article unavailable.", "Could not retrieve article.")
                        or full_text.startswith("Error fetching article")
                    )
                    summarized_text = "Summary unavailable." if scrape_failed else summarize_text(full_text)

                    results.append({
                        "title": art["title"],
                        "content": summarized_text,
                        "url": art["url"],
                        "image": art.get("image", ""),
                        "publishedAt": art["publishedAt"],
                        "source": art["source"]["name"],
                    })
                except Exception as e:
                    print(f"Error processing article: {str(e)}")

        return results

    return []
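

# Example usage: a minimal sketch for running this module as a script. The company
# name and query below are illustrative assumptions, not values from the original code.
if __name__ == "__main__":
    articles = get_news("Tesla", query="earnings")
    for article in articles:
        print(article["title"])
        print(article["content"])
        print(article["url"])
        print("-" * 60)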