import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import nltk
import concurrent.futures  # ✅ Enables parallel processing

nltk.download("punkt")

# ✅ Load a Faster Summarization Model (DistilBART for Speed)
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")  # ✅ ~3x faster model

# ✅ GNews API Configuration
NEWS_API_KEY = "c272443116c025bc14b1e6bb62d7a1d8"
NEWS_API_URL = "https://gnews.io/api/v4/search"


# ✅ Function to Scrape Full Article (Limits Text for Speed)
def scrape_full_article(url):
    """Scrapes only the first 3 paragraphs of an article to reduce processing time."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            # ✅ Fetch only the first 3 paragraphs (reduces processing time)
            paragraphs = soup.find_all("p")[:3]
            full_text = " ".join(para.get_text() for para in paragraphs)
            return full_text if len(full_text) > 50 else "Full article unavailable."
        else:
            return "Could not retrieve article."
    except Exception as e:
        return f"Error fetching article: {str(e)}"


# ✅ Function to Summarize Text (Dynamically Adjusts Length)
def summarize_text(text):
    """Summarizes text efficiently with adaptive length settings."""
    if not text.strip():  # ✅ Check if text is empty
        return "No content available to summarize."

    max_input_length = 400  # ✅ Reduce input size for speed
    words = text.split()

    # ✅ Truncate text to 400 words for faster summarization
    if len(words) > max_input_length:
        text = " ".join(words[:max_input_length])

    # ✅ Dynamically adjust summarization length based on input size
    input_length = len(text.split())
    max_summary_length = max(50, int(input_length * 0.5))  # 50% of input
    min_summary_length = max(25, int(input_length * 0.2))  # 20% of input

    return summarizer(
        text,
        max_length=max_summary_length,
        min_length=min_summary_length,
        do_sample=False,
    )[0]["summary_text"]


# ✅ Function to Fetch News (Parallel Processing for Speed)
def get_news(company, query=None):
    """Fetches news articles from the GNews API and scrapes full content in parallel."""
    params = {
        "q": f"{company} {query}" if query else company,
        "token": NEWS_API_KEY,
        "lang": "en",
        "sortby": "publishedAt",
        "max": 5,  # ✅ Limit to 5 results for faster performance
    }
    response = requests.get(NEWS_API_URL, params=params)

    if response.status_code == 200:
        articles = response.json().get("articles", [])
        results = []

        # ✅ Process news articles in parallel (boosts speed)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:  # ✅ Use 5 threads
            future_to_article = {
                executor.submit(scrape_full_article, art["url"]): art for art in articles
            }

            for future in concurrent.futures.as_completed(future_to_article):
                art = future_to_article[future]
                try:
                    full_text = future.result()  # ✅ Get scraped text

                    # ✅ Only summarize when scraping returned usable content
                    summarized_text = (
                        summarize_text(full_text)
                        if full_text and full_text != "Full article unavailable."
                        else "Summary unavailable."
                    )

                    results.append({
                        "title": art["title"],
                        "content": summarized_text,
                        "url": art["url"],
                        "image": art.get("image", ""),
                        "publishedAt": art["publishedAt"],
                        "source": art["source"]["name"],
                    })
                except Exception as e:
                    print(f"Error processing article: {str(e)}")

        return results
    return []
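

# ✅ Example usage: a minimal sketch showing how get_news() might be called.
# "Tesla" and "earnings" are placeholder inputs (not from the original script),
# and this assumes a valid GNews token is configured in NEWS_API_KEY above.
if __name__ == "__main__":
    articles = get_news("Tesla", "earnings")
    for article in articles:
        print(article["title"])
        print(article["content"])
        print(article["url"])
        print("-" * 40)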