import concurrent.futures

import nltk
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Ensure the NLTK "punkt" tokenizer data is available.
nltk.download("punkt")

# DistilBART is a lightweight summarization model, so inference stays reasonably fast on CPU.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")

# GNews API key and search endpoint.
NEWS_API_KEY = "c272443116c025bc14b1e6bb62d7a1d8"
NEWS_API_URL = "https://gnews.io/api/v4/search"


def scrape_full_article(url):
    """Scrapes only the first 3 paragraphs of an article to reduce processing time."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=5)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            # Only the first three <p> tags are used, which keeps summarization fast.
            paragraphs = soup.find_all("p")[:3]
            full_text = " ".join(para.get_text() for para in paragraphs)

            # Very short extracts are usually boilerplate rather than article text.
            return full_text if len(full_text) > 50 else "Full article unavailable."
        return "Could not retrieve article."
    except Exception as e:
        return f"Error fetching article: {str(e)}"


def summarize_text(text):
    """Summarizes text efficiently with adaptive length settings."""
    if not text.strip():
        return "No content available to summarize."

    # Truncate long inputs so the text stays well within the model's context window.
    max_input_length = 400
    words = text.split()
    if len(words) > max_input_length:
        text = " ".join(words[:max_input_length])

    # Scale the summary to roughly 20-50% of the input length.
    input_length = len(text.split())
    max_summary_length = max(50, int(input_length * 0.5))
    min_summary_length = max(25, int(input_length * 0.2))

    return summarizer(
        text,
        max_length=max_summary_length,
        min_length=min_summary_length,
        do_sample=False,
    )[0]["summary_text"]


def get_news(company, query=None):
    """Fetches news articles from GNews API and scrapes full content in parallel."""
    params = {
        "q": f"{company} {query}" if query else company,
        "token": NEWS_API_KEY,
        "lang": "en",
        "sortby": "publishedAt",
        "max": 5,
    }

    response = requests.get(NEWS_API_URL, params=params, timeout=10)

    if response.status_code == 200:
        articles = response.json().get("articles", [])
        results = []

        # Scraping is I/O-bound, so a thread pool fetches all article pages concurrently.
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            future_to_article = {
                executor.submit(scrape_full_article, art["url"]): art for art in articles
            }
            for future in concurrent.futures.as_completed(future_to_article):
                art = future_to_article[future]
                try:
                    full_text = future.result()

                    # Only summarize text that was actually scraped, not error placeholders.
                    scrape_failed = (
                        not full_text
                        or full_text in ("Full article unavailable.", "Could not retrieve article.")
                        or full_text.startswith("Error fetching article")
                    )
                    summarized_text = "Summary unavailable." if scrape_failed else summarize_text(full_text)

                    results.append({
                        "title": art["title"],
                        "content": summarized_text,
                        "url": art["url"],
                        "image": art.get("image", ""),
                        "publishedAt": art["publishedAt"],
                        "source": art["source"]["name"],
                    })
                except Exception as e:
                    print(f"Error processing article: {str(e)}")

        return results

    return []
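

# Example usage: a minimal sketch for running this module as a script. The company
# name and query below are illustrative assumptions, not values from the original code.
if __name__ == "__main__":
    articles = get_news("Tesla", query="earnings")
    for article in articles:
        print(article["title"])
        print(article["content"])
        print(article["url"])
        print("-" * 60)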