import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import nltk
import concurrent.futures  # Enables parallel processing

nltk.download("punkt")

# Load a faster summarization model (DistilBART for speed)
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")  # ~3x faster model

# GNews API configuration
NEWS_API_KEY = "c272443116c025bc14b1e6bb62d7a1d8"
NEWS_API_URL = "https://gnews.io/api/v4/search"


# Function to scrape the full article (limits text for speed)
def scrape_full_article(url):
    """Scrapes only the first 3 paragraphs of an article to reduce processing time."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            # Fetch only the first 3 paragraphs (reduces processing time)
            paragraphs = soup.find_all("p")[:3]
            full_text = " ".join([para.get_text() for para in paragraphs])
            return full_text if len(full_text) > 50 else "Full article unavailable."
        else:
            return "Could not retrieve article."
    except Exception as e:
        return f"Error fetching article: {str(e)}"

# Function to summarize text (dynamically adjusts length)
def summarize_text(text):
    """Summarizes text efficiently with adaptive length settings."""
    if not text.strip():  # Check if text is empty
        return "No content available to summarize."

    max_input_length = 400  # Reduce input size for speed
    words = text.split()

    # Truncate text to 400 words for faster summarization
    if len(words) > max_input_length:
        text = " ".join(words[:max_input_length])

    # Dynamically adjust summarization length
    input_length = len(text.split())
    max_summary_length = max(50, int(input_length * 0.5))  # 50% of input
    min_summary_length = max(25, int(input_length * 0.2))  # 20% of input

    return summarizer(text, max_length=max_summary_length, min_length=min_summary_length, do_sample=False)[0]["summary_text"]

# Function to fetch news (parallel processing for speed)
def get_news(company, query=None):
    """Fetches news articles from GNews API and scrapes full content in parallel."""
    params = {
        "q": f"{company} {query}" if query else company,
        "token": NEWS_API_KEY,
        "lang": "en",
        "sortby": "publishedAt",
        "max": 5  # Limit to 5 results for faster performance
    }
    response = requests.get(NEWS_API_URL, params=params)
    if response.status_code == 200:
        articles = response.json().get("articles", [])
        results = []

        # Process news articles in parallel (boosts speed)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:  # Use 5 threads
            future_to_article = {executor.submit(scrape_full_article, art["url"]): art for art in articles}
            for future in concurrent.futures.as_completed(future_to_article):
                art = future_to_article[future]
                try:
                    full_text = future.result()  # Get scraped text
                    # Ensure non-empty content before summarization
                    summarized_text = summarize_text(full_text) if full_text and full_text != "Full article unavailable." else "Summary unavailable."
                    results.append({
                        "title": art["title"],
                        "content": summarized_text,
                        "url": art["url"],
                        "image": art.get("image", ""),
                        "publishedAt": art["publishedAt"],
                        "source": art["source"]["name"]
                    })
                except Exception as e:
                    print(f"Error processing article: {str(e)}")
        return results
    return []
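

# Minimal usage sketch, not part of the original script: assumes the GNews token
# above is valid and the summarization model has been downloaded. "Tesla" and
# "earnings" are example inputs chosen here for illustration.
if __name__ == "__main__":
    stories = get_news("Tesla", query="earnings")
    for story in stories:
        print(story["title"])
        print(story["content"])
        print(story["url"], "\n")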