import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import nltk
import concurrent.futures  # ✅ Enables parallel processing

nltk.download("punkt")

# ✅ Load a Faster Summarization Model (DistilBART for Speed)
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6")  # ✅ 3X Faster Model

# ✅ GNews API Configuration
NEWS_API_KEY = "c272443116c025bc14b1e6bb62d7a1d8"
NEWS_API_URL = "https://gnews.io/api/v4/search"

# ✅ Function to Scrape Full Article (Limits Text for Speed)
def scrape_full_article(url):
    """Scrapes only the first 3 paragraphs of an article to reduce processing time."""
    try:
        headers = {"User-Agent": "Mozilla/5.0"}
        response = requests.get(url, headers=headers, timeout=5)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            # ✅ Fetch only first 3 paragraphs (Reduces processing time)
            paragraphs = soup.find_all("p")[:3]
            full_text = " ".join([para.get_text() for para in paragraphs])

            return full_text if len(full_text) > 50 else "Full article unavailable."
        else:
            return "Could not retrieve article."
    except Exception as e:
        return f"Error fetching article: {str(e)}"

# ✅ Function to Summarize Text (Dynamically Adjusts Length)
def summarize_text(text):
    """Summarizes text efficiently with adaptive length settings."""
    
    if not text.strip():  # ✅ Check if text is empty
        return "No content available to summarize."

    max_input_length = 400  # ✅ Reduce input size for speed
    words = text.split()

    # ✅ Truncate text to 400 words for faster summarization
    if len(words) > max_input_length:
        text = " ".join(words[:max_input_length])  

    # ✅ Dynamically adjust summarization length
    input_length = len(text.split())
    max_summary_length = max(50, int(input_length * 0.5))  # 50% of input
    min_summary_length = max(25, int(input_length * 0.2))  # 20% of input

    return summarizer(text, max_length=max_summary_length, min_length=min_summary_length, do_sample=False)[0]["summary_text"]

# ✅ Function to Fetch News (Parallel Processing for Speed)
def get_news(company, query=None):
    """Fetches news articles from GNews API and scrapes full content in parallel."""
    params = {
        "q": f"{company} {query}" if query else company,
        "token": NEWS_API_KEY,  
        "lang": "en",
        "sortby": "publishedAt",
        "max": 5  # βœ… Limit to 5 results for faster performance
    }
    
    response = requests.get(NEWS_API_URL, params=params)

    if response.status_code == 200:
        articles = response.json().get("articles", [])

        results = []

        # ✅ Process news articles in parallel (Boosts Speed)
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:  # ✅ Use 5 threads
            future_to_article = {executor.submit(scrape_full_article, art["url"]): art for art in articles}
            for future in concurrent.futures.as_completed(future_to_article):
                art = future_to_article[future]
                try:
                    full_text = future.result()  # ✅ Get scraped text
                    
                    # ✅ Ensure non-empty content before summarization
                    summarized_text = summarize_text(full_text) if full_text and full_text != "Full article unavailable." else "Summary unavailable."

                    results.append({
                        "title": art["title"],
                        "content": summarized_text,
                        "url": art["url"],
                        "image": art.get("image", ""),
                        "publishedAt": art["publishedAt"],
                        "source": art["source"]["name"]
                    })
                except Exception as e:
                    print(f"Error processing article: {str(e)}")

        return results
    
    return []
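

# Minimal usage sketch for running this module directly. The company name
# "Tesla" and the query "earnings" are illustrative placeholders only, not
# values assumed by the module itself.
if __name__ == "__main__":
    articles = get_news("Tesla", query="earnings")
    for article in articles:
        print(f"{article['title']} ({article['source']}, {article['publishedAt']})")
        print(article["content"])
        print(article["url"])
        print("-" * 80)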