import logging
import re
from typing import Optional

import requests
import trafilatura
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException

# Configure logging at the beginning of your script or module
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

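# A mainstream browser User-Agent: many sites serve stripped pages or 403s
# to default library agents such as python-requests.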
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}

def clean_text(text: str) -> str:
    """
    Clean extracted text: strip any residual HTML tags, preserve paragraph
    breaks, and normalize whitespace.
    """
    if not text:
        return ""

    soup = BeautifulSoup(text, "html.parser")

    # Append '\n\n' after each paragraph so its boundary survives get_text().
    # Note that get_text(strip=True) would discard these strings, and
    # collapsing all whitespace afterwards would erase them as well (which
    # would also defeat the per-line heuristic in is_low_quality()).
    for p in soup.find_all('p'):
        p.append('\n\n')

    cleaned = soup.get_text(separator=" ")

    # Collapse runs of spaces/tabs, trim spaces around newlines, then squeeze
    # blank-line runs down to a single paragraph break.
    cleaned = re.sub(r'[ \t]+', ' ', cleaned)
    cleaned = re.sub(r' ?\n ?', '\n', cleaned)
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)

    return cleaned.strip()
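
# A quick sanity check of clean_text() (illustrative only, not executed by
# the pipeline): paragraph boundaries survive while inner whitespace collapses.
#
#   clean_text("<p>First   paragraph.</p><p>Second paragraph.</p>")
#   -> "First paragraph.\n\nSecond paragraph."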

def is_low_quality(text: str) -> bool:
    """
    Detect navigation garbage, footers, or low-word-count dumps.
    Uses an expanded list of junk markers and word count checks.
    """
    if not text:
        logging.debug("Text is empty, considered low quality.")
        return True

    words = text.split()
    if len(words) < 150:  # genuine articles are rarely this short
        logging.debug(f"Text has only {len(words)} words, considered low quality (min 150).")
        return True

    # Expanded list of common junk phrases/markers
    junk_markers = [
        "subscribe to our newsletter", "cookie policy", "terms and conditions",
        "privacy statement", "all rights reserved", "contact us", "about us",
        "careers", "sitemap", "advertisement", "sponsored content",
        "read more", "view all", "back to top", "connect with us",
        "follow us on", "email us", "download our app", "footer",
        "comments policy", "disclaimer", "affiliate links", "related posts",
        "latest updates", "breaking news", "trending topics", "more news",
        "featured stories", "sign up", "login", "register", "join us",
        "newsletter signup", "skip to content", "navigation", "main menu",
        "sidebar", "archive", "categories", "tags", "go to top", "licence",
        "unlimited access", "support us", "exclusive content", "follow @",
        "copyright", "imprint", "impressum", "legal notice"
    ]

    low_quality_score = 0
    lower_text = text.lower()

    for marker in junk_markers:
        if marker in lower_text:
            low_quality_score += 1

    if low_quality_score >= 4: 
        logging.debug(f"Detected {low_quality_score} junk markers, considered low quality.")
        return True
    
    # A high share of very short lines usually means menus or link lists
    # rather than prose (this relies on clean_text() preserving newlines).
    lines = text.split('\n')
    if len(lines) > 15:
        short_lines_count = sum(1 for line in lines if 0 < len(line.split()) < 7)
        if short_lines_count / len(lines) > 0.4:
            logging.debug(f"Detected {short_lines_count}/{len(lines)} ({short_lines_count/len(lines):.1%}) short lines, considered low quality.")
            return True

    return False
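
# Illustrative thresholds for is_low_quality(): a bare navigation dump fails
# the 150-word floor outright, while a long page whose footer carries, say,
# "cookie policy", "sign up", "login", and "follow us on" already reaches the
# four-marker limit and is rejected on the junk score.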

def scrape_url(url: str, timeout: int = 15) -> Optional[str]:
    """
    Fetch a URL and extract its main article text, trying Trafilatura first
    and falling back to newspaper3k. Returns None when neither method yields
    quality content.
    """
    logging.info(f"Attempting to scrape: {url}")

    # Try Trafilatura first
    try:
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        response.raise_for_status()

        try:
            # apparent_encoding may be None or an unrecognized codec name,
            # so fall back to a lenient UTF-8 decode in those cases too.
            html = response.content.decode(response.apparent_encoding)
        except (UnicodeDecodeError, LookupError, TypeError):
            html = response.content.decode('utf-8', errors='ignore')

        extracted = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
        
        if extracted:
            text = clean_text(extracted)
            if not is_low_quality(text):
                logging.info(f"Successfully extracted content using Trafilatura for: {url}")
                return text
            else:
                logging.warning(f"LOW_QUALITY_CONTENT (Trafilatura): {url} - trying fallback.")
        else:
            logging.info(f"Trafilatura returned no main content for: {url}. Trying fallback.")

    except requests.exceptions.RequestException as req_err:
        logging.error(f"Trafilatura (Requests) failed for {url}: {req_err}")
    except Exception as e:
        logging.error(f"Trafilatura (Extraction/Processing) failed for {url}: {e}", exc_info=False)

    # Fall back to newspaper3k, reusing the same headers and a matching timeout
    try:
        article = Article(url, headers=HEADERS, keep_article_html=False,
                          request_timeout=timeout)
        article.download()
        article.parse()
        if article.text:
            text = clean_text(article.text)
            if not is_low_quality(text):
                logging.info(f"Successfully extracted content using Newspaper3k for: {url}")
                return text
            else:
                logging.warning(f"LOW_QUALITY_CONTENT (Newspaper3k): {url}")
        else:
            logging.info(f"Newspaper3k returned no main content for: {url}.")
    except ArticleException as art_err:
        # newspaper3k wraps download failures in ArticleException rather than
        # surfacing requests exceptions directly.
        logging.error(f"Newspaper3k (Download) failed for {url}: {art_err}")
    except Exception as e:
        logging.error(f"Newspaper3k (Parsing/Processing) failed for {url}: {e}", exc_info=False)

    logging.error(f"Failed to extract quality content from: {url} using both methods.")
    return None
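
if __name__ == "__main__":
    # Minimal smoke test; the URL is a placeholder, substitute any publicly
    # reachable article page.
    demo_url = "https://example.com/some-article"
    content = scrape_url(demo_url)
    if content:
        print(f"Extracted {len(content.split())} words:\n")
        print(content[:500])
    else:
        print(f"No quality content extracted from {demo_url}")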