File size: 1,070 Bytes
8eb0c1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from typing import Optional
import requests
from bs4 import BeautifulSoup, ResultSet


class GenericScraper:
    """Extract the main text of a web page with a simple heuristic.

    The page is fetched over HTTP, every ``div``, ``section`` and ``article``
    element is inspected, and the element whose direct-child ``<p>`` tags hold
    the most text is treated as the main content.
    """

    def __init__(self) -> None:
        pass

    def scrape(self, url: str, timeout: float = 10.0) -> str:
        """Return the text of the container with the most paragraph text.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Passed
                straight to ``requests.get``.

        Returns:
            The full text (``get_text()``) of the selected container, which
            includes text from all of its descendants, not only the
            paragraphs used for scoring.

        Raises:
            Exception: If the response status code is not 200, or if no
                ``div``/``section``/``article`` has any non-empty text in its
                direct-child ``<p>`` elements.

        Errors raised by ``requests.get`` itself (for example when the
        timeout expires) are not caught here and propagate to the caller.
        """
        # Without an explicit timeout the request can block indefinitely on
        # an unresponsive server.
        response: requests.Response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            raise Exception(
                f'Failed to fetch url: {url} with status code {response.status_code}'
            )

        soup: BeautifulSoup = BeautifulSoup(response.content, 'html.parser')

        sections: ResultSet[BeautifulSoup] = soup.find_all(
            ['div', 'section', 'article']
        )
        max_p_len = 0
        best_section: Optional[BeautifulSoup] = None

        for section in sections:
            # recursive=False: only paragraphs that are direct children count,
            # so an outer wrapper is not credited with its descendants' text.
            ps = section.find_all('p', recursive=False)
            p_len = len('\n'.join(p.get_text() for p in ps))
            # Strict '>' keeps the first candidate on ties and skips
            # containers whose paragraphs are all empty.
            if p_len > max_p_len:
                max_p_len = p_len
                best_section = section

        if best_section is None:
            raise Exception('No sections found')

        return best_section.get_text()