Spaces:
Sleeping
Sleeping
from typing import Optional | |
import requests | |
from bs4 import BeautifulSoup, ResultSet | |
class GenericScraper:
    """Extract the main text of a web page using a paragraph-length heuristic.

    The page is fetched over HTTP, every ``div``, ``section`` and ``article``
    element is treated as a candidate, and the candidate whose *direct child*
    ``<p>`` elements hold the most text is returned as plain text.
    """

    def __init__(self) -> None:
        """Create a scraper; no configuration or state is kept."""
        pass

    def scrape(self, url: str, *, timeout: float = 30.0) -> str:
        """Fetch ``url`` and return the text of its most paragraph-heavy section.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before ``requests`` gives
                up. Without a timeout a stalled server would block the caller
                forever.

        Returns:
            The result of ``get_text()`` on the selected section, i.e. all of
            its text including nested elements, not only the paragraphs used
            for scoring.

        Raises:
            Exception: If the response status code is not 200, or if no
                candidate section has any text in direct-child ``<p>`` tags.
            requests.exceptions.RequestException: Propagated from
                ``requests.get`` on connection failures or timeouts.
        """
        response: requests.Response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(
                f'Failed to fetch url: {url} with status code {response.status_code}'
            )

        soup: BeautifulSoup = BeautifulSoup(response.content, 'html.parser')
        sections: ResultSet[BeautifulSoup] = soup.find_all(
            ['div', 'section', 'article']
        )

        max_p_len = 0
        best_section: Optional[BeautifulSoup] = None
        for section in sections:
            # recursive=False: only paragraphs that are immediate children
            # count, so a wrapper element is not credited with the text of
            # the sections nested inside it.
            paragraphs = section.find_all('p', recursive=False)
            p_len = len('\n'.join(p.get_text() for p in paragraphs))
            # Strict '>' keeps the first section on ties and never selects a
            # section whose paragraph text is empty.
            if p_len > max_p_len:
                max_p_len = p_len
                best_section = section

        if best_section is None:
            raise Exception('No sections found')
        return best_section.get_text()