Spaces:
Sleeping
Sleeping
File size: 1,070 Bytes
8eb0c1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
from typing import Optional
import requests
from bs4 import BeautifulSoup, ResultSet
class GenericScraper:
    """Fetch a web page and extract the text of its most paragraph-rich section."""

    def __init__(self) -> None:
        pass

    def scrape(self, url: str, timeout: float = 10.0) -> str:
        """Return the text of the section with the most direct paragraph text.

        The page is downloaded and parsed, and every ``div``, ``section`` and
        ``article`` element is scored by the length of the text of its
        *direct-child* ``<p>`` elements (joined with newlines). The text of
        the highest-scoring element is returned; on a tie the first one wins.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. It is
                forwarded to ``requests.get``.

        Returns:
            The full text of the selected element, as produced by
            ``get_text()`` (this includes text of nested elements, not only
            the paragraphs used for scoring).

        Raises:
            Exception: If the response status code is not 200, or if no
                candidate element has any direct-child paragraph text.
            requests.exceptions.RequestException: Propagated unchanged from
                ``requests.get`` (connection failures, timeouts, ...).
        """
        # Without an explicit timeout requests waits indefinitely on a server
        # that accepts the connection but never answers.
        response: requests.Response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(
                f'Failed to fetch url: {url} with status code {response.status_code}'
            )

        soup: BeautifulSoup = BeautifulSoup(response.content, 'html.parser')
        sections: ResultSet[BeautifulSoup] = soup.find_all(
            ['div', 'section', 'article']
        )

        max_p_len = 0
        best_section: Optional[BeautifulSoup] = None
        for section in sections:
            # recursive=False: only paragraphs that are immediate children
            # count, so an outer wrapper does not get credit for paragraphs
            # nested deeper inside it.
            ps = section.find_all('p', recursive=False)
            p_len = len('\n'.join(p.get_text() for p in ps))
            # Strict comparison against an initial 0 means elements without
            # any direct paragraph text are never selected.
            if p_len > max_p_len:
                max_p_len = p_len
                best_section = section

        if best_section is None:
            raise Exception('No sections found')
        return best_section.get_text()
|