# shoukaku's picture
# add get input from url
# 8eb0c1a
from typing import Optional
import requests
from bs4 import BeautifulSoup, ResultSet
class GenericScraper:
    """Heuristic scraper that returns the text of a page's densest container.

    The page is fetched over HTTP, parsed with BeautifulSoup's built-in
    ``html.parser``, and the ``div``/``section``/``article`` element whose
    direct ``<p>`` children carry the most text is treated as the main
    content.
    """

    def __init__(self) -> None:
        pass

    def scrape(self, url: str, *, timeout: Optional[float] = 30.0) -> str:
        """Fetch *url* and return the text of its most paragraph-heavy container.

        Args:
            url: Address of the page to download.
            timeout: Value forwarded to ``requests.get`` as its ``timeout``
                argument. Pass ``None`` to wait without a limit (the
                behaviour before this parameter existed).

        Returns:
            The result of ``get_text()`` on the selected container, which
            includes the text of all of its descendants, not only the
            paragraphs used for scoring.

        Raises:
            Exception: If the response status code is not 200, or if no
                candidate container yields a positive paragraph-text length.
        """
        # Without a timeout an unresponsive server would block the caller
        # for as long as the connection stays open.
        response: requests.Response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise Exception(
                f'Failed to fetch url: {url} with status code {response.status_code}'
            )

        soup: BeautifulSoup = BeautifulSoup(response.content, 'html.parser')
        sections: ResultSet[BeautifulSoup] = soup.find_all(
            ['div', 'section', 'article']
        )

        max_p_len = 0
        best_section: Optional[BeautifulSoup] = None
        for section in sections:
            # Only direct <p> children are scored, so an outer wrapper does
            # not get credit for paragraphs that belong to a nested container.
            ps = section.find_all('p', recursive=False)
            p_len = len('\n'.join(p.get_text() for p in ps))
            # Strict comparison: the first container to reach a given length
            # wins ties, and containers scoring 0 are never selected.
            if p_len > max_p_len:
                max_p_len = p_len
                best_section = section

        if best_section is None:
            raise Exception('No sections found')
        return best_section.get_text()