# Spaces: Sleeping / Sleeping  (extraction artifact from the hosting page,
# kept as a comment so the module parses)
import pandas as pd
import requests
from bs4 import BeautifulSoup
def fetch_wikipedia_tables(
    url: str,
    handle_special_chars: bool = True,
) -> list[pd.DataFrame]:
    """
    Fetch the tables found on a Wikipedia page, with best-effort error handling.

    Parameters:
    -----------
    url : str
        The Wikipedia URL to fetch tables from.
    handle_special_chars : bool, default True
        When True, pass every parsed table through ``_clean_table`` to
        normalize special characters and remove citation markers.

    Returns:
    --------
    list of pd.DataFrame
        One DataFrame per table found on the page; empty list on failure.
    """
    try:
        tables = _fetch_tables_with_bs4(url)
        if handle_special_chars:
            # Normalize special characters / formatting in every table.
            tables = [_clean_table(tbl) for tbl in tables]
        if not tables:
            print(f"No tables found at {url}")
            return []
        return tables
    except Exception as e:
        # Best-effort scraper: report the problem and hand back an empty
        # result instead of raising to the caller.
        print(f"Error fetching tables: {e}")
        return []
def _fetch_tables_with_bs4(url: str) -> list[pd.DataFrame]:
    """Fetch all ``wikitable``-classed tables at *url* using BeautifulSoup.

    Returns a list of DataFrames (one per table), or an empty list when the
    request or parsing fails.
    """
    try:
        # Fix: bound the request with a timeout — requests.get() blocks
        # indefinitely without one, so a stalled connection would hang the
        # caller forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        tables = []
        for table in soup.find_all("table", {"class": "wikitable"}):
            # Headers come from any <th> cells in the table.
            headers = [th.text.strip() for th in table.find_all("th")]
            # Fallback: no <th> headers — use the cells of the first row.
            first_row = table.find("tr")
            if not headers and first_row:
                headers = [
                    cell.text.strip()
                    for cell in first_row.find_all(["th", "td"])
                ]
            # Data rows; skip the first row only when it served as the header.
            rows = table.find_all("tr")[1:] if headers else table.find_all("tr")
            data = []
            for row in rows:
                row_data = [cell.text.strip() for cell in row.find_all(["td", "th"])]
                if row_data:  # Skip empty rows
                    data.append(row_data)
            if data:
                # Attach headers only when they line up with the row width;
                # otherwise fall back to default integer column labels.
                if headers and len(headers) == len(data[0]):
                    df = pd.DataFrame(data, columns=headers)
                else:
                    df = pd.DataFrame(data)
                tables.append(df)
        return tables
    except Exception as e:
        # Best-effort: report and return an empty list on any failure.
        print(f"Error in BeautifulSoup fallback: {e}")
        return []
def _clean_table(df: pd.DataFrame) -> pd.DataFrame: | |
"""Clean a table by handling special characters and formatting issues.""" | |
# Make a copy to avoid modifying the original | |
df = df.copy() | |
# Handle all string columns | |
for col in df.columns: | |
if df[col].dtype == "object": | |
# Replace common problematic characters | |
df[col] = df[col].astype(str).str.replace(";", "", regex=False) | |
df[col] = df[col].str.replace("−", "-", regex=False) # Replace minus sign | |
df[col] = df[col].str.replace( | |
"\xa0", " ", regex=False | |
) # Replace non-breaking space | |
df[col] = df[col].str.replace("\n", " ", regex=False) # Replace newlines | |
df[col] = df[col].str.strip() # Strip whitespace | |
# Remove reference tags like [1], [2], etc. | |
df[col] = df[col].str.replace(r"\[\d+\]", "", regex=True) | |
return df | |