""" | |
ISO20022 SWIFT MT564 Documentation Scraper | |
This script scrapes the SWIFT MT564 (Corporate Action Notification) documentation | |
from the ISO20022 website and converts it into structured JSON data for model training. | |
""" | |
import os
import json
import argparse
import logging
from typing import Dict, List, Any, Optional

import requests
from bs4 import BeautifulSoup
import trafilatura

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
BASE_URL = "https://www.iso20022.org/15022/uhb/finmt564.htm"
OUTPUT_DIR = "../data/raw"
def parse_args():
    parser = argparse.ArgumentParser(description="Scrape SWIFT MT564 documentation from ISO20022 website")
    parser.add_argument(
        "--url",
        type=str,
        default=BASE_URL,
        help="URL of the ISO20022 SWIFT MT564 documentation"
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default=OUTPUT_DIR,
        help="Directory to save scraped data"
    )
    parser.add_argument(
        "--format",
        type=str,
        choices=["json", "txt"],
        default="json",
        help="Output format (json or txt)"
    )
    return parser.parse_args()
def fetch_page(url: str) -> Optional[str]:
    """Fetch HTML content from URL"""
    try:
        logger.info(f"Fetching {url}")
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        logger.error(f"Error fetching {url}: {e}")
        return None
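# If the target site proves flaky, the plain requests.get call above could be
# swapped for a session with automatic retries. A minimal sketch, not wired in
# by default:
#
#   from requests.adapters import HTTPAdapter
#   from urllib3.util.retry import Retry
#
#   session = requests.Session()
#   retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503])
#   session.mount("https://", HTTPAdapter(max_retries=retries))
#   response = session.get(url, timeout=30)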
def extract_text_with_trafilatura(html: str) -> Optional[str]:
    """Extract main text content using trafilatura"""
    try:
        return trafilatura.extract(html)
    except Exception as e:
        logger.error(f"Error extracting text with trafilatura: {e}")
        return None
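# Note: trafilatura.extract() returns None (rather than raising) when it cannot
# extract anything useful, so callers must handle a None result either way.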
def extract_table_structure(soup: BeautifulSoup) -> List[Dict[str, Any]]:
    """Extract structured data from tables in the documentation"""
    tables = soup.find_all("table")
    results = []
    for table_idx, table in enumerate(tables):
        logger.info(f"Processing table {table_idx + 1} of {len(tables)}")

        # Extract table headers from the first row
        headers = []
        header_row = table.find("tr")
        if header_row:
            for th in header_row.find_all(["th", "td"]):
                headers.append(th.text.strip())

        # Process the remaining rows
        rows = []
        for row in table.find_all("tr")[1:]:  # Skip header row
            cell_data = {}
            cells = row.find_all(["td", "th"])
            for i, cell in enumerate(cells):
                if i < len(headers):
                    cell_data[headers[i]] = cell.text.strip()
                else:
                    # Cell without a matching header
                    cell_data[f"column_{i}"] = cell.text.strip()
            if cell_data:
                rows.append(cell_data)

        results.append({
            "table_id": table_idx + 1,
            "headers": headers,
            "rows": rows
        })
    return results
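# Each entry in the returned list has this shape (values are illustrative only;
# the actual header names depend on the page markup):
#   {"table_id": 1,
#    "headers": ["Tag", "Field Name", "Status"],
#    "rows": [{"Tag": "16R", "Field Name": "GENL", "Status": "M"}, ...]}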
def extract_sequence_information(soup: BeautifulSoup) -> Dict[str, Any]:
    """Extract information about the MT564 sequence structure"""
    sequences = []

    # Look for sequence headers (h2, h3, or h4 elements)
    sequence_headers = soup.find_all(["h2", "h3", "h4"])
    for header in sequence_headers:
        text = header.text.strip()
        # Check if it's a sequence header, e.g. "Sequence A" through "Sequence F"
        if "sequence" in text.lower() and any(f"sequence {letter}" in text.lower() for letter in "abcdef"):
            sequence_name = text

            # Collect descriptive paragraphs up to the next heading
            details = []
            next_element = header.find_next()
            while next_element and next_element.name not in ["h2", "h3", "h4"]:
                if next_element.name == "p":
                    details.append(next_element.text.strip())
                next_element = next_element.find_next()

            # Find the table immediately following this header
            table = header.find_next("table")
            fields = []
            if table:
                rows = table.find_all("tr")[1:]  # Skip header row
                for row in rows:
                    cells = row.find_all(["td", "th"])
                    if len(cells) >= 3:
                        field = {
                            "tag": cells[0].text.strip(),
                            "name": cells[1].text.strip(),
                            "status": cells[2].text.strip()
                        }
                        if len(cells) > 3:
                            field["format"] = cells[3].text.strip()
                        fields.append(field)

            sequences.append({
                "name": sequence_name,
                "details": details,
                "fields": fields
            })
    return {"sequences": sequences}
def parse_documentation(html: str) -> Dict[str, Any]:
    """Parse the MT564 documentation and extract structured information"""
    soup = BeautifulSoup(html, "html.parser")

    # Extract general information
    title = soup.title.text.strip() if soup.title else "MT564 Documentation"

    # Get all non-empty paragraphs
    paragraphs = [p.text.strip() for p in soup.find_all("p") if p.text.strip()]

    # Extract tables and sequence information
    tables = extract_table_structure(soup)
    sequence_info = extract_sequence_information(soup)

    # Combine all information; slicing handles lists shorter than three paragraphs
    return {
        "title": title,
        "general_description": paragraphs[:3],
        "tables": tables,
        "sequences": sequence_info["sequences"]
    }
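# Top-level structure of the returned dict (keys are fixed, values page-dependent):
#   {"title": str, "general_description": [str, ...],
#    "tables": [...], "sequences": [...]}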
def save_output(data: Dict[str, Any], output_dir: str, output_format: str = "json"):
    """Save extracted data to the output directory"""
    os.makedirs(output_dir, exist_ok=True)

    if output_format == "json":
        output_file = os.path.join(output_dir, "mt564_documentation.json")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved JSON data to {output_file}")
    else:  # txt format
        output_file = os.path.join(output_dir, "mt564_documentation.txt")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(f"# {data['title']}\n\n")
            f.write("## General Description\n\n")
            for para in data["general_description"]:
                f.write(f"{para}\n\n")
            f.write("## Sequences\n\n")
            for seq in data["sequences"]:
                f.write(f"### {seq['name']}\n\n")
                for detail in seq["details"]:
                    f.write(f"{detail}\n\n")
                f.write("#### Fields\n\n")
                for field in seq["fields"]:
                    f.write(f"- {field['tag']}: {field['name']} ({field['status']})\n")
                f.write("\n")
        logger.info(f"Saved text data to {output_file}")
def main():
    args = parse_args()

    # Fetch the HTML content
    html_content = fetch_page(args.url)
    if not html_content:
        logger.error("Failed to fetch the documentation. Exiting.")
        return

    # Extract and save raw text content
    text_content = extract_text_with_trafilatura(html_content)
    if text_content:
        os.makedirs(args.output_dir, exist_ok=True)
        raw_text_file = os.path.join(args.output_dir, "mt564_raw_text.txt")
        with open(raw_text_file, "w", encoding="utf-8") as f:
            f.write(text_content)
        logger.info(f"Saved raw text to {raw_text_file}")

    # Parse structured information and save it in the requested format
    structured_data = parse_documentation(html_content)
    save_output(structured_data, args.output_dir, args.format)
    logger.info("Scraping complete!")


if __name__ == "__main__":
    main()
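# Example invocation, assuming this file is saved as scrape_mt564.py (the
# filename is an assumption; the flag values shown are the script defaults):
#   python scrape_mt564.py --url https://www.iso20022.org/15022/uhb/finmt564.htm \
#       --output_dir ../data/raw --format json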