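"""Scrape web pages with Firecrawl via LangChain's FireCrawlLoader and clean the
returned markdown: images, links, emoji runs, and excess whitespace are stripped,
and metadata is reduced to url/title/description. Expects FIRECRAWL_API_KEY in
the environment (loaded from a .env file)."""
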
from langchain_community.document_loaders.firecrawl import FireCrawlLoader
import os
from dotenv import load_dotenv
import asyncio
from rich.pretty import pprint  # noqa
from typing import List
from langchain_core.documents import Document
import re
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
import logging


load_dotenv()

FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")

logging.basicConfig(level=logging.INFO)  # attach a handler so INFO logs are emitted
logger = logging.getLogger(__name__)


def clean_markdown(document: Document) -> Document:
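    """Strip markdown artifacts from a scraped Document and reduce its
    metadata to url, title, and description."""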
    raw_content = document.page_content
    metadata = {
        # Prefer the "og:" metadata keys, falling back to camelCase variants
        "url": document.metadata.get("og:url", document.metadata.get("ogUrl")),
        "title": document.metadata.get(
            "og:title", document.metadata.get("ogTitle")
        ),
        "description": document.metadata.get(
            "og:description", document.metadata.get("ogDescription")
        ),
    }

    try:
        # Remove markdown images: ![alt](src)
        cleaned_content = re.sub(r"!\[.*?\]\(.*?\)", "", raw_content)
        # Remove markdown links, anchor text included: [text](href)
        cleaned_content = re.sub(r"\[.*?\]\(.*?\)", "", cleaned_content)
        # Re-join words hyphenated across a line break
        cleaned_content = re.sub(r"(\w)-\n(\w)", r"\1\2", cleaned_content)
        # Collapse runs of blank lines into a single paragraph break
        cleaned_content = re.sub(r"\n\n\n+", "\n\n", cleaned_content)
        # Shorten runs of four or more repeated symbols (e.g. "----") to two
        cleaned_content = re.sub(r"([^a-zA-Z0-9\s])\1{3,}", r"\1\1", cleaned_content)
        # Drop emoji runs sitting at the end of a paragraph
        cleaned_content = re.sub(r"[\U0001F300-\U0001F9FF]+\n\n", "", cleaned_content)
        # Drop paragraphs that are just a stray "/" or "#"
        cleaned_content = re.sub(r"\n\n[/#]\n\n", "\n\n", cleaned_content)
        cleaned_content = cleaned_content.strip()
    except Exception as e:
        logger.error(f"Error cleaning markdown: {e}")
        raise

    document.page_content = cleaned_content
    document.metadata = metadata

    return document

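# Illustrative example (hypothetical input, not part of the pipeline):
#   doc = Document(
#       page_content="![logo](img.png) Intro [Docs](https://x.y)\n\n\n\nBody!!!!",
#       metadata={"og:url": "https://example.com"},
#   )
#   cleaned = clean_markdown(doc)
#   cleaned.page_content  # "Intro \n\nBody!!"
#   cleaned.metadata      # {"url": "https://example.com", "title": None,
#                         #  "description": None}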

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=15),
    # Exception already subsumes asyncio.TimeoutError and aiohttp.ClientError,
    # so retry on any failure raised by the loader.
    retry=retry_if_exception_type(Exception),
    # Once retries are exhausted, return None instead of raising.
    retry_error_callback=lambda retry_state: None,
)
async def scrape_website(url: str) -> Document:
    """Scrape a single URL with Firecrawl and return the cleaned Document."""
    logger.info(f"Scraping url: {url}")
    try:
        lc_loader = FireCrawlLoader(
            url=url,
            api_key=FIRECRAWL_API_KEY,
            mode="scrape",
            params={
                "formats": ["markdown"],
                "onlyMainContent": True,
                "removeBase64Images": True,
                "skipTlsVerification": True,
            },
        )
        lc_doc = await lc_loader.aload()
        cleaned_lc_doc = clean_markdown(lc_doc[0])
        return cleaned_lc_doc
    except Exception as e:
        logger.error(f"Error scraping {url}: {e}")
        raise  # bare raise lets tenacity retry and preserves the traceback


async def scrape_main(urls: List[str]) -> List[Document]:
    """Scrape all URLs concurrently and keep only the successful results."""
    tasks = [scrape_website(url) for url in urls]
    responses = await asyncio.gather(*tasks, return_exceptions=True)
    # Failures surface as None (from retry_error_callback) or as exceptions
    # (from return_exceptions=True); filter both out.
    return [
        response
        for response in responses
        if response is not None and not isinstance(response, Exception)
    ]

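# Sketch (assumption, not wired in above): asyncio.gather launches every
# request at once, so for long URL lists a bounded variant can cap the number
# of concurrent Firecrawl calls:
#
#     semaphore = asyncio.Semaphore(5)
#
#     async def scrape_bounded(url: str):
#         async with semaphore:
#             return await scrape_website(url)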

if __name__ == "__main__":
    urls = ["https://www.artisan.co", "https://www.artisan.co/about"]
    responses = asyncio.run(scrape_main(urls))
    pprint(responses)