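"""Scrape web pages with Firecrawl via LangChain and clean the resulting
markdown for downstream use.

Requires FIRECRAWL_API_KEY in the environment (or a .env file).
"""
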
from langchain_community.document_loaders.firecrawl import FireCrawlLoader
import os
from dotenv import load_dotenv
import asyncio
from rich.pretty import pprint # noqa
from typing import List
from langchain_core.documents import Document
import re
import aiohttp
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
import logging

load_dotenv()
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def clean_markdown(document: Document) -> Document:
    """Strip images, links, and layout noise from a scraped document."""
    raw_content = document.page_content
    # Firecrawl may expose Open Graph metadata under "og:*" or camelCase
    # keys depending on the page, so check both spellings.
    metadata = {
        "url": document.metadata.get("og:url", document.metadata.get("ogUrl", None)),
        "title": document.metadata.get(
            "og:title", document.metadata.get("ogTitle", None)
        ),
        "description": document.metadata.get(
            "og:description", document.metadata.get("ogDescription", None)
        ),
    }
    try:
        # Remove markdown images.
        cleaned_content = re.sub(r"!\[.*?\]\(.*?\)", "", raw_content)
cleaned_content = re.sub(r"", "", cleaned_content)
cleaned_content = re.sub(r"\[.*?\]\(.*?\)", "", cleaned_content)
cleaned_content = re.sub(r"(\w)-\n(\w)", r"\1\2", cleaned_content)
cleaned_content = re.sub(r"\n\n\n+", "\n\n", cleaned_content)
cleaned_content = re.sub(r"([^a-zA-Z0-9\s])\1{3,}", r"\1\1", cleaned_content)
cleaned_content = re.sub(r"[\U0001F300-\U0001F9FF]+\n\n", "", cleaned_content)
cleaned_content = re.sub(r"\n\n[/#]\n\n", "\n\n", cleaned_content)
cleaned_content = cleaned_content.strip()
except Exception as e:
logger.error(f"Error cleaning markdown: {e}")
raise e
document.page_content = cleaned_content
document.metadata = metadata
return document


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=15),
    retry=retry_if_exception_type(
        (Exception, asyncio.TimeoutError, aiohttp.ClientError)
    ),
    # When all attempts fail, return None instead of raising.
    retry_error_callback=lambda retry_state: None,
)
async def scrape_website(url: str):
    """Scrape a single URL with Firecrawl and return a cleaned Document."""
    logger.info(f"Scraping url: {url}")
    try:
        lc_loader = FireCrawlLoader(
            url=url,
            api_key=FIRECRAWL_API_KEY,
            mode="scrape",
            params={
                "formats": ["markdown"],
                "onlyMainContent": True,
                "removeBase64Images": True,
                "skipTlsVerification": True,
            },
        )
        lc_doc = await lc_loader.aload()
        cleaned_lc_doc = clean_markdown(lc_doc[0])
        return cleaned_lc_doc
    except Exception as e:
        logger.error(f"Error scraping {url}: {e}")
        raise e


async def scrape_main(urls: List[str]):
    """Scrape all URLs concurrently, dropping failed scrapes."""
    tasks = [scrape_website(url) for url in urls]
    responses = await asyncio.gather(*tasks, return_exceptions=True)
    # Failures surface either as None (from retry_error_callback) or as
    # exceptions (from return_exceptions=True); keep only real documents.
    return [
        response
        for response in responses
        if response is not None and not isinstance(response, Exception)
    ]


if __name__ == "__main__":
    urls = ["https://www.artisan.co", "https://www.artisan.co/about"]
    responses = asyncio.run(scrape_main(urls))
    pprint(responses)