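"""Scrape pages with Firecrawl via LangChain, clean the resulting markdown,
and run the scrapes concurrently with asyncio."""
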
from langchain_community.document_loaders.firecrawl import FireCrawlLoader
import os
from dotenv import load_dotenv
import asyncio
from rich.pretty import pprint  # noqa
from typing import List
from langchain_core.documents import Document
import re
import aiohttp
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
)
import logging

load_dotenv()
FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")

# configure a handler so INFO-level logs are actually emitted
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def clean_markdown(document: Document) -> Document:
    """Strip markdown noise from a scraped page and keep only OpenGraph metadata."""
    raw_content = document.page_content
    metadata = {
        # Firecrawl has used both "og:*" and camel-cased "og*" metadata keys
        "url": document.metadata.get("og:url", document.metadata.get("ogUrl")),
        "title": document.metadata.get("og:title", document.metadata.get("ogTitle")),
        "description": document.metadata.get(
            "og:description", document.metadata.get("ogDescription")
        ),
    }
    try:
        # drop images: ![alt](src)
        cleaned_content = re.sub(r"!\[.*?\]\(.*?\)", "", raw_content)
        # pattern missing in the original; a no-op as written
        cleaned_content = re.sub(r"", "", cleaned_content)
        # drop links: [text](href)
        cleaned_content = re.sub(r"\[.*?\]\(.*?\)", "", cleaned_content)
        # rejoin words hyphenated across line breaks
        cleaned_content = re.sub(r"(\w)-\n(\w)", r"\1\2", cleaned_content)
        # collapse runs of three or more newlines into one blank line
        cleaned_content = re.sub(r"\n\n\n+", "\n\n", cleaned_content)
        # collapse runs of 4+ repeated punctuation (e.g. "-----") to two
        cleaned_content = re.sub(r"([^a-zA-Z0-9\s])\1{3,}", r"\1\1", cleaned_content)
        # drop emoji runs that sit directly before a blank line
        cleaned_content = re.sub(r"[\U0001F300-\U0001F9FF]+\n\n", "", cleaned_content)
        # drop stray "/" or "#" lines left over from navigation
        cleaned_content = re.sub(r"\n\n[/#]\n\n", "\n\n", cleaned_content)
        cleaned_content = cleaned_content.strip()
    except Exception as e:
        logger.error(f"Error cleaning markdown: {e}")
        raise
    document.page_content = cleaned_content
    document.metadata = metadata
    return document
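

# The unused tenacity/aiohttp imports above suggest a retry policy was
# intended; this decorator is one plausible wiring (an assumption, not from
# the original code): retry transient network failures with exponential
# backoff before giving up.
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type(aiohttp.ClientError),
)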
async def scrape_website(url: str) -> Document:
    logger.info(f"Scraping url: {url}")
    try:
        lc_loader = FireCrawlLoader(
            url=url,
            api_key=FIRECRAWL_API_KEY,
            mode="scrape",
            params={
                "formats": ["markdown"],
                "onlyMainContent": True,
                "removeBase64Images": True,
                "skipTlsVerification": True,
            },
        )
        # aload() returns a list of Documents; scrape mode yields one per URL
        lc_docs = await lc_loader.aload()
        return clean_markdown(lc_docs[0])
    except Exception as e:
        logger.error(f"Error scraping {url}: {e}")
        raise


async def scrape_main(urls: List[str]) -> List[Document]:
    tasks = [scrape_website(url) for url in urls]
    responses = await asyncio.gather(*tasks, return_exceptions=True)
    # gather(return_exceptions=True) returns exceptions in-line; keep only
    # the successfully scraped documents
    return [
        response for response in responses if not isinstance(response, Exception)
    ]


if __name__ == "__main__":
    urls = ["https://www.artisan.co", "https://www.artisan.co/about"]
    responses = asyncio.run(scrape_main(urls))
    pprint(responses)