# clickloom.py
from typing import Dict

from fastapi import FastAPI, HTTPException
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from pydantic import BaseModel

async def scraper(link: str) -> Dict:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()

        try:
            await page.goto(link, timeout=15000)
        except PlaywrightTimeoutError:
            await browser.close()
            return {"error": "Timeout while loading the page."}

        # Get body text
        page_text = await page.locator("body").inner_text()

        # Get all <script src=...>
        script_sources = await page.eval_on_selector_all(
            "script[src]", "elements => elements.map(e => e.src)"
        )

        # Get all <link href=...>
        link_sources = await page.eval_on_selector_all(
            "link[href]", "elements => elements.map(e => e.href)"
        )

        await browser.close()

        return {
            "page_text": page_text,
            "script_sources": script_sources,
            "link_sources": link_sources,
        }
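
# A quick way to exercise scraper() on its own, without the API (a sketch;
# the URL below is only a placeholder, any reachable page works):
#
#   import asyncio
#   result = asyncio.run(scraper("https://example.com"))
#   print(result.get("error") or result["script_sources"])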

app = FastAPI()


class ScrapeRequest(BaseModel):
    url: str


@app.post("/scrape")
async def scrape_endpoint(request: ScrapeRequest):
    try:
        data = await scraper(request.url)
        return data
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
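

# Minimal sketch for serving the API directly with uvicorn. The module name
# ("clickloom") comes from this file's name; the host and port are assumptions
# and may need to match your deployment (Hugging Face Spaces typically
# expects 7860). Example request once running:
#
#   curl -X POST http://localhost:7860/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com"}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run("clickloom:app", host="0.0.0.0", port=7860)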