Spaces:

apexherbert200
/

clickloom-scraper-2

Running

clickloom-scraper-2 / clickloom.py

Test 1

267487c 2 months ago

1.52 kB

	from typing import Dict
	from urllib.parse import urlsplit

	from fastapi import FastAPI, HTTPException
	from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
	from pydantic import BaseModel


	async def scraper(link: str, *, timeout_ms: float = 15000) -> Dict:
	    """Load *link* in headless Chromium and collect its text and asset URLs.

	    Args:
	        link: URL handed to ``page.goto``.
	        timeout_ms: Navigation timeout passed to ``page.goto`` (Playwright
	            expresses it in milliseconds). Defaults to 15000, the value
	            that was previously hard-coded.

	    Returns:
	        On success, a dict with three keys:
	        ``page_text`` (``inner_text`` of the ``<body>`` element),
	        ``script_sources`` (the ``src`` property of every ``<script src>``)
	        and ``link_sources`` (the ``href`` property of every
	        ``<link href>``).
	        If navigation times out, ``{"error": "Timeout while loading the
	        page."}`` is returned instead of raising.

	    Raises:
	        Exception: any non-timeout error raised by Playwright propagates to
	            the caller; the browser is closed first.
	    """
	    async with async_playwright() as p:
	        browser = await p.chromium.launch(headless=True)
	        # try/finally guarantees the browser is closed on every exit path:
	        # success, timeout, and unexpected errors alike.
	        try:
	            context = await browser.new_context()
	            page = await context.new_page()

	            try:
	                await page.goto(link, timeout=timeout_ms)
	            except PlaywrightTimeoutError:
	                return {"error": "Timeout while loading the page."}

	            # Get body text
	            page_text = await page.locator("body").inner_text()

	            # Get all <script src=...>
	            script_sources = await page.eval_on_selector_all(
	                "script[src]", "elements => elements.map(e => e.src)"
	            )

	            # Get all <link href=...>
	            link_sources = await page.eval_on_selector_all(
	                "link[href]", "elements => elements.map(e => e.href)"
	            )

	            return {
	                "page_text": page_text,
	                "script_sources": script_sources,
	                "link_sources": link_sources,
	            }
	        finally:
	            await browser.close()


	# FastAPI application instance; the route decorators below register on it.
	app = FastAPI()

	class ScrapeRequest(BaseModel):
	    """Request body for ``POST /scrape``."""

	    # Address of the page to scrape; accepted as a plain string.
	    url: str

	@app.post("/scrape")
	async def scrape_endpoint(request: ScrapeRequest):
	    """Scrape the page named in the request body and return the result.

	    Args:
	        request: Parsed request body carrying the target ``url``.

	    Returns:
	        Whatever ``scraper`` returns for ``request.url``.

	    Raises:
	        HTTPException: 400 when ``url`` is not an absolute http(s) URL;
	            500 (with the error text as detail) when scraping fails.
	    """
	    # The URL comes from an untrusted client and is opened in a real browser,
	    # so refuse non-web schemes (e.g. file://) before navigating anywhere.
	    # NOTE(review): http(s) URLs that point at internal hosts are still
	    # accepted; restrict further if this service is publicly reachable.
	    try:
	        parts = urlsplit(request.url)
	    except ValueError:
	        parts = None
	    if parts is None or parts.scheme not in ("http", "https") or not parts.netloc:
	        # Raised outside the try below so it is not rewrapped as a 500.
	        raise HTTPException(
	            status_code=400, detail="url must be an absolute http(s) URL."
	        )

	    try:
	        return await scraper(request.url)
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e