from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from typing import Dict
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel


async def scraper(link: str) -> Dict:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        page = await context.new_page()
        try:
            await page.goto(link, timeout=15000)
        except PlaywrightTimeoutError:
            await browser.close()
            return {"error": "Timeout while loading the page."}

        # Get body text
        page_text = await page.locator("body").inner_text()

        # Get all <script src=...>
        script_sources = await page.eval_on_selector_all(
            "script[src]", "elements => elements.map(e => e.src)"
        )

        # Get all <link href=...>
        link_sources = await page.eval_on_selector_all(
            "link[href]", "elements => elements.map(e => e.href)"
        )

        await browser.close()
        return {
            "page_text": page_text,
            "script_sources": script_sources,
            "link_sources": link_sources,
        }


app = FastAPI()


class ScrapeRequest(BaseModel):
    url: str


@app.post("/scrape")
async def scrape_endpoint(request: ScrapeRequest):
    # Run the scraper and surface any unexpected failure as an HTTP 500.
    try:
        data = await scraper(request.url)
        return data
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
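

# --- Usage sketch (assumptions, not part of the API itself) ---
# Assuming this file is saved as main.py and fastapi, uvicorn and playwright
# are installed (with browsers fetched via `playwright install chromium`),
# the service could be started and called roughly like this; module path,
# host, port and target URL are all illustrative:
#   uvicorn main:app --host 0.0.0.0 --port 8000
#   curl -X POST http://localhost:8000/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com"}'
#
# Running this file directly performs a one-off scrape as a smoke test.
if __name__ == "__main__":
    import asyncio

    result = asyncio.run(scraper("https://example.com"))
    if "error" in result:
        print(result["error"])
    else:
        # Print a short preview of the scraped body text and the asset lists.
        print(result["page_text"][:200])
        print(result["script_sources"])
        print(result["link_sources"])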