|
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError |
|
from typing import Dict |
|
from fastapi import FastAPI, HTTPException |
|
from pydantic import BaseModel |
|
|
|
|
|
async def scraper(link: str) -> Dict:
    """Load *link* in headless Chromium and collect its text and asset URLs.

    Args:
        link: URL passed to ``page.goto``.

    Returns:
        On success, a dict with three keys:

        * ``page_text`` -- inner text of the ``body`` element.
        * ``script_sources`` -- ``src`` of every ``script[src]`` element.
        * ``link_sources`` -- ``href`` of every ``link[href]`` element.

        If navigation raises a Playwright timeout (``timeout=15000``), the
        dict ``{"error": "Timeout while loading the page."}`` is returned
        instead of raising.

    Raises:
        Any other exception raised by Playwright while launching the browser,
        navigating, or querying the page propagates to the caller.
    """
    # NOTE(review): ``link`` is navigated to as-is. If it originates from an
    # untrusted client, consider validating scheme/host to avoid
    # server-side request forgery -- confirm against deployment requirements.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            try:
                await page.goto(link, timeout=15000)
            except PlaywrightTimeoutError:
                # The ``finally`` below closes the browser on this path too.
                return {"error": "Timeout while loading the page."}

            page_text = await page.locator("body").inner_text()

            script_sources = await page.eval_on_selector_all(
                "script[src]", "elements => elements.map(e => e.src)"
            )

            link_sources = await page.eval_on_selector_all(
                "link[href]", "elements => elements.map(e => e.href)"
            )

            return {
                "page_text": page_text,
                "script_sources": script_sources,
                "link_sources": link_sources,
            }
        finally:
            # Close the browser on every exit path (success, timeout, or an
            # unexpected error) so a failed scrape does not leave it open.
            await browser.close()
|
|
|
|
|
# ASGI application object; route handlers below register themselves on it.
app = FastAPI()
|
|
|
class ScrapeRequest(BaseModel):
    """Request body for the ``/scrape`` endpoint."""

    # Address of the page to scrape; forwarded unchanged to ``scraper``.
    url: str
|
|
|
@app.post("/scrape")
async def scrape_endpoint(request: ScrapeRequest):
    """Scrape ``request.url`` and return whatever ``scraper`` produces.

    Args:
        request: Parsed request body carrying the target ``url``.

    Returns:
        The dict returned by ``scraper``. Note that a navigation timeout is
        reported by ``scraper`` as an ``{"error": ...}`` dict rather than an
        exception, so it is returned to the client as a normal response.

    Raises:
        HTTPException: With status 500 and the original error message as
            ``detail`` when ``scraper`` raises.
    """
    try:
        return await scraper(request.url)
    except Exception as exc:
        # Top-level boundary: convert any scraping failure into an HTTP 500
        # while keeping the original exception chained for server-side logs.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
|
|