# File size: 1,516 Bytes
# 267487c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from typing import Dict
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel


async def scraper(link: str) -> Dict:
    """Load *link* in headless Chromium and collect its text and asset URLs.

    Args:
        link: URL handed to ``page.goto``.

    Returns:
        On success, a dict with three keys: ``page_text`` (inner text of the
        ``<body>`` element), ``script_sources`` (the ``src`` of every
        ``<script src=...>``) and ``link_sources`` (the ``href`` of every
        ``<link href=...>``). If navigation hits the ``timeout=15000`` limit
        passed to ``page.goto``, ``{"error": "Timeout while loading the page."}``
        is returned instead.

    Raises:
        Any exception other than the navigation timeout propagates to the
        caller after the browser has been closed.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # try/finally guarantees the browser is closed on every exit path,
        # not only on the timeout and success paths.
        try:
            context = await browser.new_context()
            page = await context.new_page()

            try:
                await page.goto(link, timeout=15000)
            except PlaywrightTimeoutError:
                return {"error": "Timeout while loading the page."}

            # Get body text
            page_text = await page.locator("body").inner_text()

            # Get all <script src=...>
            script_sources = await page.eval_on_selector_all(
                "script[src]", "elements => elements.map(e => e.src)"
            )

            # Get all <link href=...>
            link_sources = await page.eval_on_selector_all(
                "link[href]", "elements => elements.map(e => e.href)"
            )

            return {
                "page_text": page_text,
                "script_sources": script_sources,
                "link_sources": link_sources,
            }
        finally:
            await browser.close()


# FastAPI application instance; the /scrape route below is registered on it.
app = FastAPI()

class ScrapeRequest(BaseModel):
    """Request body for the POST /scrape endpoint."""

    # Address of the page to scrape; passed unmodified to scraper().
    url: str

@app.post("/scrape")
async def scrape_endpoint(request: ScrapeRequest):
    """Scrape the URL given in the request body and return the result.

    Args:
        request: Parsed request body; only ``request.url`` is used.

    Returns:
        Whatever ``scraper(request.url)`` returns, unchanged.

    Raises:
        HTTPException: Status 500 with ``str(e)`` as the detail when
            ``scraper`` raises any exception.
    """
    try:
        return await scraper(request.url)
    except Exception as e:
        # Top-level boundary: convert any failure into a 500 response and
        # chain the original exception so the root cause stays in tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e