#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
import aiohttp
import asyncio
from urllib.parse import quote
from config import (
    CONTENT_EXTRACTION,
    SEARCH_SELECTION,
    TCP_CONNECTOR_ENABLE_DNS_CACHE,
    TCP_CONNECTOR_TTL_DNS_CACHE,
    TCP_CONNECTOR_LIMIT,
    TCP_CONNECTOR_LIMIT_PER_HOST,
    TCP_CONNECTOR_FORCE_CLOSE,
    TCP_CONNECTOR_ENABLE_CLEANUP,
    ENABLE_TRUST_ENV,
    ENABLE_CONNECTOR_OWNER
)
from src.core.web_loader import web_loader

class BrowserEngine:
    # Thin client for the configured content-reader / search proxy API.
    # Request identity values (IP addresses, User-Agent, origin, referrer,
    # locale) are supplied by the shared web_loader helper.

    def __init__(self, configuration):
        self.config = configuration

    def generate_headers(self):
        # Fetch the identity values for this request from web_loader.
        ipv4 = web_loader.get_ipv4()
        ipv6 = web_loader.get_ipv6()
        user_agent = web_loader.get_user_agent()
        origin = web_loader.get_origin()
        referrer = web_loader.get_referrer()
        location = web_loader.get_location()
        # Assemble browser-like request headers, including forwarded-IP and
        # locale hints derived from the values above.
        return {
            "User-Agent": user_agent,
            "X-Forwarded-For": f"{ipv4}, {ipv6}",
            "X-Real-IP": ipv4,
            "X-Originating-IP": ipv4,
            "X-Remote-IP": ipv4,
            "X-Remote-Addr": ipv4,
            "X-Client-IP": ipv4,
            "X-Forwarded-Host": origin.replace("https://", "").replace("http://", ""),
            "Origin": origin,
            "Referer": referrer,
            "Accept-Language": f"{location['language']},en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
            "X-Country": location['country'],
            "X-Timezone": location['timezone']
        }
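    # Illustrative shape of the returned mapping (values vary with whatever
    # web_loader supplies; the ones below use documentation-reserved examples):
    #   {"User-Agent": "Mozilla/5.0 ...", "X-Forwarded-For": "198.51.100.7, 2001:db8::1",
    #    "Origin": "https://example.com", "Accept-Language": "en-US,en;q=0.9", ...}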

    def web_selector(self, search_query: str, search_provider: str = "google"):
        # Build the (request URL, CSS selector) pair for the chosen provider.
        # Baidu is queried through its own endpoint; Google and Bing go
        # through the SearXNG endpoint using bang prefixes.
        if search_provider == "baidu":
            return (
                f"{self.config.content_reader_api}{self.config.baidu_endpoint}?wd={quote(search_query)}",
                "#content_left"
            )
        provider_prefix = "!go" if search_provider == "google" else "!bi"
        return (
            f"{self.config.content_reader_api}{self.config.searxng_endpoint}?q={quote(f'{provider_prefix} {search_query}')}",
            "#urls"
        )
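    # Illustrative return values (the endpoint parts come from the
    # configuration object and are shown only as placeholders):
    #   web_selector("open data", "google")
    #     -> ("<content_reader_api><searxng_endpoint>?q=%21go%20open%20data", "#urls")
    #   web_selector("open data", "baidu")
    #     -> ("<content_reader_api><baidu_endpoint>?wd=open%20data", "#content_left")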

    async def web_request(self, method: str, url: str, headers: dict, data: dict = None):
        # Perform a single HTTP request using a fresh session and a TCP
        # connector configured from the module-level constants.
        timeout = aiohttp.ClientTimeout(total=self.config.request_timeout)
        connector = aiohttp.TCPConnector(
            use_dns_cache=TCP_CONNECTOR_ENABLE_DNS_CACHE,
            ttl_dns_cache=TCP_CONNECTOR_TTL_DNS_CACHE,
            limit=TCP_CONNECTOR_LIMIT,
            limit_per_host=TCP_CONNECTOR_LIMIT_PER_HOST,
            force_close=TCP_CONNECTOR_FORCE_CLOSE,
            enable_cleanup_closed=TCP_CONNECTOR_ENABLE_CLEANUP
        )
        async with aiohttp.ClientSession(
            timeout=timeout,
            connector=connector,
            trust_env=ENABLE_TRUST_ENV,
            connector_owner=ENABLE_CONNECTOR_OWNER
        ) as session:
            async with session.request(method, url, headers=headers, data=data) as response:
                # Read the body before validating the status so the response
                # is fully consumed, then raise on 4xx/5xx.
                text = await response.text()
                response.raise_for_status()
                return text

    async def _post(self, url: str, data: dict, headers: dict):
        return await self.web_request("POST", url, headers, data)

    async def _get(self, url: str, headers: dict):
        return await self.web_request("GET", url, headers)

    def extract_page_content(self, target_url: str) -> str:
        # Synchronous entry point: POST the target URL to the content reader
        # and return the extracted text followed by the CONTENT_EXTRACTION
        # block from config. Uses asyncio.run, so it must not be called from
        # inside an already running event loop.
        headers = self.generate_headers()
        payload = {"url": target_url}
        try:
            extracted_content = asyncio.run(self._post(self.config.content_reader_api, payload, headers))
            return f"{extracted_content}\n\n\n{CONTENT_EXTRACTION}\n\n\n"
        except Exception as error:
            return f"Error reading URL: {str(error)}"

    def perform_search(self, search_query: str, search_provider: str = "google") -> str:
        # Synchronous entry point: run a search through the selected provider
        # and return the raw results followed by the SEARCH_SELECTION block
        # from config.
        headers = self.generate_headers()
        full_url, selector = self.web_selector(search_query, search_provider)
        headers["X-Target-Selector"] = selector
        try:
            search_results = asyncio.run(self._get(full_url, headers))
            return f"{search_results}\n\n\n{SEARCH_SELECTION}\n\n\n"
        except Exception as error:
            return f"Error during search: {str(error)}"