searchgpt / src /engine /browser_engine.py
hadadrjt's picture
SearchGPT: Release pre-stable scripts.
02ce7c3
#
# SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
# SPDX-License-Identifier: Apache-2.0
#
import aiohttp
import asyncio
from urllib.parse import quote
from config import (
CONTENT_EXTRACTION,
SEARCH_SELECTION,
TCP_CONNECTOR_ENABLE_DNS_CACHE,
TCP_CONNECTOR_TTL_DNS_CACHE,
TCP_CONNECTOR_LIMIT,
TCP_CONNECTOR_LIMIT_PER_HOST,
TCP_CONNECTOR_FORCE_CLOSE,
TCP_CONNECTOR_ENABLE_CLEANUP,
ENABLE_TRUST_ENV,
ENABLE_CONNECTOR_OWNER
)
from src.core.web_loader import web_loader
class BrowserEngine:
    """Synchronous facade over an HTTP content-reader service.

    The engine builds browser-like request headers, composes search URLs
    that are routed through ``config.content_reader_api``, and exposes two
    blocking entry points: ``extract_page_content`` and ``perform_search``.
    Both run a single aiohttp request to completion with ``asyncio.run``.
    """

    def __init__(self, configuration):
        """Store the configuration object.

        The methods below read ``content_reader_api``, ``baidu_endpoint``,
        ``searxng_endpoint`` and ``request_timeout`` from it.
        """
        self.config = configuration

    def generate_headers(self) -> dict:
        """Build the request headers from values supplied by ``web_loader``.

        Returns:
            A new dict of header names to string values.
        """
        ipv4 = web_loader.get_ipv4()
        ipv6 = web_loader.get_ipv6()
        user_agent = web_loader.get_user_agent()
        origin = web_loader.get_origin()
        referrer = web_loader.get_referrer()
        # ``location`` is indexed with 'language', 'country' and 'timezone'.
        location = web_loader.get_location()

        # Host header value: the origin with its URL scheme stripped.
        forwarded_host = origin.replace("https://", "").replace("http://", "")

        return {
            "User-Agent": user_agent,
            "X-Forwarded-For": f"{ipv4}, {ipv6}",
            "X-Real-IP": ipv4,
            "X-Originating-IP": ipv4,
            "X-Remote-IP": ipv4,
            "X-Remote-Addr": ipv4,
            "X-Client-IP": ipv4,
            "X-Forwarded-Host": forwarded_host,
            "Origin": origin,
            "Referer": referrer,
            "Accept-Language": f"{location['language']},en;q=0.9",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-User": "?1",
            "Cache-Control": "max-age=0",
            "X-Country": location['country'],
            "X-Timezone": location['timezone'],
        }

    def web_selector(self, search_query: str, search_provider: str = "google") -> tuple:
        """Return ``(url, css_selector)`` for a search request.

        Args:
            search_query: Raw query text; it is percent-encoded here.
            search_provider: ``"baidu"`` uses the Baidu endpoint directly.
                ``"google"`` is sent to the SearXNG endpoint with the
                ``!go`` prefix; any other value is sent with ``!bi``.

        Returns:
            A 2-tuple of the full request URL (prefixed with
            ``config.content_reader_api``) and the selector string that
            ``perform_search`` forwards in ``X-Target-Selector``.
        """
        reader_api = self.config.content_reader_api

        if search_provider == "baidu":
            encoded_query = quote(search_query)
            return (
                f"{reader_api}{self.config.baidu_endpoint}?wd={encoded_query}",
                "#content_left",
            )

        provider_prefix = "!go" if search_provider == "google" else "!bi"
        encoded_query = quote(f"{provider_prefix} {search_query}")
        return (
            f"{reader_api}{self.config.searxng_endpoint}?q={encoded_query}",
            "#urls",
        )

    async def web_request(self, method: str, url: str, headers: dict, data: dict = None) -> str:
        """Send one HTTP request and return the response body as text.

        Args:
            method: HTTP method name passed to ``session.request``.
            url: Target URL.
            headers: Request headers.
            data: Optional request payload, passed as ``data=``.

        Returns:
            The decoded response body.

        Raises:
            aiohttp.ClientResponseError: If the response status is 400 or
                higher (raised by ``raise_for_status``).
        """
        timeout = aiohttp.ClientTimeout(total=self.config.request_timeout)
        connector = aiohttp.TCPConnector(
            use_dns_cache=TCP_CONNECTOR_ENABLE_DNS_CACHE,
            ttl_dns_cache=TCP_CONNECTOR_TTL_DNS_CACHE,
            limit=TCP_CONNECTOR_LIMIT,
            limit_per_host=TCP_CONNECTOR_LIMIT_PER_HOST,
            force_close=TCP_CONNECTOR_FORCE_CLOSE,
            enable_cleanup_closed=TCP_CONNECTOR_ENABLE_CLEANUP,
        )
        # NOTE(review): a fresh connector is created per call. If
        # ENABLE_CONNECTOR_OWNER is false the session presumably will not
        # close it on exit -- confirm the configured value.
        async with aiohttp.ClientSession(
            timeout=timeout,
            connector=connector,
            trust_env=ENABLE_TRUST_ENV,
            connector_owner=ENABLE_CONNECTOR_OWNER,
        ) as session:
            async with session.request(method, url, headers=headers, data=data) as response:
                # Check the status before reading the body: an error
                # response should surface as an HTTP error instead of
                # being downloaded and decoded first (where a decode
                # failure could mask the real status).
                response.raise_for_status()
                return await response.text()

    async def _post(self, url: str, data: dict, headers: dict) -> str:
        """POST ``data`` to ``url`` and return the response text."""
        return await self.web_request("POST", url, headers, data)

    async def _get(self, url: str, headers: dict) -> str:
        """GET ``url`` and return the response text."""
        return await self.web_request("GET", url, headers)

    def extract_page_content(self, target_url: str) -> str:
        """Fetch the content of ``target_url`` through the content-reader API.

        Blocking: drives the request with ``asyncio.run``.

        Returns:
            The reader's response followed by ``CONTENT_EXTRACTION``, or a
            string starting with ``"Error reading URL: "`` if the request
            raised.
        """
        headers = self.generate_headers()
        payload = {"url": target_url}
        try:
            extracted_content = asyncio.run(
                self._post(self.config.content_reader_api, payload, headers)
            )
        except Exception as error:
            # Best-effort by design: failures are reported in the returned
            # text instead of propagating to the caller.
            return f"Error reading URL: {error!s}"
        return f"{extracted_content}\n\n\n{CONTENT_EXTRACTION}\n\n\n"

    def perform_search(self, search_query: str, search_provider: str = "google") -> str:
        """Run a web search through the content-reader API.

        Blocking: drives the request with ``asyncio.run``.

        Args:
            search_query: Query text.
            search_provider: Provider name understood by ``web_selector``.

        Returns:
            The reader's response followed by ``SEARCH_SELECTION``, or a
            string starting with ``"Error during search: "`` if the request
            raised.
        """
        headers = self.generate_headers()
        full_url, selector = self.web_selector(search_query, search_provider)
        # Forward the selector returned by web_selector to the reader.
        headers["X-Target-Selector"] = selector
        try:
            search_results = asyncio.run(self._get(full_url, headers))
        except Exception as error:
            # Best-effort by design: failures are reported in the returned
            # text instead of propagating to the caller.
            return f"Error during search: {error!s}"
        return f"{search_results}\n\n\n{SEARCH_SELECTION}\n\n\n"