|
import html
import time
from typing import Dict, List, Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from googlesearch import search

from config import Config
|
|
|
|
|
class GoogleSearchTool:
    """Google Search tool for legal questions with insufficient information.

    Runs several Vietnamese-law-oriented variants of a query through Google,
    downloads each hit, extracts a short snippet from the page and offers two
    formatters: plain text (for LLM context) and HTML (for UI display).
    """

    # Number of hits requested from Google for each query variant.
    _RESULTS_PER_QUERY = 3
    # Timeout, in seconds, for downloading a single result page.
    _REQUEST_TIMEOUT = 15
    # Upper bounds used when assembling a snippet.
    _MAX_SNIPPET_SENTENCES = 3
    _SNIPPET_SOFT_LIMIT = 400
    _FALLBACK_SNIPPET_CHARS = 600
    # Maximum snippet length shown in the HTML rendering.
    _DISPLAY_SNIPPET_CHARS = 200

    _DEFAULT_TITLE = "Không có tiêu đề"
    _DEFAULT_SNIPPET = "Không có nội dung"

    # Browser-like headers that ask for Vietnamese content first.
    _REQUEST_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "vi-VN,vi;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }

    # Tags removed before text extraction because they hold no article text.
    _NOISE_TAGS = ["script", "style", "nav", "header", "footer", "aside", "iframe"]

    # CSS selectors tried in order to locate the main content container.
    _CONTENT_SELECTORS = (
        "article",
        "main",
        ".content",
        ".post-content",
        ".entry-content",
        ".article-content",
        ".news-content",
        "#content",
        ".main-content",
    )

    # Lower-case terms that mark a sentence as legally relevant.
    _LEGAL_KEYWORDS = (
        "luật",
        "điều",
        "khoản",
        "quy định",
        "nghị định",
        "thông tư",
        "quyền",
        "nghĩa vụ",
    )

    def __init__(self):
        # Seconds to wait after each page fetch, to avoid hammering sites.
        self.search_delay = 1

    def search_legal_info(
        self, query: str, num_results: Optional[int] = None
    ) -> List[Dict[str, str]]:
        """Search the web for Vietnamese legal information about ``query``.

        Args:
            query: The user's question or search terms.
            num_results: Maximum number of results to return. When ``None``,
                ``Config.GOOGLE_SEARCH_RESULTS_COUNT`` is used.

        Returns:
            A list of at most ``num_results`` dicts with the keys ``url``,
            ``title``, ``snippet`` and ``domain``. Errors are printed and
            swallowed (best effort); an unexpected failure yields ``[]``.
        """
        if num_results is None:
            num_results = Config.GOOGLE_SEARCH_RESULTS_COUNT

        try:
            enhanced_queries = [
                f"{query} luật pháp Việt Nam site:thuvienphapluat.vn",
                f"{query} pháp luật Việt Nam site:moj.gov.vn",
                f"{query} quy định pháp luật Việt Nam",
                f"{query} luật việt nam điều khoản",
            ]

            all_results: List[Dict[str, str]] = []
            seen_urls = set()

            for enhanced_query in enhanced_queries:
                if len(all_results) >= num_results:
                    break
                try:
                    self._collect_results(
                        enhanced_query, num_results, all_results, seen_urls
                    )
                except Exception as e:
                    # One failing query variant must not abort the others.
                    print(f"Error with search query '{enhanced_query}': {e}")

            return all_results[:num_results]

        except Exception as e:
            print(f"Error performing Google search: {e}")
            return []

    def _collect_results(
        self,
        enhanced_query: str,
        num_results: int,
        all_results: List[Dict[str, str]],
        seen_urls: set,
    ) -> None:
        """Run one query variant and append usable hits to ``all_results``."""
        hits = search(
            enhanced_query, num_results=self._RESULTS_PER_QUERY, lang="vi"
        )
        for url in hits:
            if len(all_results) >= num_results:
                break
            if url in seen_urls:
                continue
            seen_urls.add(url)

            try:
                content = self._get_page_content(url)
                # Pages without an extractable snippet are not worth citing.
                if content and content.get("snippet"):
                    all_results.append(
                        {
                            "url": url,
                            "title": content.get("title", self._DEFAULT_TITLE),
                            "snippet": content.get(
                                "snippet", self._DEFAULT_SNIPPET
                            ),
                            "domain": self._extract_domain(url),
                        }
                    )
                time.sleep(self.search_delay)
            except Exception as e:
                print(f"Error fetching content from {url}: {e}")

    def _extract_domain(self, url: str) -> str:
        """Return the network location of ``url``, or ``"Unknown"`` if unparsable."""
        try:
            return urlparse(url).netloc
        except (ValueError, TypeError, AttributeError):
            # urlparse raises ValueError on malformed hosts (e.g. a broken
            # IPv6 literal) and TypeError/AttributeError on non-string input.
            return "Unknown"

    def _get_page_content(self, url: str) -> Dict[str, str]:
        """Download ``url`` and extract its title and a short snippet.

        Returns:
            ``{"title": ..., "snippet": ...}`` on success, or ``{}`` when the
            page cannot be fetched or parsed (the error is printed).
        """
        try:
            response = requests.get(
                url, headers=self._REQUEST_HEADERS, timeout=self._REQUEST_TIMEOUT
            )
            response.raise_for_status()

            # Parse the raw bytes so BeautifulSoup detects the charset itself.
            # ``response.encoding`` is deliberately not consulted: it may be
            # ``None`` and it has no influence on ``response.content``.
            soup = BeautifulSoup(response.content, "html.parser")

            title_tag = soup.find("title")
            title = title_tag.get_text().strip() if title_tag else ""
            if not title:
                title = self._DEFAULT_TITLE

            for element in soup(self._NOISE_TAGS):
                element.decompose()

            text = self._clean_text(self._select_main_text(soup))
            return {"title": title, "snippet": self._build_snippet(text)}

        except Exception as e:
            print(f"Error extracting content from {url}: {e}")
            return {}

    @classmethod
    def _select_main_text(cls, soup: "BeautifulSoup") -> str:
        """Return the text of the first matching content container.

        Falls back to ``<body>`` and finally to the whole document.
        """
        main_content = None
        for selector in cls._CONTENT_SELECTORS:
            main_content = soup.select_one(selector)
            if main_content:
                break

        if not main_content:
            main_content = soup.find("body")

        return main_content.get_text() if main_content else soup.get_text()

    @staticmethod
    def _clean_text(raw_text: str) -> str:
        """Collapse page text into a single line of meaningful phrases.

        Lines are split into phrases on double spaces, and phrases of three
        characters or fewer (stray separators, bullets) are discarded. The
        split is done per phrase rather than per word so that short
        Vietnamese words inside a sentence are preserved.
        """
        phrases = []
        for line in raw_text.splitlines():
            for phrase in line.strip().split("  "):
                phrase = phrase.strip()
                if len(phrase) > 3:
                    phrases.append(phrase)
        return " ".join(phrases)

    @classmethod
    def _build_snippet(cls, text: str) -> str:
        """Build a snippet from the first legally relevant sentences of ``text``.

        Up to three sentences containing a legal keyword are joined with
        ``". "``; collection also stops once they exceed the soft length
        limit. When no sentence matches, the start of the text is used,
        truncated with ``"..."`` if it is longer than the fallback limit.
        """
        relevant_sentences: List[str] = []
        for sentence in text.split("."):
            lowered = sentence.lower()
            if not any(keyword in lowered for keyword in cls._LEGAL_KEYWORDS):
                continue
            relevant_sentences.append(sentence.strip())
            if (
                len(relevant_sentences) >= cls._MAX_SNIPPET_SENTENCES
                or len(" ".join(relevant_sentences)) > cls._SNIPPET_SOFT_LIMIT
            ):
                break

        if relevant_sentences:
            return ". ".join(relevant_sentences)
        if len(text) > cls._FALLBACK_SNIPPET_CHARS:
            return text[: cls._FALLBACK_SNIPPET_CHARS] + "..."
        return text

    def format_search_results(self, results: List[Dict[str, str]]) -> str:
        """Format search results as plain text for LLM context.

        Missing ``title``/``snippet``/``domain``/``url`` keys fall back to
        defaults instead of raising ``KeyError``.
        """
        if not results:
            return "Không tìm thấy thông tin liên quan."

        entries = []
        for i, result in enumerate(results, 1):
            entries.append(
                f"**Nguồn {i}: {result.get('title', self._DEFAULT_TITLE)}**\n"
                f"Website: {result.get('domain', 'Unknown')}\n"
                f"Nội dung: {result.get('snippet', self._DEFAULT_SNIPPET)}\n"
                f"Link: {result.get('url', '')}\n\n"
            )
        return "".join(entries)

    def format_search_results_for_display(self, results: List[Dict[str, str]]) -> str:
        """Format search results as an HTML fragment with clickable links.

        Titles, snippets, domains and URLs originate from arbitrary web pages,
        so each one is HTML-escaped before being placed in the markup.
        Snippets are truncated to 200 characters (before escaping, so that an
        entity is never cut in half) and suffixed with ``"..."``.
        """
        if not results:
            return "Không tìm thấy thông tin tham khảo từ web."

        parts = [
            '<div style="background-color: #f8f9fa; padding: 15px; border-radius: 8px; margin: 10px 0;">',
            '<h4 style="color: #1e40af; margin-bottom: 15px;">🌐 Nguồn tham khảo từ web:</h4>',
        ]

        for i, result in enumerate(results, 1):
            raw_snippet = result.get("snippet", self._DEFAULT_SNIPPET)
            snippet_escaped = html.escape(
                raw_snippet[: self._DISPLAY_SNIPPET_CHARS]
            )
            if len(raw_snippet) > self._DISPLAY_SNIPPET_CHARS:
                snippet_escaped += "..."

            title_escaped = html.escape(result.get("title", self._DEFAULT_TITLE))
            domain_escaped = html.escape(result.get("domain", "Unknown"))
            # quote=True also escapes quotes so the URL cannot break out of
            # the href attribute.
            url_escaped = html.escape(result.get("url", ""), quote=True)

            parts.append(
                f"""<div style="background-color: white; padding: 12px; margin-bottom: 10px; border-radius: 6px; border-left: 4px solid #3b82f6;">
    <h5 style="margin: 0; color: #1e40af;">
        <a href="{url_escaped}" target="_blank" style="text-decoration: none; color: #1e40af;">
            {i}. {title_escaped}
        </a>
    </h5>
    <p style="color: #6b7280; font-size: 0.9em; margin: 5px 0;">
        📄 {domain_escaped}
    </p>
    <p style="margin: 8px 0; color: #374151; line-height: 1.5;">
        {snippet_escaped}
    </p>
    <a href="{url_escaped}" target="_blank" style="color: #3b82f6; text-decoration: none; font-size: 0.9em;">
        🔗 Xem chi tiết →
    </a>
</div>"""
            )

        parts.append("</div>")
        return "".join(parts)
|
|