Spaces:

loringuyen
/

QA-system-in-Vietnamese-law

Sleeping

App Files Files Community

QA-system-in-Vietnamese-law / utils /google_search.py

loringuyen

Upload folder using huggingface_hub

32238e9 verified 3 months ago

raw

history blame contribute delete

9.04 kB

	import requests
	from googlesearch import search
	from bs4 import BeautifulSoup
	from typing import List, Dict
	import time
	from config import Config
	from urllib.parse import urlparse


	class GoogleSearchTool:
	    """Google Search tool for legal questions with insufficient information.

	    Runs several Vietnamese-law-oriented variants of a query through
	    ``googlesearch.search``, downloads every hit with ``requests``, and
	    reduces each page to a title plus a short snippet that favours
	    sentences containing legal terms. Two formatters turn the result list
	    into plain text (for LLM context) or HTML (for UI display).
	    """

	    # Browser-like headers sent with every page download.
	    _REQUEST_HEADERS = {
	        "User-Agent": (
	            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	            "AppleWebKit/537.36 (KHTML, like Gecko) "
	            "Chrome/91.0.4472.124 Safari/537.36"
	        ),
	        # "*/*" is the catch-all media range; a bare "/" is not valid.
	        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	        "Accept-Language": "vi-VN,vi;q=0.9,en;q=0.8",
	        "Accept-Encoding": "gzip, deflate",
	        "Connection": "keep-alive",
	    }

	    # Elements removed from the parsed page before text extraction.
	    _NOISE_TAGS = ["script", "style", "nav", "header", "footer", "aside", "iframe"]

	    # CSS selectors tried in order to locate the main content container.
	    _CONTENT_SELECTORS = (
	        "article",
	        "main",
	        ".content",
	        ".post-content",
	        ".entry-content",
	        ".article-content",
	        ".news-content",
	        "#content",
	        ".main-content",
	    )

	    # Lower-case terms that mark a sentence as legally relevant.
	    _LEGAL_KEYWORDS = (
	        "luật",
	        "điều",
	        "khoản",
	        "quy định",
	        "nghị định",
	        "thông tư",
	        "quyền",
	        "nghĩa vụ",
	    )

	    def __init__(self, search_delay: float = 1, request_timeout: float = 15):
	        """Create the tool.

	        Args:
	            search_delay: Seconds to sleep after each page download.
	            request_timeout: Timeout in seconds passed to ``requests.get``.
	        """
	        self.search_delay = search_delay
	        self.request_timeout = request_timeout

	    def search_legal_info(
	        self, query: str, num_results: int = None
	    ) -> List[Dict[str, str]]:
	        """Search the web for ``query`` and return up to ``num_results`` hits.

	        Args:
	            query: The user's question or search phrase.
	            num_results: Maximum number of results to return. When ``None``,
	                ``Config.GOOGLE_SEARCH_RESULTS_COUNT`` is used.

	        Returns:
	            A list of dicts with the keys ``url``, ``title``, ``snippet`` and
	            ``domain``. Errors are printed and swallowed; an unexpected
	            failure yields an empty list.
	        """
	        if num_results is None:
	            num_results = Config.GOOGLE_SEARCH_RESULTS_COUNT

	        try:
	            # Enhanced Vietnamese legal query patterns
	            enhanced_queries = [
	                f"{query} luật pháp Việt Nam site:thuvienphapluat.vn",
	                f"{query} pháp luật Việt Nam site:moj.gov.vn",
	                f"{query} quy định pháp luật Việt Nam",
	                f"{query} luật việt nam điều khoản",
	            ]

	            all_results = []
	            seen_urls = set()

	            # Try the query variants in order until enough results are found.
	            for enhanced_query in enhanced_queries:
	                if len(all_results) >= num_results:
	                    break

	                try:
	                    for url in search(enhanced_query, num_results=3, lang="vi"):
	                        if len(all_results) >= num_results:
	                            break

	                        # The same page can surface for several variants.
	                        if url in seen_urls:
	                            continue
	                        seen_urls.add(url)

	                        try:
	                            result = self._fetch_result(url)
	                            if result:
	                                all_results.append(result)
	                            time.sleep(self.search_delay)
	                        except Exception as e:
	                            print(f"Error fetching content from {url}: {e}")
	                            continue

	                except Exception as e:
	                    print(f"Error with search query '{enhanced_query}': {e}")
	                    continue

	            return all_results[:num_results]

	        except Exception as e:
	            print(f"Error performing Google search: {e}")
	            return []

	    def _fetch_result(self, url: str) -> Dict[str, str]:
	        """Download ``url`` and build one result dict; empty dict if unusable."""
	        content = self._get_page_content(url)
	        if not content or not content.get("snippet"):
	            return {}
	        return {
	            "url": url,
	            "title": content.get("title", "Không có tiêu đề"),
	            "snippet": content.get("snippet", "Không có nội dung"),
	            "domain": self._extract_domain(url),
	        }

	    def _extract_domain(self, url: str) -> str:
	        """Return the network location of ``url``, or ``"Unknown"``.

	        ``"Unknown"`` is returned both when the URL cannot be parsed and when
	        it has no network location (for example, a string without a scheme).
	        """
	        try:
	            return urlparse(url).netloc or "Unknown"
	        except (ValueError, TypeError, AttributeError):
	            return "Unknown"

	    def _get_page_content(self, url: str) -> Dict[str, str]:
	        """Download a page and extract its title and a text snippet.

	        Args:
	            url: Address of the page to download.

	        Returns:
	            ``{"title": ..., "snippet": ...}`` on success, or an empty dict
	            when the download or the parsing fails (the error is printed).
	        """
	        try:
	            response = requests.get(
	                url, headers=self._REQUEST_HEADERS, timeout=self.request_timeout
	            )
	            response.raise_for_status()

	            # The raw bytes are handed to BeautifulSoup, which detects the
	            # document encoding itself; ``response.encoding`` only affects
	            # ``response.text`` and may be None, so it is not consulted.
	            soup = BeautifulSoup(response.content, "html.parser")

	            title_tag = soup.find("title")
	            title = title_tag.get_text().strip() if title_tag else "Không có tiêu đề"

	            # Remove unwanted elements
	            for element in soup(self._NOISE_TAGS):
	                element.decompose()

	            text = self._clean_text(self._main_text(soup))
	            return {"title": title, "snippet": self._build_snippet(text)}

	        except Exception as e:
	            # Best-effort boundary: one bad page must not abort the search.
	            print(f"Error extracting content from {url}: {e}")
	            return {}

	    def _main_text(self, soup) -> str:
	        """Return the text of the first matching content container.

	        Falls back to ``<body>`` and finally to the whole document.
	        """
	        main_content = None
	        for selector in self._CONTENT_SELECTORS:
	            main_content = soup.select_one(selector)
	            if main_content is not None:
	                break

	        if main_content is None:
	            main_content = soup.find("body")

	        if main_content is None:
	            return soup.get_text()
	        return main_content.get_text()

	    @staticmethod
	    def _clean_text(text: str) -> str:
	        """Collapse page text into one line of space-separated phrases.

	        Each line is stripped and split on double spaces; fragments of three
	        characters or fewer are dropped as layout noise. Splitting on phrases
	        rather than single words keeps short words inside a phrase intact.
	        """
	        phrases = (
	            phrase.strip()
	            for line in text.splitlines()
	            for phrase in line.strip().split("  ")
	        )
	        return " ".join(phrase for phrase in phrases if len(phrase) > 3)

	    @classmethod
	    def _build_snippet(cls, text: str) -> str:
	        """Pick up to three keyword-bearing sentences, else a 600-char prefix."""
	        relevant_sentences = []
	        for sentence in text.split("."):
	            lowered = sentence.lower()
	            if any(keyword in lowered for keyword in cls._LEGAL_KEYWORDS):
	                relevant_sentences.append(sentence.strip())
	                # Only three sentences are ever used, so stop early.
	                if (
	                    len(relevant_sentences) == 3
	                    or len(" ".join(relevant_sentences)) > 400
	                ):
	                    break

	        if relevant_sentences:
	            return ". ".join(relevant_sentences)
	        return text[:600] + "..." if len(text) > 600 else text

	    def format_search_results(self, results: List[Dict[str, str]]) -> str:
	        """Format search results as plain text for LLM context.

	        Args:
	            results: Dicts with ``title``, ``snippet`` and ``url`` keys and an
	                optional ``domain`` key.

	        Returns:
	            One four-line entry per result, each followed by a blank line, or
	            a fixed "nothing found" message when ``results`` is empty.
	        """
	        if not results:
	            return "Không tìm thấy thông tin liên quan."

	        return "".join(
	            f"Nguồn {i}: {result['title']}\n"
	            f"Website: {result.get('domain', 'Unknown')}\n"
	            f"Nội dung: {result['snippet']}\n"
	            f"Link: {result['url']}\n\n"
	            for i, result in enumerate(results, 1)
	        )

	    def format_search_results_for_display(self, results: List[Dict[str, str]]) -> str:
	        """Format search results as HTML cards with clickable links.

	        Titles, snippets, domains and URLs come from scraped web pages, so
	        all of them are HTML-escaped before being placed into the markup.
	        Snippets are cut to 200 characters (with a trailing ``...``).

	        Args:
	            results: Dicts with ``title``, ``snippet`` and ``url`` keys and an
	                optional ``domain`` key.

	        Returns:
	            An HTML fragment, or a fixed "nothing found" message when
	            ``results`` is empty.
	        """
	        from html import escape

	        if not results:
	            return "Không tìm thấy thông tin tham khảo từ web."

	        # Every markup line starts at column 0: no leading whitespace.
	        parts = [
	            '<div style="background-color: #f8f9fa; padding: 15px; border-radius: 8px; margin: 10px 0;">',
	            '<h4 style="color: #1e40af; margin-bottom: 15px;">🌐 Nguồn tham khảo từ web:</h4>',
	        ]

	        for i, result in enumerate(results, 1):
	            title_escaped = escape(result["title"])
	            # Truncate first so an entity is never cut in half.
	            snippet_escaped = escape(result["snippet"][:200])
	            if len(result["snippet"]) > 200:
	                snippet_escaped += "..."
	            url_escaped = escape(result["url"], quote=True)
	            domain_escaped = escape(result.get("domain", "Unknown"))

	            card_lines = [
	                '<div style="background-color: white; padding: 12px; margin-bottom: 10px; border-radius: 6px; border-left: 4px solid #3b82f6;">',
	                '<h5 style="margin: 0; color: #1e40af;">',
	                f'<a href="{url_escaped}" target="_blank" style="text-decoration: none; color: #1e40af;">',
	                f"{i}. {title_escaped}",
	                "</a>",
	                "</h5>",
	                '<p style="color: #6b7280; font-size: 0.9em; margin: 5px 0;">',
	                f"📄 {domain_escaped}",
	                "</p>",
	                '<p style="margin: 8px 0; color: #374151; line-height: 1.5;">',
	                snippet_escaped,
	                "</p>",
	                f'<a href="{url_escaped}" target="_blank" style="color: #3b82f6; text-decoration: none; font-size: 0.9em;">',
	                "🔗 Xem chi tiết →",
	                "</a>",
	                "</div>",
	            ]
	            parts.append("\n".join(card_lines))

	        parts.append("</div>")
	        return "".join(parts)