| | import os |
| | import re |
| | import requests |
| | import pandas as pd |
| | import gradio as gr |
| | import time |
| | import random |
| | from bs4 import BeautifulSoup |
| | from dateutil.parser import parse |
| | from datetime import datetime, timedelta |
| | from requests.adapters import HTTPAdapter |
| | from urllib3.util.retry import Retry |
| |
|
| | |
try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    def correct_text(raw_text: str) -> str:
        """Paraphrase & correct via T5-small, with fallback on error."""
        try:
            prompt = "paraphrase and correct: " + raw_text.strip()
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs, max_length=128)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception:
            # Any tokenization/generation failure falls back to the raw input.
            return raw_text

except Exception:
    # FIX: was `except ImportError`, which only covered a missing transformers
    # package. `from_pretrained` can also fail with OSError / network errors
    # when the model cannot be downloaded; catch those too so the module still
    # imports and the chatbot degrades gracefully.
    def correct_text(raw_text: str) -> str:
        """Identity fallback used when the T5 model is unavailable."""
        return raw_text
| |
|
| | |
def create_robust_session():
    """Build a requests.Session that transparently retries transient failures.

    Retries up to 5 times with exponential backoff on connection errors and
    on the usual transient HTTP statuses (429 and 5xx), for both GET and POST.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    sess = requests.Session()
    for scheme in ("http://", "https://"):
        sess.mount(scheme, retrying_adapter)
    return sess
| |
|
| | |
def scrape_gem_cppp(keyword="", org_name="", start_date=None, end_date=None, max_pages=10):
    """Scrape tender data from the GeM CPPP portal with robust error handling.

    Args:
        keyword: Search term sent as the tender-title filter.
        org_name: Issuing-organization filter.
        start_date: Optional datetime lower bound on the closing date
            (also sent to the portal as a form filter).
        end_date: Optional datetime upper bound on the closing date.
        max_pages: Hard cap on the number of result pages fetched.

    Returns:
        list[dict]: One dict per tender row with title, organization, dates,
        reference ID, and detail/download links.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://gem.gov.in/cppp',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'
    }

    session = create_robust_session()

    tenders = []
    page = 1
    total_pages = max_pages  # may grow as pagination is discovered; still capped by max_pages
    # FIX: the original `except requests.Timeout` / `RequestException` handlers
    # did `continue` without advancing `page` or bounding attempts, so a page
    # that always fails looped forever. Cap consecutive failures per page.
    error_streak = 0
    max_error_streak = 3

    while page <= total_pages and page <= max_pages:
        try:
            print(f"Fetching page {page} of maximum {max_pages}")

            form_data = {
                'page': str(page),
                'tid': '',
                'title': keyword,
                'orgname': org_name,
                'startdate': start_date.strftime('%d-%m-%Y') if start_date else '',
                'enddate': end_date.strftime('%d-%m-%Y') if end_date else '',
                't_outrefid': '',
                'search': '1',
            }

            # Small random delay so we don't hammer the portal.
            time.sleep(random.uniform(0.5, 1.5))

            resp = session.post(
                "https://gem.gov.in/cppp",
                headers=headers,
                data=form_data,
                timeout=(30, 60)  # (connect, read) seconds
            )

            if resp.status_code != 200:
                print(f"Error: Received status code {resp.status_code}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")

            table = soup.find("table", {"class": "table"})
            if not table:
                print(f"No tender table found on page {page}")
                break

            rows = table.find_all("tr")[1:]  # skip the header row
            if not rows:
                print(f"No tender rows found on page {page}")
                break

            print(f"Found {len(rows)} tender rows on page {page}")
            error_streak = 0  # page fetched and parsed; reset the failure counter

            for row in rows:
                cols = row.find_all("td")
                if len(cols) < 8:
                    continue  # malformed / filler row

                try:
                    closing = cols[0].get_text(strip=True)
                    opening_date = cols[1].get_text(strip=True)
                    publish_date = cols[2].get_text(strip=True)

                    title_el = cols[3].find("a")
                    title = title_el.get_text(strip=True) if title_el else cols[3].get_text(strip=True)

                    # Make relative detail links absolute.
                    link = ""
                    if title_el and title_el.has_attr("href"):
                        link = title_el["href"]
                    if link and link.startswith("/"):
                        link = "https://gem.gov.in" + link

                    org = cols[4].get_text(strip=True)

                    # The title cell also carries the reference ID: strip the
                    # title text out of it, else fall back to an ID-shaped regex.
                    full_text = cols[3].get_text(strip=True)
                    ref_id = ""
                    if title in full_text:
                        ref_id = full_text.replace(title, "").strip("/").strip()
                    else:
                        id_match = re.search(r'[A-Za-z0-9_-]+/\d+', full_text)
                        if id_match:
                            ref_id = id_match.group(0)

                    dl_el = cols[7].find("a")
                    dl_link = ""
                    if dl_el and dl_el.has_attr("href"):
                        dl_link = dl_el["href"]
                    if dl_link and dl_link.startswith("/"):
                        dl_link = "https://gem.gov.in" + dl_link

                    # Client-side closing-date filter; rows with unparseable
                    # dates are deliberately kept.
                    try:
                        if closing:
                            cdate = parse(closing)
                            if start_date and cdate < start_date:
                                continue
                            if end_date and cdate > end_date:
                                continue
                    except Exception:
                        pass

                    tenders.append({
                        "Title": title,
                        "Organization": org,
                        "Closing Date": closing,
                        "Opening Date": opening_date,
                        "Published Date": publish_date,
                        "Reference/Tender ID": ref_id,
                        "Tender Link": link,
                        "Download Link": dl_link
                    })

                except Exception as row_err:
                    # One bad row must not abort the whole page.
                    print(f"Error processing row on page {page}: {row_err}")
                    continue

            # Pagination: keep going while a "Next" link exists, and learn the
            # real page count from the numbered links.
            pag = soup.find("ul", {"class": "pagination"})
            next_page_exists = False

            if pag:
                next_link = pag.find("a", string=re.compile(r"Next", re.I))
                if next_link:
                    next_page_exists = True

                # FIX: loop variable renamed from `link` to avoid shadowing the
                # tender detail-link variable used above.
                for page_anchor in pag.find_all("a"):
                    try:
                        page_num = int(page_anchor.get_text(strip=True))
                        total_pages = max(total_pages, page_num)
                    except (ValueError, TypeError):
                        pass

            if not next_page_exists:
                print(f"No next page found after page {page}")
                break

            page += 1

        except requests.Timeout:
            error_streak += 1
            print(f"Timeout error on page {page}. Retrying...")
            if error_streak >= max_error_streak:
                print(f"Giving up on page {page} after {error_streak} consecutive errors")
                break
            continue

        except requests.RequestException as e:
            error_streak += 1
            print(f"Request error on page {page}: {e}")
            if error_streak >= max_error_streak:
                print(f"Giving up on page {page} after {error_streak} consecutive errors")
                break
            time.sleep(5)  # back off before retrying the same page
            continue

        except Exception as e:
            print(f"Unexpected error on page {page}: {e}")
            break

    print(f"Scraping completed: found {len(tenders)} tenders across {page} pages")
    return tenders
| |
|
| | |
def summarize_tenders(tenders: list[dict]) -> str:
    """Format scraped tenders as a Markdown summary for the chat UI.

    Args:
        tenders: Tender dicts as produced by scrape_gem_cppp.

    Returns:
        A Markdown string listing each tender (title, organization, dates,
        reference ID, download link), or a "no results" message.
    """
    if not tenders:
        return "No tenders were found matching those criteria."

    lines = [f"I found {len(tenders)} tenders matching your criteria:\n"]

    # Newest closing date first; if any date fails to parse, keep scrape order.
    try:
        tenders = sorted(tenders,
                         key=lambda x: parse(x.get("Closing Date", "01-01-2000")),
                         reverse=True)
    except Exception:
        pass

    for idx, t in enumerate(tenders, 1):
        # FIX: bullet characters below were mojibake ("β’") in the original
        # user-visible output; restored to "•". Mandatory fields now use
        # .get() so one malformed row cannot KeyError the whole summary.
        title = t.get("Title", "")
        if t.get("Tender Link"):
            lines.append(f"{idx}. [{title}]({t['Tender Link']})")
        else:
            lines.append(f"{idx}. {title}")

        lines.append(f"   • Organization: {t.get('Organization', '')}")
        lines.append(f"   • Closing Date: {t.get('Closing Date', '')}")

        if t.get("Opening Date") and t["Opening Date"].strip():
            lines.append(f"   • Opening Date: {t['Opening Date']}")

        if t.get("Published Date") and t["Published Date"].strip():
            lines.append(f"   • Published Date: {t['Published Date']}")

        if t.get("Reference/Tender ID") and t["Reference/Tender ID"].strip():
            lines.append(f"   • Ref ID: {t['Reference/Tender ID']}")

        if t.get("Download Link") and t["Download Link"].strip():
            lines.append(f"   • [Download Tender Document]({t['Download Link']})")

        lines.append("")  # blank separator between entries

    return "\n".join(lines)
| |
|
| | |
def chat_fn(user_message: str, history):
    """Parse a chat message into search filters and return a tender summary.

    `history` is required by Gradio's ChatInterface signature but is unused.
    Extraction is heuristic: a dd/mm/yyyy date range, an organization phrase,
    and a keyword phrase are pulled out of the (optionally T5-corrected) text.
    """
    print(f"User Message: {user_message}")

    try:
        corrected = correct_text(user_message)
        print(f"Corrected Text: {corrected}")

        # --- date range: "from D to D" or "between D and D" -----------------
        start_date = end_date = None
        range_patterns = (
            r"from\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+to\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
            r"between\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+and\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
        )
        for rng in range_patterns:
            hit = re.search(rng, corrected, re.I)
            if not hit:
                continue
            try:
                start_date = parse(hit.group(1))
                end_date = parse(hit.group(2))
                print(f"Dates extracted: {start_date} to {end_date}")
                break
            except Exception as e:
                print(f"Date parsing error: {e}")

        # --- organization phrase --------------------------------------------
        lowered = corrected.lower()
        org = ""
        for org_pat in (
            r"from\s+ministry\s+of\s+(\w+)",
            r"from\s+(\w+)\s+ministry",
            r"by\s+(\w+\s+\w+)",
            r"organization\s+(\w+\s+\w+)",
        ):
            org_hit = re.search(org_pat, lowered)
            if org_hit:
                org = org_hit.group(1)
                print(f"Organization extracted: {org}")
                break

        # --- keyword: verb-phrase capture, else stop-word-filtered words ----
        stops = {"find", "search", "get", "tenders", "tender", "from", "to",
                 "between", "after", "before", "the", "and", "of", "in"}

        kw_hit = re.search(r"(?:get|find|search)\s+(.*?)\s+tenders?", lowered)
        if kw_hit:
            keyword = kw_hit.group(1).strip()
        else:
            tokens = re.findall(r"\b\w+\b", lowered)
            keyword = " ".join(tok for tok in tokens
                               if tok not in stops and len(tok) > 2)

        print(f"Final keyword: '{keyword}'")

        results = scrape_gem_cppp(
            keyword=keyword.strip(),
            org_name=org,
            start_date=start_date,
            end_date=end_date,
            max_pages=10,
        )
        bot_reply = summarize_tenders(results)

    except Exception as e:
        import traceback
        print(f"Error in chat function: {e}")
        print(traceback.format_exc())
        bot_reply = f"Sorry, an error occurred while processing your request: {str(e)}"

    return bot_reply
| |
|
| | |
# --- Gradio UI --------------------------------------------------------------
# A Blocks layout holding two Markdown headers and a ChatInterface that routes
# every user message through chat_fn.
with gr.Blocks() as demo:
    gr.Markdown("## Government Tender Search Chatbot")
    gr.Markdown("Ask me to find tenders by keyword, organization, or date range.")
    gr.ChatInterface(
        fn=chat_fn,
        title="TenderBot",
        description="E.g. Search solar panel tenders from 01/06/2025 to 30/06/2025",
        # Clickable starter prompts shown under the chat box.
        examples=[
            "Find solar panel tenders",
            "Search for IT tenders from Ministry of Defense",
            "Get construction tenders from 01/05/2025 to 30/06/2025"
        ],
    )

if __name__ == "__main__":
    # share=False keeps the app local; debug=True surfaces tracebacks.
    demo.launch(debug=True, share=False)