| | import os |
| | import re |
| | import requests |
| | import pandas as pd |
| | import gradio as gr |
| | import time |
| | import random |
| | from bs4 import BeautifulSoup |
| | from dateutil.parser import parse |
| | from datetime import datetime, timedelta |
| | from requests.adapters import HTTPAdapter |
| | from urllib3.util.retry import Retry |
| |
|
| | |
try:
    from transformers import T5ForConditionalGeneration, T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small")

    def correct_text(raw_text: str) -> str:
        """Paraphrase & correct via T5-small, with fallback on error."""
        try:
            prompt = "paraphrase and correct: " + raw_text.strip()
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
            outputs = model.generate(**inputs, max_length=128)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception:
            # Any tokenization/generation failure falls back to the raw input.
            return raw_text

except Exception:
    # FIX: was `except ImportError`, which only covered a missing transformers
    # package. `from_pretrained` can also fail with OSError / network errors
    # when the model cannot be downloaded; catch those too so the module still
    # imports and the chatbot degrades gracefully.
    def correct_text(raw_text: str) -> str:
        """Identity fallback used when the T5 model is unavailable."""
        return raw_text
| |
|
| | |
def create_robust_session():
    """Build a requests.Session that transparently retries transient failures.

    Retries up to 5 times with exponential backoff on connection errors and
    on the usual transient HTTP statuses (429 and 5xx), for both GET and POST.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    sess = requests.Session()
    for scheme in ("http://", "https://"):
        sess.mount(scheme, retrying_adapter)
    return sess
| |
|
| | |
def scrape_gem_cppp(keyword="", org_name="", start_date=None, end_date=None, max_pages=10):
    """Scrape tender data from the GeM CPPP portal with robust error handling.

    Args:
        keyword: Search term sent as the tender-title filter.
        org_name: Issuing-organization filter.
        start_date: Optional datetime lower bound on the closing date
            (also sent to the portal as a form filter).
        end_date: Optional datetime upper bound on the closing date.
        max_pages: Hard cap on the number of result pages fetched.

    Returns:
        list[dict]: One dict per tender row with title, organization, dates,
        reference ID, and detail/download links.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://gem.gov.in/cppp',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive'
    }

    session = create_robust_session()

    tenders = []
    page = 1
    total_pages = max_pages  # may grow as pagination is discovered; still capped by max_pages
    # FIX: the original `except requests.Timeout` / `RequestException` handlers
    # did `continue` without advancing `page` or bounding attempts, so a page
    # that always fails looped forever. Cap consecutive failures per page.
    error_streak = 0
    max_error_streak = 3

    while page <= total_pages and page <= max_pages:
        try:
            print(f"Fetching page {page} of maximum {max_pages}")

            form_data = {
                'page': str(page),
                'tid': '',
                'title': keyword,
                'orgname': org_name,
                'startdate': start_date.strftime('%d-%m-%Y') if start_date else '',
                'enddate': end_date.strftime('%d-%m-%Y') if end_date else '',
                't_outrefid': '',
                'search': '1',
            }

            # Small random delay so we don't hammer the portal.
            time.sleep(random.uniform(0.5, 1.5))

            resp = session.post(
                "https://gem.gov.in/cppp",
                headers=headers,
                data=form_data,
                timeout=(30, 60)  # (connect, read) seconds
            )

            if resp.status_code != 200:
                print(f"Error: Received status code {resp.status_code}")
                break

            soup = BeautifulSoup(resp.text, "html.parser")

            table = soup.find("table", {"class": "table"})
            if not table:
                print(f"No tender table found on page {page}")
                break

            rows = table.find_all("tr")[1:]  # skip the header row
            if not rows:
                print(f"No tender rows found on page {page}")
                break

            print(f"Found {len(rows)} tender rows on page {page}")
            error_streak = 0  # page fetched and parsed; reset the failure counter

            for row in rows:
                cols = row.find_all("td")
                if len(cols) < 8:
                    continue  # malformed / filler row

                try:
                    closing = cols[0].get_text(strip=True)
                    opening_date = cols[1].get_text(strip=True)
                    publish_date = cols[2].get_text(strip=True)

                    title_el = cols[3].find("a")
                    title = title_el.get_text(strip=True) if title_el else cols[3].get_text(strip=True)

                    # Make relative detail links absolute.
                    link = ""
                    if title_el and title_el.has_attr("href"):
                        link = title_el["href"]
                    if link and link.startswith("/"):
                        link = "https://gem.gov.in" + link

                    org = cols[4].get_text(strip=True)

                    # The title cell also carries the reference ID: strip the
                    # title text out of it, else fall back to an ID-shaped regex.
                    full_text = cols[3].get_text(strip=True)
                    ref_id = ""
                    if title in full_text:
                        ref_id = full_text.replace(title, "").strip("/").strip()
                    else:
                        id_match = re.search(r'[A-Za-z0-9_-]+/\d+', full_text)
                        if id_match:
                            ref_id = id_match.group(0)

                    dl_el = cols[7].find("a")
                    dl_link = ""
                    if dl_el and dl_el.has_attr("href"):
                        dl_link = dl_el["href"]
                    if dl_link and dl_link.startswith("/"):
                        dl_link = "https://gem.gov.in" + dl_link

                    # Client-side closing-date filter; rows with unparseable
                    # dates are deliberately kept.
                    try:
                        if closing:
                            cdate = parse(closing)
                            if start_date and cdate < start_date:
                                continue
                            if end_date and cdate > end_date:
                                continue
                    except Exception:
                        pass

                    tenders.append({
                        "Title": title,
                        "Organization": org,
                        "Closing Date": closing,
                        "Opening Date": opening_date,
                        "Published Date": publish_date,
                        "Reference/Tender ID": ref_id,
                        "Tender Link": link,
                        "Download Link": dl_link
                    })

                except Exception as row_err:
                    # One bad row must not abort the whole page.
                    print(f"Error processing row on page {page}: {row_err}")
                    continue

            # Pagination: keep going while a "Next" link exists, and learn the
            # real page count from the numbered links.
            pag = soup.find("ul", {"class": "pagination"})
            next_page_exists = False

            if pag:
                next_link = pag.find("a", string=re.compile(r"Next", re.I))
                if next_link:
                    next_page_exists = True

                # FIX: loop variable renamed from `link` to avoid shadowing the
                # tender detail-link variable used above.
                for page_anchor in pag.find_all("a"):
                    try:
                        page_num = int(page_anchor.get_text(strip=True))
                        total_pages = max(total_pages, page_num)
                    except (ValueError, TypeError):
                        pass

            if not next_page_exists:
                print(f"No next page found after page {page}")
                break

            page += 1

        except requests.Timeout:
            error_streak += 1
            print(f"Timeout error on page {page}. Retrying...")
            if error_streak >= max_error_streak:
                print(f"Giving up on page {page} after {error_streak} consecutive errors")
                break
            continue

        except requests.RequestException as e:
            error_streak += 1
            print(f"Request error on page {page}: {e}")
            if error_streak >= max_error_streak:
                print(f"Giving up on page {page} after {error_streak} consecutive errors")
                break
            time.sleep(5)  # back off before retrying the same page
            continue

        except Exception as e:
            print(f"Unexpected error on page {page}: {e}")
            break

    print(f"Scraping completed: found {len(tenders)} tenders across {page} pages")
    return tenders
| |
|
| | |
def summarize_tenders(tenders: list[dict]) -> str:
    """Format scraped tenders as a Markdown summary for the chat UI.

    Args:
        tenders: Tender dicts as produced by scrape_gem_cppp.

    Returns:
        A Markdown string listing each tender (title, organization, dates,
        reference ID, download link), or a "no results" message.
    """
    if not tenders:
        return "No tenders were found matching those criteria."

    lines = [f"I found {len(tenders)} tenders matching your criteria:\n"]

    # Newest closing date first; if any date fails to parse, keep scrape order.
    try:
        tenders = sorted(tenders,
                         key=lambda x: parse(x.get("Closing Date", "01-01-2000")),
                         reverse=True)
    except Exception:
        pass

    for idx, t in enumerate(tenders, 1):
        # FIX: bullet characters below were mojibake ("β’") in the original
        # user-visible output; restored to "•". Mandatory fields now use
        # .get() so one malformed row cannot KeyError the whole summary.
        title = t.get("Title", "")
        if t.get("Tender Link"):
            lines.append(f"{idx}. [{title}]({t['Tender Link']})")
        else:
            lines.append(f"{idx}. {title}")

        lines.append(f"   • Organization: {t.get('Organization', '')}")
        lines.append(f"   • Closing Date: {t.get('Closing Date', '')}")

        if t.get("Opening Date") and t["Opening Date"].strip():
            lines.append(f"   • Opening Date: {t['Opening Date']}")

        if t.get("Published Date") and t["Published Date"].strip():
            lines.append(f"   • Published Date: {t['Published Date']}")

        if t.get("Reference/Tender ID") and t["Reference/Tender ID"].strip():
            lines.append(f"   • Ref ID: {t['Reference/Tender ID']}")

        if t.get("Download Link") and t["Download Link"].strip():
            lines.append(f"   • [Download Tender Document]({t['Download Link']})")

        lines.append("")  # blank separator between entries

    return "\n".join(lines)
| |
|
| | |
def chat_fn(user_message: str, history):
    """Parse a chat message into search filters and return a tender summary.

    `history` is required by Gradio's ChatInterface signature but is unused.
    Extraction is heuristic: a dd/mm/yyyy date range, an organization phrase,
    and a keyword phrase are pulled out of the (optionally T5-corrected) text.
    """
    print(f"User Message: {user_message}")

    try:
        corrected = correct_text(user_message)
        print(f"Corrected Text: {corrected}")

        # --- date range: "from D to D" or "between D and D" -----------------
        start_date = end_date = None
        range_patterns = (
            r"from\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+to\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
            r"between\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})\s+and\s+(\d{1,2}[/-]\d{1,2}[/-]\d{4})",
        )
        for rng in range_patterns:
            hit = re.search(rng, corrected, re.I)
            if not hit:
                continue
            try:
                start_date = parse(hit.group(1))
                end_date = parse(hit.group(2))
                print(f"Dates extracted: {start_date} to {end_date}")
                break
            except Exception as e:
                print(f"Date parsing error: {e}")

        # --- organization phrase --------------------------------------------
        lowered = corrected.lower()
        org = ""
        for org_pat in (
            r"from\s+ministry\s+of\s+(\w+)",
            r"from\s+(\w+)\s+ministry",
            r"by\s+(\w+\s+\w+)",
            r"organization\s+(\w+\s+\w+)",
        ):
            org_hit = re.search(org_pat, lowered)
            if org_hit:
                org = org_hit.group(1)
                print(f"Organization extracted: {org}")
                break

        # --- keyword: verb-phrase capture, else stop-word-filtered words ----
        stops = {"find", "search", "get", "tenders", "tender", "from", "to",
                 "between", "after", "before", "the", "and", "of", "in"}

        kw_hit = re.search(r"(?:get|find|search)\s+(.*?)\s+tenders?", lowered)
        if kw_hit:
            keyword = kw_hit.group(1).strip()
        else:
            tokens = re.findall(r"\b\w+\b", lowered)
            keyword = " ".join(tok for tok in tokens
                               if tok not in stops and len(tok) > 2)

        print(f"Final keyword: '{keyword}'")

        results = scrape_gem_cppp(
            keyword=keyword.strip(),
            org_name=org,
            start_date=start_date,
            end_date=end_date,
            max_pages=10,
        )
        bot_reply = summarize_tenders(results)

    except Exception as e:
        import traceback
        print(f"Error in chat function: {e}")
        print(traceback.format_exc())
        bot_reply = f"Sorry, an error occurred while processing your request: {str(e)}"

    return bot_reply
| |
|
| | |
# --- Gradio UI --------------------------------------------------------------
# A Blocks layout holding two Markdown headers and a ChatInterface that routes
# every user message through chat_fn.
with gr.Blocks() as demo:
    gr.Markdown("## Government Tender Search Chatbot")
    gr.Markdown("Ask me to find tenders by keyword, organization, or date range.")
    gr.ChatInterface(
        fn=chat_fn,
        title="TenderBot",
        description="E.g. Search solar panel tenders from 01/06/2025 to 30/06/2025",
        # Clickable starter prompts shown under the chat box.
        examples=[
            "Find solar panel tenders",
            "Search for IT tenders from Ministry of Defense",
            "Get construction tenders from 01/05/2025 to 30/06/2025"
        ],
    )

if __name__ == "__main__":
    # share=False keeps the app local; debug=True surfaces tracebacks.
    demo.launch(debug=True, share=False)