Spaces:
Sleeping
Sleeping
# app.py | |
import gradio as gr | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
# Import Firefox specific classes | |
from selenium.webdriver.firefox.service import Service as FirefoxService | |
from selenium.webdriver.firefox.options import Options as FirefoxOptions | |
# from selenium.webdriver.chrome.service import Service as ChromeService # No longer needed | |
# from selenium.webdriver.chrome.options import Options as ChromeOptions # No longer needed | |
from geopy.geocoders import Nominatim, ArcGIS | |
from geopy.exc import GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError | |
import time | |
import pandas as pd | |
import re | |
import os | |
import shutil # For finding geckodriver | |
def driversetup_huggingface():
    """Create a headless Firefox WebDriver for a Hugging Face Spaces container.

    geckodriver is looked up with ``shutil.which``, then at two common install
    paths, and finally by letting ``FirefoxService()`` resolve it on its own.

    Returns:
        A ``selenium.webdriver.Firefox`` instance, or ``None`` when the service
        or the browser could not be started (the reason is printed).
    """
    # Local import: only the failure diagnostic below needs it.
    import subprocess

    options = FirefoxOptions()
    options.add_argument("--headless")
    # Firefox sizes its window with --width/--height; --window-size is a
    # Chrome flag and is not a Firefox command-line option.
    options.add_argument("--width=1920")
    options.add_argument("--height=1080")
    # NOTE(review): Chrome-style flag kept from the original setup; Firefox
    # presumably ignores it -- confirm before removing.
    options.add_argument("--disable-gpu")
    # Firefox doesn't use --no-sandbox or --disable-dev-shm-usage in the same way as Chrome
    options.set_preference("intl.accept_languages", "en-US, en")
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
    )

    service = None
    geckodriver_path = shutil.which("geckodriver")
    if geckodriver_path:
        print(f"Using geckodriver found at: {geckodriver_path}")
        service = FirefoxService(executable_path=geckodriver_path)
    else:
        print("Geckodriver not found in PATH by shutil.which.")
        for path in ("/usr/bin/geckodriver", "/usr/local/bin/geckodriver"):
            if os.path.exists(path):
                print(f"Found geckodriver at common path: {path}")
                service = FirefoxService(executable_path=path)
                break

    if not service:
        print("Geckodriver not found in common paths. Attempting to initialize FirefoxService without explicit path...")
        print("Ensure 'firefox-esr' and 'geckodriver' are in packages.txt for HF Spaces.")
        try:
            # This will likely fail if geckodriver isn't installed and in PATH
            service = FirefoxService()
        except Exception as e_service:
            print(f"Could not initialize FirefoxService without explicit path: {e_service}")
            return None

    try:
        print("Setting up GeckoDriver (Firefox) for Hugging Face environment...")
        driver = webdriver.Firefox(service=service, options=options)
        print("GeckoDriver (Firefox) setup successful.")
    except Exception as e_webdriver:
        print(f"Error setting up GeckoDriver (Firefox): {e_webdriver}")
        if service and service.path:
            # Diagnostic only: check whether the geckodriver binary runs at all.
            # An argument list (no shell) avoids interpolating the path into a
            # shell command and avoids writing a scratch file to the CWD.
            try:
                completed = subprocess.run(
                    [service.path, "--version"],
                    capture_output=True,
                    text=True,
                    timeout=10,
                    check=False,
                )
                print(f"Geckodriver version check output: {completed.stdout}{completed.stderr}")
            except (OSError, subprocess.SubprocessError) as e_ver:
                print(f"Could not execute geckodriver version check: {e_ver}")
        return None

    # The navigator.webdriver override used by the Chrome variant of this app
    # is Chrome-specific, so nothing equivalent is applied for Firefox.
    return driver
def clean_address(address_str):
    """Normalise a scraped postal address before it is sent to a geocoder.

    Steps: collapse whitespace, drop ``floor-<text>`` fragments, rebuild the
    comma-separated parts without blanks, and append ``", India"`` to
    Mumbai/Maharashtra addresses that do not already name the country.

    Args:
        address_str: Raw address text; any non-string value yields ``""``.

    Returns:
        The cleaned address string (possibly empty).
    """
    if not isinstance(address_str, str):
        return ""

    cleaned_address = ' '.join(address_str.split())
    cleaned_address = re.sub(r'floor-\s*[\w\s]+,?', '', cleaned_address, flags=re.IGNORECASE)

    # Splitting on commas, stripping each part and dropping empties already
    # removes " ," and ",," artefacts, so no separate replace pass is needed.
    stripped_parts = (part.strip() for part in cleaned_address.split(','))
    cleaned_address = ', '.join(part for part in stripped_parts if part)

    lowered = cleaned_address.lower()
    # Match "india" as a whole word: a plain substring test is fooled by
    # names such as "Indiabulls" and would skip the country suffix.
    mentions_india = re.search(r'\bindia\b', lowered) is not None
    if not mentions_india and ("mumbai" in lowered or "maharashtra" in lowered):
        cleaned_address += ", India"
    return cleaned_address
def geocode_address_with_fallbacks(address_str, attempt_count=0):
    """Geocode an address with Nominatim, then ArcGIS, then a shortened retry.

    The address is cleaned with ``clean_address`` and tried against each
    geocoder in turn. If both fail on the first attempt and the address has
    more than three comma-separated parts, a shorter, more generic address is
    tried once more.

    Args:
        address_str: Address text to geocode.
        attempt_count: 0 for the initial call; the internal retry passes 1 so
            that the address is shortened only once.

    Returns:
        ``(latitude, longitude)`` on success, otherwise ``(None, None)``.
    """
    if not address_str or not address_str.strip():
        print("Address string is empty, cannot geocode.")
        return None, None

    cleaned_address = clean_address(address_str)
    if not cleaned_address:
        # Cleaning can strip everything (e.g. the text was only a floor
        # fragment); there is nothing meaningful left to query.
        print("Address string is empty after cleaning, cannot geocode.")
        return None, None
    print(f"Attempting to geocode cleaned address: '{cleaned_address}' (Attempt {attempt_count + 1})")

    nominatim_user_agent = f"gstin_gradio_app_hf_{int(time.time())}"
    geocoders_to_try = [
        ("Nominatim", Nominatim(user_agent=nominatim_user_agent)),
        ("ArcGIS", ArcGIS(timeout=10)),
    ]
    for name, geolocator in geocoders_to_try:
        try:
            print(f"Trying geocoder: {name}...")
            location = geolocator.geocode(cleaned_address, timeout=15)
            if location:
                print(f"Success with {name}: Lat: {location.latitude}, Lon: {location.longitude}")
                return location.latitude, location.longitude
            print(f"{name} could not geocode the address.")
        except (GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError) as e:
            print(f"{name} geocoding error: {e}")
        except Exception as e:
            # Best-effort: an unexpected failure in one provider must not
            # stop the next provider from being tried.
            print(f"An unexpected error occurred with {name}: {e}")
        time.sleep(1)  # Be respectful to APIs

    if attempt_count == 0:  # Try a more generic address only once
        parts = [s.strip() for s in cleaned_address.split(',') if s.strip()]
        if len(parts) > 3:
            # Keep the last four parts when that shortens the address. With
            # exactly four parts that would rebuild the same string and repeat
            # the identical failed queries, so drop the leading part instead.
            generic_parts = parts[-4:] if len(parts) > 4 else parts[1:]
            generic_address = ', '.join(generic_parts)
            print(f"Trying a more generic address: '{generic_address}'")
            return geocode_address_with_fallbacks(generic_address, attempt_count + 1)

    print("All geocoding attempts failed for the address.")
    return None, None
def _error_frame(message):
    """Wrap *message* in the single-column ``Error`` DataFrame returned on failure."""
    return pd.DataFrame({"Error": [message]})


def _parse_key_value_rows(rows):
    """Collect ``{label: value}`` pairs from the result table's ``<tr>`` elements."""
    parsed = {}
    for row in rows:
        header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split())
        value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split())
        if header_element and value_element:
            parsed[header_element.get_text(strip=True)] = value_element.get_text(strip=True)
            continue
        # Fallback layout: a plain two-cell row is read as (label, value).
        cells = row.find_all('td')
        if len(cells) == 2:
            key = cells[0].get_text(strip=True)
            if key:
                parsed[key] = cells[1].get_text(strip=True)
    return parsed


def get_gstin_details_for_gradio(gstin_number_input):
    """Scrape registration details for a GSTIN and geocode its principal address.

    Args:
        gstin_number_input: GSTIN as typed by the user; it is converted to a
            string, stripped and upper-cased before validation.

    Returns:
        On success, a two-column (``Field``, ``Value``) DataFrame with the
        queried GSTIN, the extracted fields and the latitude/longitude
        (``"N/A"`` when geocoding is skipped or fails). On any failure, a
        DataFrame with a single ``Error`` column describing the problem.
    """
    gstin_number = str(gstin_number_input).strip().upper()
    # isascii() rejects non-ASCII letters and digits that isalnum() accepts.
    if not (len(gstin_number) == 15 and gstin_number.isascii() and gstin_number.isalnum()):
        return _error_frame("Invalid GSTIN format. Must be 15 alphanumeric characters.")

    # Imported here so the block does not depend on a new file-level import.
    from selenium.common.exceptions import TimeoutException

    print(f"Initiating scraper for GSTIN: {gstin_number}")
    driver = driversetup_huggingface()  # Now uses Firefox setup
    if driver is None:
        print("WebDriver (Firefox) not initialized for scraper.")
        return _error_frame("WebDriver (Firefox) initialization failed. Check server logs for GeckoDriver errors.")

    extracted_data = {"GSTIN Queried": gstin_number}
    wait_time = 35
    url = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/"
    try:
        print(f"Navigating to URL: {url}")
        driver.get(url)
        time.sleep(1.5)  # Slightly longer pause for Firefox initial page load

        gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]'
        print(f"Waiting for GSTIN input box: {gstin_input_css_selector}")
        gstin_input = WebDriverWait(driver, wait_time).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector))
        )
        print("GSTIN input box visible.")
        gstin_input.clear()
        gstin_input.send_keys(gstin_number)
        print(f"Entered GSTIN: {gstin_number}")
        time.sleep(0.5)

        search_button_css_selector = 'button[aria-label="Search"]'
        print(f"Waiting for Search button: {search_button_css_selector}")
        search_button = WebDriverWait(driver, wait_time).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector))
        )
        print("Search button clickable.")
        driver.execute_script("arguments[0].scrollIntoView(true);", search_button)
        time.sleep(0.5)
        # Clicked through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", search_button)
        print("Clicked Search button using JavaScript.")

        results_table_css_selector = "div.eaKoeQ table tbody tr"
        print(f"Waiting for results table rows: {results_table_css_selector}")
        try:
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, results_table_css_selector))
            )
            print("Results table rows are present.")
            time.sleep(3)
        except TimeoutException:
            # Do not abort: with no result rows the page may still show a
            # CAPTCHA or a "No details found" notice, or hold the table under
            # a different container. The checks below report or handle that.
            print("Timed out waiting for results table rows; inspecting page source instead.")

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        table_container_div = soup.select_one("div.eaKoeQ")
        table = table_container_div.find('table') if table_container_div else None
        if not table:
            table = soup.find('table')
        if not table:
            msg = "No data table found on the page after search."
            if "captcha" in page_source.lower():
                msg = "CAPTCHA detected during scraping."
            elif "No details found" in page_source or "Invalid GSTIN" in page_source:
                msg = f"No details found for GSTIN {gstin_number} or invalid GSTIN."
            print(msg)
            return _error_frame(msg)

        rows = table.find_all('tr')
        if not rows:
            print("Table found, but no rows (<tr>) parsed from it.")
            return _error_frame("Data table found but no rows could be parsed.")

        raw_data = _parse_key_value_rows(rows)
        if not raw_data:
            print("Could not parse any key-value data from the table rows.")
            return _error_frame("Failed to parse key-value data from table rows.")

        # Label on the website -> label shown in the output table.
        fields_to_extract_map = {
            "Principal Place of Business": "Principal Business Address",
            "Additional Place of Business": "Additional Business Address(es)",
            "State Jurisdiction": "State Jurisdiction",
            "Centre Jurisdiction": "Centre Jurisdiction",
            "Date of Registration": "Registration Date",
            "Constitution of Business": "Business Constitution",
            "Taxpayer Type": "Taxpayer Type",
            "GSTIN Status": "GSTIN Status",
        }
        for web_key, display_key in fields_to_extract_map.items():
            extracted_data[display_key] = raw_data.get(web_key, "Not Found")

        address_to_geocode = extracted_data.get("Principal Business Address")
        if address_to_geocode not in [None, "Not Found", ""]:
            lat, lon = geocode_address_with_fallbacks(address_to_geocode)
            extracted_data["Address Latitude"] = lat if lat is not None else "N/A"
            extracted_data["Address Longitude"] = lon if lon is not None else "N/A"
        else:
            extracted_data["Address Latitude"] = "N/A"
            extracted_data["Address Longitude"] = "N/A"
            # Always log the skip; the earlier truthiness guard stayed silent
            # when the address was an empty string.
            print("Principal Place of Business not found or empty, skipping geocoding.")

        print(f"Successfully scraped data for {gstin_number}")
        return pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"])
    except Exception as e:
        # Top-level boundary for the UI callback: report instead of raising.
        print(f"An error occurred during scraping process for {gstin_number}: {e}")
        return _error_frame(f"Scraping process failed: {str(e)}")
    finally:
        # driver is never None here; the None case returned before the try.
        try:
            driver.quit()
            print("Browser closed.")
        except Exception as e_quit:
            print(f"Error quitting driver: {e_quit}")
# --- Gradio Interface --- | |
# Single-line textbox that receives the GSTIN to look up.
_gstin_textbox = gr.Textbox(
    label="Enter GSTIN",
    placeholder="Enter 15-character GSTIN (e.g., 27AAFCD5562R1Z5)",
    max_lines=1,
    info="The scraper will fetch details for the provided GSTIN from Masters India.",
)

# Two-column grid that displays the DataFrame returned by the scraper.
_details_grid = gr.DataFrame(
    label="GSTIN Details",
    headers=["Field", "Value"],
    wrap=True,
)

# Gradio app object wiring the scraper callback to the widgets above.
iface = gr.Interface(
    fn=get_gstin_details_for_gradio,
    inputs=_gstin_textbox,
    outputs=_details_grid,
    title="🧾 GSTIN Details Scraper & Verifier (Firefox Edition)",
    description="Enter a valid 15-character Indian GSTIN to fetch its registration details and attempt to geocode the principal place of business. Uses Masters India for scraping (with Firefox/GeckoDriver).",
    article="<p style='text-align: center;'>Powered by Selenium, BeautifulSoup, Geopy, and Gradio. <br>Note: Scraping may take 20-45 seconds. Geocoding accuracy may vary.</p>",
    examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]],
    allow_flagging="never",
    theme=gr.themes.Soft(),
)
if __name__ == '__main__':
    # SYSTEM == "spaces" selects the hosted launch settings; any other value
    # selects the debug launch with sharing explicitly off.
    running_on_spaces = os.environ.get("SYSTEM") == "spaces"
    launch_kwargs = {"debug": False} if running_on_spaces else {"debug": True, "share": False}
    iface.launch(**launch_kwargs)
# webdriver-manager # Useful for local testing with Firefox too | |
# # app.py | |
# import gradio as gr | |
# from bs4 import BeautifulSoup | |
# from selenium import webdriver | |
# from selenium.webdriver.common.by import By | |
# from selenium.webdriver.support.ui import WebDriverWait | |
# from selenium.webdriver.support import expected_conditions as EC | |
# from selenium.webdriver.chrome.service import Service as ChromeService | |
# from selenium.webdriver.chrome.options import Options as ChromeOptions | |
# from geopy.geocoders import Nominatim, ArcGIS | |
# from geopy.exc import GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError | |
# import time | |
# import pandas as pd | |
# import re | |
# import os | |
# import shutil # For finding chromedriver | |
# def driversetup_huggingface(): | |
# """Custom driver setup for Hugging Face Spaces (headless).""" | |
# options = ChromeOptions() | |
# options.add_argument("--headless") | |
# options.add_argument("--no-sandbox") | |
# # options.add_argument("--disable-gpu") | |
# # options.add_argument("--window-size=1920,1080") | |
# options.add_argument("--disable-dev-shm-usage") | |
# # options.add_argument("lang=en") | |
# # options.add_argument("start-maximized") | |
# # options.add_argument("disable-infobars") | |
# # options.add_argument("--disable-extensions") | |
# # options.add_argument("--disable-blink-features=AutomationControlled") | |
# options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36") | |
# # Attempt to find chromedriver - Hugging Face Spaces might have it in specific locations | |
# # or it might need to be installed via packages.txt or a Dockerfile. | |
# # For Gradio apps on Spaces, it's often pre-configured or easily installable. | |
# # Let's try common paths or rely on it being in PATH. | |
# # Check if chromedriver is in PATH or use a common location | |
# chromedriver_path = shutil.which("chromedriver") | |
# if chromedriver_path: | |
# print(f"Using chromedriver found at: {chromedriver_path}") | |
# service = ChromeService(executable_path=chromedriver_path) | |
# else: | |
# # Fallback if not in PATH - this might fail on HF if not installed correctly | |
# print("Chromedriver not found in PATH. Attempting to use 'chromedriver' directly (might fail).") | |
# print("For Hugging Face Spaces, ensure Chrome & Chromedriver are available in the environment.") | |
# print("You might need to add 'chromium-chromedriver' to a packages.txt file if using a Docker Space.") | |
# # As a last resort, try initializing without explicit path, hoping Selenium finds it. | |
# # This part is crucial for HF deployment and might need adjustment based on the HF Space environment. | |
# # For many Gradio spaces, simply having 'selenium' and 'chromedriver-binary' (or similar) | |
# # in requirements.txt might work if the base image is well-configured. | |
# # However, for full Chrome, system-level install is better. | |
# # For now, we'll proceed assuming it might be found or will error out gracefully. | |
# try: | |
# # This assumes chromedriver is globally available or Selenium can find it. | |
# # On Hugging Face, if using default Docker runtime, you might need to specify | |
# # apt packages like 'chromium-driver' or 'google-chrome-stable' + 'chromedriver' | |
# # in a packages.txt file or use a custom Dockerfile. | |
# # For simplicity, let's assume it can be found or will fail here. | |
# # A common path if installed via apt in a container: | |
# if os.path.exists("/usr/bin/chromedriver"): | |
# service = ChromeService(executable_path="/usr/bin/chromedriver") | |
# elif os.path.exists("/usr/local/bin/chromedriver"): | |
# service = ChromeService(executable_path="/usr/local/bin/chromedriver") | |
# else: | |
# # This will likely fail if chromedriver isn't installed and in PATH | |
# # On HF Spaces, you typically ensure this via environment setup (e.g. packages.txt) | |
# print("Attempting to initialize ChromeService without explicit path...") | |
# service = ChromeService() # May fail if chromedriver not in PATH | |
# except Exception as e: | |
# print(f"Could not initialize ChromeService: {e}. Ensure chromedriver is installed and in PATH.") | |
# return None | |
# try: | |
# print("Setting up ChromeDriver for Hugging Face environment...") | |
# driver = webdriver.Chrome(service=service, options=options) | |
# print("ChromeDriver setup successful.") | |
# except Exception as e: | |
# print(f"Error setting up ChromeDriver: {e}") | |
# return None | |
# driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});") | |
# return driver | |
# def clean_address(address_str): | |
# if not isinstance(address_str, str): | |
# return "" | |
# cleaned_address = ' '.join(address_str.split()) | |
# cleaned_address = re.sub(r'floor-\s*[\w\s]+,?', '', cleaned_address, flags=re.IGNORECASE) | |
# cleaned_address = cleaned_address.replace(' ,', ',').replace(',,', ',') | |
# cleaned_address = ', '.join(filter(None, (s.strip() for s in cleaned_address.split(',')))) | |
# if "india" not in cleaned_address.lower() and ("mumbai" in cleaned_address.lower() or "maharashtra" in cleaned_address.lower()): | |
# cleaned_address += ", India" | |
# return cleaned_address | |
# def geocode_address_with_fallbacks(address_str, attempt_count=0): | |
# if not address_str or not address_str.strip(): | |
# print("Address string is empty, cannot geocode.") | |
# return None, None | |
# cleaned_address = clean_address(address_str) | |
# print(f"Attempting to geocode cleaned address: '{cleaned_address}' (Attempt {attempt_count + 1})") | |
# nominatim_user_agent = f"gstin_gradio_app_hf_{int(time.time())}" | |
# geocoders_to_try = [ | |
# ("Nominatim", Nominatim(user_agent=nominatim_user_agent)), | |
# ("ArcGIS", ArcGIS(timeout=10)) | |
# ] | |
# for name, geolocator in geocoders_to_try: | |
# try: | |
# print(f"Trying geocoder: {name}...") | |
# location = geolocator.geocode(cleaned_address, timeout=15) | |
# if location: | |
# print(f"Success with {name}: Lat: {location.latitude}, Lon: {location.longitude}") | |
# return location.latitude, location.longitude | |
# else: | |
# print(f"{name} could not geocode the address.") | |
# except (GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError) as e: | |
# print(f"{name} geocoding error: {e}") | |
# except Exception as e: | |
# print(f"An unexpected error occurred with {name}: {e}") | |
# time.sleep(1) | |
# if attempt_count == 0: | |
# parts = [s.strip() for s in cleaned_address.split(',') if s.strip()] | |
# if len(parts) > 3: | |
# generic_address = ', '.join(parts[1:]) | |
# print(f"Trying a more generic address (v1): '{generic_address}'") | |
# lat, lon = geocode_address_with_fallbacks(generic_address, attempt_count + 1) | |
# if lat is not None: return lat, lon | |
# if len(parts) > 4: | |
# generic_address_v2 = ', '.join(parts[2:]) | |
# print(f"Trying a more generic address (v2): '{generic_address_v2}'") | |
# return geocode_address_with_fallbacks(generic_address_v2, attempt_count + 1) | |
# print("All geocoding attempts failed for the address.") | |
# return None, None | |
# def get_gstin_details_for_gradio(gstin_number_input): | |
# """ | |
# Main function for Gradio: takes GSTIN, scrapes, and returns data as DataFrame. | |
# """ | |
# gstin_number = str(gstin_number_input).strip().upper() | |
# if not (len(gstin_number) == 15 and gstin_number.isalnum()): | |
# return pd.DataFrame({"Error": ["Invalid GSTIN format. Must be 15 alphanumeric characters."]}) | |
# print(f"Initiating scraper for GSTIN: {gstin_number}") | |
# driver = driversetup_huggingface() | |
# if driver is None: | |
# print("WebDriver not initialized for scraper.") | |
# return pd.DataFrame({"Error": ["WebDriver initialization failed. Check server logs."]}) | |
# extracted_data = {"GSTIN Queried": gstin_number} | |
# wait_time = 30 | |
# url = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/" | |
# try: | |
# driver.get(url) | |
# print(f"Navigated to URL: {url}") | |
# gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]' | |
# WebDriverWait(driver, wait_time).until( | |
# EC.presence_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector)) | |
# ) | |
# gstin_input = driver.find_element(By.CSS_SELECTOR, gstin_input_css_selector) | |
# gstin_input.clear() | |
# gstin_input.send_keys(gstin_number) | |
# print(f"Entered GSTIN: {gstin_number}") | |
# search_button_css_selector = 'button[aria-label="Search"]' | |
# WebDriverWait(driver, wait_time).until( | |
# EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector)) | |
# ) | |
# search_button = driver.find_element(By.CSS_SELECTOR, search_button_css_selector) | |
# driver.execute_script("arguments[0].click();", search_button) | |
# print("Clicked Search button.") | |
# results_table_container_css_selector_for_wait = "div.eaKoeQ table" | |
# WebDriverWait(driver, wait_time).until( | |
# EC.presence_of_element_located((By.CSS_SELECTOR, results_table_container_css_selector_for_wait)) | |
# ) | |
# print("Results table container found.") | |
# time.sleep(4) | |
# page_source = driver.page_source | |
# soup = BeautifulSoup(page_source, 'html.parser') | |
# table_container_div = soup.select_one("div.eaKoeQ") | |
# table = None | |
# if table_container_div: table = table_container_div.find('table') | |
# if not table: table = soup.find('table') | |
# if not table: | |
# msg = "No data table found on the page after search." | |
# if "captcha" in page_source.lower(): msg = "CAPTCHA detected during scraping." | |
# elif "No details found" in page_source or "Invalid GSTIN" in page_source: | |
# msg = f"No details found for GSTIN {gstin_number} or invalid GSTIN." | |
# print(msg) | |
# return pd.DataFrame({"Error": [msg]}) | |
# rows = table.find_all('tr') | |
# raw_data = {} | |
# for row in rows: | |
# header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split()) | |
# value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split()) | |
# if header_element and value_element: | |
# raw_data[header_element.get_text(strip=True)] = value_element.get_text(strip=True) | |
# elif len(row.find_all('td')) == 2: | |
# cells = row.find_all('td') | |
# if cells[0].get_text(strip=True): | |
# raw_data[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True) | |
# if not raw_data: | |
# print("Could not parse any data from the table rows.") | |
# return pd.DataFrame({"Error": ["Failed to parse data from table."]}) | |
# fields_to_extract_map = { | |
# "Principal Place of Business": "Principal Business Address", | |
# "Additional Place of Business": "Additional Business Address(es)", | |
# "State Jurisdiction": "State Jurisdiction", | |
# "Centre Jurisdiction": "Centre Jurisdiction", | |
# "Date of Registration": "Registration Date", | |
# "Constitution of Business": "Business Constitution", | |
# "Taxpayer Type": "Taxpayer Type", | |
# "GSTIN Status": "GSTIN Status" | |
# } | |
# for web_key, display_key in fields_to_extract_map.items(): | |
# extracted_data[display_key] = raw_data.get(web_key, "Not Found") | |
# address_to_geocode = extracted_data.get("Principal Business Address") | |
# if address_to_geocode not in [None, "Not Found", ""]: | |
# lat, lon = geocode_address_with_fallbacks(address_to_geocode) | |
# extracted_data["Address Latitude"] = lat if lat is not None else "N/A" | |
# extracted_data["Address Longitude"] = lon if lon is not None else "N/A" | |
# else: | |
# extracted_data["Address Latitude"] = "N/A" | |
# extracted_data["Address Longitude"] = "N/A" | |
# if extracted_data.get("Principal Business Address"): | |
# print("Principal Place of Business not found or empty, skipping geocoding.") | |
# print(f"Successfully scraped data for {gstin_number}") | |
# # Convert dictionary to a 2-column DataFrame for Gradio | |
# df_output = pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"]) | |
# return df_output | |
# except Exception as e: | |
# print(f"An error occurred during scraping process for {gstin_number}: {e}") | |
# # import traceback | |
# # traceback.print_exc() | |
# return pd.DataFrame({"Error": [f"Scraping process failed: {str(e)}"]}) | |
# finally: | |
# if 'driver' in locals() and driver is not None: | |
# try: | |
# driver.quit() | |
# print("Browser closed.") | |
# except Exception as e: | |
# print(f"Error quitting driver: {e}") | |
# # --- Gradio Interface --- | |
# iface = gr.Interface( | |
# fn=get_gstin_details_for_gradio, | |
# inputs=gr.Textbox( | |
# label="Enter GSTIN", | |
# placeholder="Enter 15-character GSTIN (e.g., 27AAFCD5562R1Z5)", | |
# max_lines=1, | |
# info="The scraper will fetch details for the provided GSTIN from Masters India." | |
# ), | |
# outputs=gr.DataFrame( | |
# label="GSTIN Details", | |
# headers=["Field", "Value"], | |
# wrap=True | |
# ), | |
# title="🧾 GSTIN Details Scraper & Verifier", | |
# description="Enter a valid 15-character Indian GSTIN to fetch its registration details and attempt to geocode the principal place of business. Uses Masters India for scraping.", | |
# article="<p style='text-align: center;'>Powered by Selenium, BeautifulSoup, Geopy, and Gradio. <br>Note: Scraping may take 20-40 seconds. Geocoding accuracy may vary.</p>", | |
# examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]], # Example GSTINs | |
# allow_flagging="never", | |
# theme=gr.themes.Soft() # Using a soft theme | |
# ) | |
# if __name__ == '__main__': | |
# # For Hugging Face Spaces, Gradio typically handles the server. | |
# # This launch(share=True) is more for local testing if you want a public link temporarily. | |
# # On HF Spaces, just `iface.launch()` is enough. | |
# # To run locally: python app.py | |
# if os.environ.get("SYSTEM") == "spaces": # Check if running in Hugging Face Spaces | |
# iface.launch(debug=False) | |
# else: | |
# iface.launch(debug=True, share=True) |