# app.py
"""Gradio app: scrape GSTIN registration details and geocode the address.

The app drives a headless Firefox (GeckoDriver) session against the Masters
India GSTIN search page, parses the result table with BeautifulSoup, tries to
geocode the principal place of business with geopy, and shows the result as a
two-column table in a Gradio interface.

NOTE: an earlier Chrome/ChromeDriver variant of this file used to be kept
below as commented-out code. It has been removed; retrieve it from version
control if it is ever needed again.
"""

import os
import re
import shutil  # For finding geckodriver
import subprocess
import time

import gradio as gr
import pandas as pd
from bs4 import BeautifulSoup
from geopy.exc import GeocoderServiceError, GeocoderTimedOut, GeocoderUnavailable
from geopy.geocoders import ArcGIS, Nominatim
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
# Firefox specific classes
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

GSTIN_SEARCH_URL = "https://www.mastersindia.co/gst-number-search-and-gstin-verification/"
WAIT_TIME_SECONDS = 35

# Locations probed when geckodriver is not on PATH.
_GECKODRIVER_FALLBACK_PATHS = ("/usr/bin/geckodriver", "/usr/local/bin/geckodriver")

# ASCII-only on purpose: str.isalnum() would also accept non-ASCII letters/digits.
_GSTIN_PATTERN = re.compile(r"[0-9A-Z]{15}")

# Label shown on the website -> label shown in the output table.
_FIELDS_TO_EXTRACT = {
    "Principal Place of Business": "Principal Business Address",
    "Additional Place of Business": "Additional Business Address(es)",
    "State Jurisdiction": "State Jurisdiction",
    "Centre Jurisdiction": "Centre Jurisdiction",
    "Date of Registration": "Registration Date",
    "Constitution of Business": "Business Constitution",
    "Taxpayer Type": "Taxpayer Type",
    "GSTIN Status": "GSTIN Status",
}


def _build_firefox_service():
    """Return a FirefoxService for the first geckodriver found, or None."""
    geckodriver_path = shutil.which("geckodriver")
    if geckodriver_path:
        print(f"Using geckodriver found at: {geckodriver_path}")
        return FirefoxService(executable_path=geckodriver_path)

    print("Geckodriver not found in PATH by shutil.which.")
    for path in _GECKODRIVER_FALLBACK_PATHS:
        if os.path.exists(path):
            print(f"Found geckodriver at common path: {path}")
            return FirefoxService(executable_path=path)

    print("Geckodriver not found in common paths. Attempting to initialize FirefoxService without explicit path...")
    print("Ensure 'firefox-esr' and 'geckodriver' are in packages.txt for HF Spaces.")
    try:
        # This will likely fail if geckodriver isn't installed and in PATH.
        return FirefoxService()
    except Exception as e_service:
        print(f"Could not initialize FirefoxService without explicit path: {e_service}")
        return None


def _log_geckodriver_version(driver_path):
    """Print the output of ``<driver_path> --version`` as a startup diagnostic."""
    try:
        # Argument list (no shell) and captured output: no temp file is
        # written and the path is never interpreted by a shell.
        result = subprocess.run(
            [driver_path, "--version"],
            capture_output=True,
            text=True,
            timeout=10,
            check=False,
        )
        print(f"Geckodriver version check output: {result.stdout}{result.stderr}")
    except (OSError, subprocess.SubprocessError) as e_ver:
        print(f"Could not execute geckodriver version check: {e_ver}")


def driversetup_huggingface():
    """Create a headless Firefox WebDriver for Hugging Face Spaces.

    Returns:
        A ``selenium.webdriver.Firefox`` instance, or ``None`` when either the
        GeckoDriver service or the browser itself could not be started (the
        reason is printed to stdout).
    """
    options = FirefoxOptions()
    options.add_argument("--headless")
    # Firefox takes the window size as separate width/height switches;
    # "--window-size" and "--disable-gpu" are Chromium switches.
    options.add_argument("--width=1920")
    options.add_argument("--height=1080")

    # User agent and other settings
    options.set_preference("intl.accept_languages", "en-US, en")
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0",
    )

    service = _build_firefox_service()
    if service is None:
        return None

    try:
        print("Setting up GeckoDriver (Firefox) for Hugging Face environment...")
        driver = webdriver.Firefox(service=service, options=options)
        print("GeckoDriver (Firefox) setup successful.")
    except Exception as e_webdriver:
        print(f"Error setting up GeckoDriver (Firefox): {e_webdriver}")
        driver_path = getattr(service, "path", None)
        if driver_path:
            _log_geckodriver_version(driver_path)
        return None

    return driver


def clean_address(address_str):
    """Normalise a scraped address string before geocoding.

    Collapses whitespace, removes ``floor-...`` fragments, tidies commas and
    appends ", India" when the address mentions Mumbai or Maharashtra but not
    India. Returns an empty string for non-string input.
    """
    if not isinstance(address_str, str):
        return ""

    cleaned_address = ' '.join(address_str.split())
    cleaned_address = re.sub(r'floor-\s*[\w\s]+,?', '', cleaned_address, flags=re.IGNORECASE)
    cleaned_address = cleaned_address.replace(' ,', ',').replace(',,', ',')
    # Drop empty comma-separated segments left behind by the steps above.
    cleaned_address = ', '.join(filter(None, (s.strip() for s in cleaned_address.split(','))))

    lowered = cleaned_address.lower()
    if "india" not in lowered and ("mumbai" in lowered or "maharashtra" in lowered):
        cleaned_address += ", India"
    return cleaned_address


def geocode_address_with_fallbacks(address_str, attempt_count=0):
    """Geocode an address with Nominatim, then ArcGIS, then a shorter address.

    Args:
        address_str: Raw address text; it is passed through ``clean_address``.
        attempt_count: 0 for the first call. Only the first call retries with
            a more generic address (the last four comma-separated parts).

    Returns:
        ``(latitude, longitude)`` on success, ``(None, None)`` otherwise.
    """
    if not address_str or not address_str.strip():
        print("Address string is empty, cannot geocode.")
        return None, None

    cleaned_address = clean_address(address_str)
    print(f"Attempting to geocode cleaned address: '{cleaned_address}' (Attempt {attempt_count + 1})")

    # NOTE(review): the user agent changes on every call; Nominatim's usage
    # policy asks for an identifying user agent - consider a fixed value.
    nominatim_user_agent = f"gstin_gradio_app_hf_{int(time.time())}"
    geocoders_to_try = [
        ("Nominatim", Nominatim(user_agent=nominatim_user_agent)),
        ("ArcGIS", ArcGIS(timeout=10)),
    ]

    for name, geolocator in geocoders_to_try:
        try:
            print(f"Trying geocoder: {name}...")
            location = geolocator.geocode(cleaned_address, timeout=15)
            if location:
                print(f"Success with {name}: Lat: {location.latitude}, Lon: {location.longitude}")
                return location.latitude, location.longitude
            print(f"{name} could not geocode the address.")
        except (GeocoderTimedOut, GeocoderUnavailable, GeocoderServiceError) as e:
            print(f"{name} geocoding error: {e}")
        except Exception as e:
            print(f"An unexpected error occurred with {name}: {e}")
        time.sleep(1)  # Be respectful to APIs

    if attempt_count == 0:  # Try a more generic address only once
        parts = [s.strip() for s in cleaned_address.split(',') if s.strip()]
        if len(parts) > 3:
            generic_address = ', '.join(parts[-4:])
            print(f"Trying a more generic address: '{generic_address}'")
            return geocode_address_with_fallbacks(generic_address, attempt_count + 1)

    print("All geocoding attempts failed for the address.")
    return None, None


def _error_frame(message):
    """Wrap an error message in the single-column DataFrame shown to the user."""
    return pd.DataFrame({"Error": [message]})


def _diagnose_missing_results(page_source, gstin_number):
    """Return the most specific message for a page without a result table."""
    if "captcha" in page_source.lower():
        return "CAPTCHA detected during scraping."
    if "No details found" in page_source or "Invalid GSTIN" in page_source:
        return f"No details found for GSTIN {gstin_number} or invalid GSTIN."
    return "No data table found on the page after search."


def _parse_key_value_rows(rows):
    """Build a ``{label: value}`` dict from the result table's ``<tr>`` rows.

    A row contributes when it has the styled header/value cell pair, or,
    failing that, exactly two ``<td>`` cells with a non-empty first cell.
    """
    raw_data = {}
    for row in rows:
        header_element = row.find('th', class_=lambda x: x and 'eLVLDP' in x.split())
        value_element = row.find('td', class_=lambda x: x and 'jdgLDg' in x.split())
        if header_element and value_element:
            raw_data[header_element.get_text(strip=True)] = value_element.get_text(strip=True)
            continue

        cells = row.find_all('td')
        if len(cells) == 2:
            key = cells[0].get_text(strip=True)
            if key:
                raw_data[key] = cells[1].get_text(strip=True)
    return raw_data


def get_gstin_details_for_gradio(gstin_number_input):
    """Scrape the details for one GSTIN and return them as a DataFrame.

    Args:
        gstin_number_input: GSTIN as entered by the user; it is stripped and
            upper-cased before validation.

    Returns:
        A ``pandas.DataFrame`` with columns ``Field``/``Value`` on success,
        or a single-column ``Error`` DataFrame describing what went wrong.
    """
    gstin_number = str(gstin_number_input).strip().upper()
    if not _GSTIN_PATTERN.fullmatch(gstin_number):
        return _error_frame("Invalid GSTIN format. Must be 15 alphanumeric characters.")

    print(f"Initiating scraper for GSTIN: {gstin_number}")
    driver = driversetup_huggingface()
    if driver is None:
        print("WebDriver (Firefox) not initialized for scraper.")
        return _error_frame("WebDriver (Firefox) initialization failed. Check server logs for GeckoDriver errors.")

    extracted_data = {"GSTIN Queried": gstin_number}

    try:
        print(f"Navigating to URL: {GSTIN_SEARCH_URL}")
        driver.get(GSTIN_SEARCH_URL)
        time.sleep(1.5)  # Slightly longer pause for Firefox initial page load

        gstin_input_css_selector = 'input[placeholder="XXXAAAYYYYZ01Z5"]'
        print(f"Waiting for GSTIN input box: {gstin_input_css_selector}")
        gstin_input = WebDriverWait(driver, WAIT_TIME_SECONDS).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, gstin_input_css_selector))
        )
        print("GSTIN input box visible.")
        gstin_input.clear()
        gstin_input.send_keys(gstin_number)
        print(f"Entered GSTIN: {gstin_number}")
        time.sleep(0.5)

        search_button_css_selector = 'button[aria-label="Search"]'
        print(f"Waiting for Search button: {search_button_css_selector}")
        search_button = WebDriverWait(driver, WAIT_TIME_SECONDS).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, search_button_css_selector))
        )
        print("Search button clickable.")
        driver.execute_script("arguments[0].scrollIntoView(true);", search_button)
        time.sleep(0.5)
        driver.execute_script("arguments[0].click();", search_button)
        print("Clicked Search button using JavaScript.")

        results_table_css_selector = "div.eaKoeQ table tbody tr"
        print(f"Waiting for results table rows: {results_table_css_selector}")
        try:
            WebDriverWait(driver, WAIT_TIME_SECONDS).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, results_table_css_selector))
            )
        except TimeoutException:
            # No result rows appeared: report CAPTCHA / "no details" if the
            # page says so instead of surfacing a bare timeout message.
            msg = _diagnose_missing_results(driver.page_source, gstin_number)
            print(msg)
            return _error_frame(msg)
        print("Results table rows are present.")
        time.sleep(3)

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        table_container_div = soup.select_one("div.eaKoeQ")
        table = table_container_div.find('table') if table_container_div else None
        if not table:
            # Fall back to the first table anywhere on the page.
            table = soup.find('table')
        if not table:
            msg = _diagnose_missing_results(page_source, gstin_number)
            print(msg)
            return _error_frame(msg)

        rows = table.find_all('tr')
        if not rows:
            print("Table found, but no rows (<tr>) parsed from it.")
            return _error_frame("Data table found but no rows could be parsed.")

        raw_data = _parse_key_value_rows(rows)
        if not raw_data:
            print("Could not parse any key-value data from the table rows.")
            return _error_frame("Failed to parse key-value data from table rows.")

        for web_key, display_key in _FIELDS_TO_EXTRACT.items():
            extracted_data[display_key] = raw_data.get(web_key, "Not Found")

        address_to_geocode = extracted_data.get("Principal Business Address")
        if address_to_geocode not in [None, "Not Found", ""]:
            lat, lon = geocode_address_with_fallbacks(address_to_geocode)
            extracted_data["Address Latitude"] = lat if lat is not None else "N/A"
            extracted_data["Address Longitude"] = lon if lon is not None else "N/A"
        else:
            extracted_data["Address Latitude"] = "N/A"
            extracted_data["Address Longitude"] = "N/A"
            print("Principal Place of Business not found or empty, skipping geocoding.")

        print(f"Successfully scraped data for {gstin_number}")
        return pd.DataFrame(list(extracted_data.items()), columns=["Field", "Value"])

    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        print(f"An error occurred during scraping process for {gstin_number}: {e}")
        return _error_frame(f"Scraping process failed: {str(e)}")
    finally:
        # driver is guaranteed non-None here; always release the browser.
        try:
            driver.quit()
            print("Browser closed.")
        except Exception as e_quit:
            print(f"Error quitting driver: {e_quit}")


# --- Gradio Interface ---
iface = gr.Interface(
    fn=get_gstin_details_for_gradio,
    inputs=gr.Textbox(
        label="Enter GSTIN",
        placeholder="Enter 15-character GSTIN (e.g., 27AAFCD5562R1Z5)",
        max_lines=1,
        info="The scraper will fetch details for the provided GSTIN from Masters India.",
    ),
    outputs=gr.DataFrame(
        label="GSTIN Details",
        headers=["Field", "Value"],
        wrap=True,
    ),
    title="🧾 GSTIN Details Scraper & Verifier (Firefox Edition)",
    description="Enter a valid 15-character Indian GSTIN to fetch its registration details and attempt to geocode the principal place of business. Uses Masters India for scraping (with Firefox/GeckoDriver).",
    article=(
        "Powered by Selenium, BeautifulSoup, Geopy, and Gradio.\n\n"
        "Note: Scraping may take 20-45 seconds. Geocoding accuracy may vary."
    ),
    examples=[["27AAFCD5562R1Z5"], ["07AAFCM6072R1Z8"]],
    allow_flagging="never",
    theme=gr.themes.Soft(),
)


if __name__ == '__main__':
    if os.environ.get("SYSTEM") == "spaces":
        iface.launch(debug=False)
    else:
        iface.launch(debug=True, share=False)