File size: 22,287 Bytes
877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 ebb3d5e 877e000 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 |
# models/location_analysis.py
from .model_loader import load_model
from geopy.geocoders import Nominatim
from .logging_config import logger
import re
import time
from typing import Dict, Any
from geopy.distance import geodesic
# Module-level Nominatim geocoder shared by the verify_* helpers in this file
# that call ``geocoder.geocode(...)``. The instance is created once at import
# time with a fixed user agent and a default timeout of 10.
# NOTE(review): presumably this targets the public OpenStreetMap Nominatim
# endpoint, which is rate limited - confirm before raising request volume.
geocoder = Nominatim(user_agent="indian_property_verifier", timeout=10)
def validate_address_format(address: str) -> bool:
    """Return True when *address* looks like a plausible free-text address.

    The check is deliberately lenient. An address is accepted when it:

    * is a non-empty string of at least 5 characters after trimming,
    * contains at least one letter, and
    * either mentions a common locality keyword (road, colony, sector, ...)
      or is at least 8 characters long after trimming.

    Args:
        address: Raw address text as entered by the user.

    Returns:
        True if the address passes the lenient format check, else False.
        Non-string or empty input yields False instead of raising.
    """
    if not address or not isinstance(address, str):
        return False

    trimmed = address.strip()
    if len(trimmed) < 5:
        return False

    # An address must contain real letters; digits, punctuation and
    # whitespace alone (e.g. "12 34567") are not an address.
    if not any(ch.isalpha() for ch in trimmed):
        return False

    # Union of the administrative/street terms and the building/area terms.
    locality_terms = (
        'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
        'street', 'road', 'avenue', 'lane', 'colony', 'society', 'area', 'near',
        'apartment', 'flat', 'house', 'plot', 'block', 'sector',
        'locality', 'main', 'cross', 'circle', 'square', 'market',
    )
    lowered = trimmed.lower()
    if any(term in lowered for term in locality_terms):
        return True

    # No keyword found: fall back to a simple length heuristic.
    return len(trimmed) >= 8
def validate_postal_code(postal_code: str) -> bool:
    """Return True when *postal_code* is made of 5 or 6 digits.

    Surrounding whitespace and embedded spaces are ignored, so ``"560 001"``
    is accepted. The value is converted with ``str()`` first, so integer
    input works as well. Any leading digit (0-9) is allowed.

    Args:
        postal_code: Postal code as text (or anything ``str()`` can render).

    Returns:
        True if the normalised value is 5-6 digits long, else False.
    """
    if not postal_code:
        return False

    digits = str(postal_code).strip().replace(' ', '')
    # A separate "first digit in 0-9" test is redundant: the pattern already
    # guarantees every character is a digit.
    return re.fullmatch(r'\d{5,6}', digits) is not None
def validate_coordinates(latitude: str, longitude: str) -> bool:
    """Return True when the coordinates parse and fall inside the India box.

    Both values are converted via ``float(str(value).strip())``. The accepted
    bounding box is intentionally generous: latitude 5.0-40.0 and longitude
    65.0-100.0, both inclusive.

    Args:
        latitude: Latitude in decimal degrees (text or number).
        longitude: Longitude in decimal degrees (text or number).

    Returns:
        True if both values are numeric and within the box, else False.
        Unparseable input (including empty strings and None) yields False.
    """
    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    lat_min, lat_max = 5.0, 40.0
    lng_min, lng_max = 65.0, 100.0

    # No extra "precision" test is needed: rounding to 4 decimals moves a
    # value by at most 0.00005, so a 0.0001 tolerance can never be exceeded.
    # NaN fails both comparisons and is therefore rejected here as well.
    return lat_min <= lat <= lat_max and lng_min <= lng <= lng_max
def verify_location_in_city(address: str, city: str) -> bool:
    """Check, via geocoding, whether *address* plausibly lies in *city*.

    Two strategies are tried in order:

    1. Geocode several query variants built from the address components and
       accept when the result mentions the city and at least two "key"
       address components (ward/zone/mandal/municipal/corporation/greater)
       appear in the geocoded address.
    2. Otherwise geocode the city and the address separately and accept when
       they are within a tier-based distance (50 km metro, 30 km tier 2,
       20 km other).

    Args:
        address: Comma-separated free-text address.
        city: City name the address is claimed to be in.

    Returns:
        True if either strategy confirms the address, else False. Geocoder
        errors are logged and treated as "not verified".
    """
    if not address or not city:
        return False

    try:
        address = address.strip()
        city = city.strip()
        city_lower = city.lower()
        address_components = [comp.strip() for comp in address.split(',')]

        def component_containing(keyword: str) -> str:
            """Return the first address component mentioning *keyword*, or ''."""
            return next(
                (comp for comp in address_components if keyword in comp.lower()),
                '',
            )

        requests_sent = 0

        def throttled_geocode(query: str):
            """Geocode *query*, pausing 1s between requests made by this call."""
            nonlocal requests_sent
            if requests_sent:
                time.sleep(1)  # Rate limiting
            requests_sent += 1
            return geocoder.geocode(query, timeout=10)

        # Query variants, in the order they are attempted.
        queries = [
            f"{address}, India",
            f"{city}, {address_components[0]}, India",
        ]
        # Addresses without a comma have no second component; indexing it
        # unconditionally used to raise IndexError and abort verification.
        if len(address_components) > 1:
            queries.append(f"{city}, {address_components[1]}, India")
        queries.extend([
            f"{city}, {component_containing('municipal corporation')}, India",
            f"{city}, {component_containing('mandal')}, India",
            f"{address_components[0]}, {city}, India",
            f"{component_containing('zone')}, {city}, India",
        ])
        # Missing keyword components collapse several variants into the same
        # string; send each distinct query only once (order preserved).
        queries = list(dict.fromkeys(queries))

        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-'),
            f"{city_lower} city",
            f"{city_lower} district",
            f"{city_lower} municipal corporation",
            f"greater {city_lower}",
            f"greater {city_lower} municipal corporation",
        ]
        key_terms = ('ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater')
        key_components = [
            comp.lower() for comp in address_components
            if any(term in comp.lower() for term in key_terms)
        ]

        # Strategy 1 needs two matching key components, so it cannot succeed
        # (and is not worth any network requests) with fewer than two.
        if len(key_components) >= 2:
            for query in queries:
                try:
                    location = throttled_geocode(query)
                    if not location:
                        continue
                    location_address = location.address.lower()
                    if not any(var in location_address for var in city_variations):
                        continue
                    location_components = [
                        part.strip() for part in location_address.split(',')
                    ]
                    matching_components = sum(
                        1 for comp in key_components
                        if any(comp in loc_comp for loc_comp in location_components)
                    )
                    if matching_components >= 2:
                        return True
                except Exception as e:
                    logger.debug(f"Error in address verification: {str(e)}")

        # Strategy 2: compare the geocoded address position with the city's.
        try:
            city_location = throttled_geocode(f"{city}, India")
            if city_location:
                address_location = throttled_geocode(f"{address}, {city}, India")
                if address_location:
                    city_coords = (city_location.latitude, city_location.longitude)
                    address_coords = (address_location.latitude, address_location.longitude)
                    distance = geodesic(city_coords, address_coords).kilometers

                    metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
                    tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                                    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                                    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
                    if any(name in city_lower for name in metro_cities):
                        max_distance = 50  # 50km for metro cities
                    elif any(name in city_lower for name in tier2_cities):
                        max_distance = 30  # 30km for tier 2 cities
                    else:
                        max_distance = 20  # 20km for other cities
                    return distance <= max_distance
        except Exception as e:
            logger.debug(f"Error in reverse geocoding: {str(e)}")

        return False
    except Exception as e:
        logger.error(f"Error in location verification: {str(e)}")
        return False
def verify_city_in_state(city: str, state: str) -> bool:
    """Check, via geocoding, whether *city* is located in *state*.

    Up to three query variants are geocoded. The city is accepted as soon as
    one result's address contains both the city name and the state name
    (each also tried with spaces removed or replaced by hyphens).

    Args:
        city: City name.
        state: State or union-territory name.

    Returns:
        True if a geocoding result mentions both names, else False.
        Geocoder errors are logged at debug level and treated as no match.
    """
    if not city or not state:
        return False

    try:
        city_lower = city.lower()
        state_lower = state.lower()
        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-'),
        ]
        state_variations = [
            state_lower,
            state_lower.replace(' ', ''),
            state_lower.replace(' ', '-'),
        ]
        queries = [
            f"{city}, {state}, India",
            f"{state}, {city}, India",
            f"{city}, {state}",
        ]

        for attempt, query in enumerate(queries):
            if attempt:
                # Pause between requests, including after a failed one.
                time.sleep(1)
            try:
                location = geocoder.geocode(query, timeout=10)
                if not location:
                    continue
                location_address = location.address.lower()
                if any(var in location_address for var in city_variations) and \
                        any(var in location_address for var in state_variations):
                    return True
            except Exception as e:
                # Best-effort lookup: record the failure and try the next form.
                logger.debug(f"Error verifying city in state: {str(e)}")
        return False
    except Exception as e:
        logger.debug(f"Error verifying city in state: {str(e)}")
        return False
def verify_state_in_country(state: str, country: str = "India") -> bool:
    """Return True when *state* is an Indian state or union territory.

    The comparison is case-insensitive and tolerant of leading, trailing and
    repeated internal whitespace. Only India is supported: the lookup table
    contains Indian states and union territories, so any other *country*
    yields False.

    Args:
        state: State or union-territory name.
        country: Country the state should belong to. Defaults to "India";
            "IN", "IND" and "Bharat" are accepted as aliases. An empty value
            is treated as India.

    Returns:
        True if the state is in the list and the country is India.
    """
    if not state or not isinstance(state, str):
        return False

    if country:
        country_key = " ".join(str(country).split()).lower()
        # The table below only knows India; it cannot vouch for other countries.
        if country_key not in {"india", "in", "ind", "bharat"}:
            return False

    # Valid Indian states and union territories.
    valid_states = {
        'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
        'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
        'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram',
        'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu',
        'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'west bengal',
        'andaman and nicobar islands', 'chandigarh', 'dadra and nagar haveli and daman and diu',
        'delhi', 'jammu and kashmir', 'ladakh', 'lakshadweep', 'puducherry',
    }

    # Collapse whitespace so " Tamil  Nadu " matches 'tamil nadu'.
    state_key = " ".join(state.split()).lower()
    return state_key in valid_states
def verify_postal_code_in_city(postal_code: str, city: str) -> bool:
    """Check, via geocoding, whether *postal_code* belongs to *city*.

    Up to three query variants combining the postal code and the city are
    geocoded. The code is accepted as soon as one result's address contains
    the city name (also tried with spaces removed or replaced by hyphens).

    Args:
        postal_code: Postal code to look up.
        city: City the postal code is claimed to belong to.

    Returns:
        True if a geocoding result mentions the city, else False.
        Geocoder errors are logged at debug level and treated as no match.
    """
    if not postal_code or not city:
        return False

    try:
        city_lower = city.lower()
        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-'),
        ]
        queries = [
            f"{postal_code}, {city}, India",
            f"{city}, {postal_code}, India",
            f"{postal_code}, {city}",
        ]

        for attempt, query in enumerate(queries):
            if attempt:
                # Pause between requests, including after a failed one.
                time.sleep(1)
            try:
                location = geocoder.geocode(query, timeout=10)
                if not location:
                    continue
                location_address = location.address.lower()
                if any(var in location_address for var in city_variations):
                    return True
            except Exception as e:
                # Best-effort lookup: record the failure and try the next form.
                logger.debug(f"Error verifying postal code in city: {str(e)}")
        return False
    except Exception as e:
        logger.debug(f"Error verifying postal code in city: {str(e)}")
        return False
def verify_coordinates_in_city(latitude: str, longitude: str, city: str) -> bool:
    """Check whether the coordinates lie close enough to *city*.

    The city is geocoded and the geodesic distance to the given point is
    compared against a tier-based limit: 50 km for metro cities, 30 km for
    tier-2 cities and 20 km for everything else.

    Args:
        latitude: Latitude in decimal degrees (text or number).
        longitude: Longitude in decimal degrees (text or number).
        city: City name to compare against.

    Returns:
        True if the point is within the allowed distance of the geocoded
        city, else False. Missing or non-numeric input, an unknown city and
        geocoder errors all yield False.
    """
    if not all([latitude, longitude, city]):
        return False

    # Parse first so malformed input never triggers a network request.
    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    try:
        city_location = geocoder.geocode(f"{city}, India", timeout=10)
        if not city_location:
            return False

        city_coords = (city_location.latitude, city_location.longitude)
        property_coords = (lat, lng)
        distance = geodesic(city_coords, property_coords).kilometers

        city_lower = city.lower()
        metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
        tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                        "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                        "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]

        if any(name in city_lower for name in metro_cities):
            max_distance = 50  # 50km for metro cities
        elif any(name in city_lower for name in tier2_cities):
            max_distance = 30  # 30km for tier 2 cities
        else:
            max_distance = 20  # 20km for other cities

        return distance <= max_distance
    except Exception as e:
        # Best-effort check: a geocoder/distance failure means "not verified".
        logger.debug(f"Error verifying coordinates in city: {str(e)}")
        return False
def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
    """Run every location check on *data* and summarise the outcome.

    Args:
        data: Property record. The keys read are 'address', 'city', 'state',
            'zip', 'latitude', 'longitude' and 'nearby_landmarks'. Missing
            keys are filled in with '' on the passed dict itself. A non-dict
            argument is logged and treated as an empty record.
            'nearby_landmarks' may be a comma-separated string or a
            list/tuple of names.

    Returns:
        A dict containing the eight boolean verification flags plus
        'assessment', 'completeness_score', 'location_quality', 'city_tier',
        'landmarks_analysis', 'verification_status' and 'formatted_address'.
        If anything raises, a fixed "error" result with all flags False and
        a completeness score of 30 is returned instead.
    """
    try:
        # Defensive: ensure data is a dict
        if not isinstance(data, dict):
            logger.warning(f"Input to analyze_location is not a dict: {type(data)}")
            data = {}

        # Defensive: ensure all expected keys exist
        for key in ['address', 'city', 'state', 'zip', 'latitude', 'longitude', 'nearby_landmarks']:
            data.setdefault(key, '')

        address = data.get('address', '')
        city = data.get('city', '')
        state = data.get('state', '')
        postal_code = data.get('zip', '')
        latitude = data.get('latitude', '')
        longitude = data.get('longitude', '')

        verification_results = {
            'address_format_valid': validate_address_format(address),
            'address_in_city': verify_location_in_city(address, city),
            'city_in_state': verify_city_in_state(city, state),
            'state_in_country': verify_state_in_country(state),
            'postal_code_valid': validate_postal_code(postal_code),
            'postal_code_in_city': verify_postal_code_in_city(postal_code, city),
            'coordinates_valid': validate_coordinates(latitude, longitude),
            'coordinates_in_city': verify_coordinates_in_city(latitude, longitude, city),
        }

        # Weighted completeness score (weights sum to 1.0 -> score is 0-100).
        weights = {
            'address_format_valid': 0.10,
            'address_in_city': 0.15,
            'city_in_state': 0.15,
            'state_in_country': 0.15,
            'postal_code_valid': 0.15,
            'postal_code_in_city': 0.10,
            'coordinates_valid': 0.10,
            'coordinates_in_city': 0.10,
        }
        completeness_score = sum(
            weights[key] * 100 if result else 0
            for key, result in verification_results.items()
        )

        # Verified = both critical checks pass and at least one secondary check passes.
        critical_checks = ['city_in_state', 'state_in_country']
        secondary_checks = ['address_format_valid', 'address_in_city', 'postal_code_valid',
                            'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city']
        critical_passed = all(verification_results[check] for check in critical_checks)
        secondary_passed = sum(1 for check in secondary_checks if verification_results[check])
        location_quality = "verified" if critical_passed and secondary_passed >= 1 else "unverified"

        # Normalise landmarks: accept a comma-separated string or a list/tuple,
        # and drop blank entries so a trailing comma does not inflate the count.
        raw_landmarks = data.get('nearby_landmarks')
        if isinstance(raw_landmarks, (list, tuple)):
            landmark_items = [str(item) for item in raw_landmarks if item]
        elif raw_landmarks:
            landmark_items = str(raw_landmarks).split(',')
        else:
            landmark_items = []
        landmarks = [item.strip().lower() for item in landmark_items if item.strip()]

        landmark_types = {
            'transport': ['station', 'metro', 'bus', 'railway', 'airport', 'terminal', 'depot', 'stand', 'stop'],
            'education': ['school', 'college', 'university', 'institute', 'academy', 'campus', 'library'],
            'healthcare': ['hospital', 'clinic', 'medical', 'health', 'diagnostic', 'pharmacy', 'dispensary'],
            'shopping': ['mall', 'market', 'shop', 'store', 'bazaar', 'complex', 'plaza', 'retail', 'outlet'],
            'entertainment': ['park', 'garden', 'theater', 'cinema', 'stadium', 'auditorium', 'playground'],
            'business': ['office', 'business', 'corporate', 'commercial', 'industrial', 'tech park', 'hub'],
        }
        # Collect each category once, in order of first appearance.
        found_types = []
        for landmark in landmarks:
            for type_name, keywords in landmark_types.items():
                if type_name not in found_types and any(keyword in landmark for keyword in keywords):
                    found_types.append(type_name)

        landmarks_analysis = {
            'provided': bool(landmarks),
            'count': len(landmarks),
            'types': found_types,
        }

        # Determine city tier
        city_tier = "unknown"
        if data.get('city'):
            city_lower = data['city'].lower()
            metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
            tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                            "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                            "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
            if any(name in city_lower for name in metro_cities):
                city_tier = "metro"
            elif any(name in city_lower for name in tier2_cities):
                city_tier = "tier2"
            else:
                city_tier = "tier3"

        if completeness_score >= 60:
            assessment = "complete"
        elif completeness_score >= 30:
            assessment = "partial"
        else:
            assessment = "minimal"

        # Ensure minimum score for valid data.
        # NOTE(review): this floor is applied after `assessment` is chosen, so
        # a record can report score 40 together with assessment "minimal" -
        # confirm whether that ordering is intended.
        if completeness_score == 0 and (data.get('city') or data.get('state')):
            completeness_score = 40  # Minimum 40% for having city/state

        return {
            **verification_results,
            'assessment': assessment,
            'completeness_score': completeness_score,
            'location_quality': location_quality,
            'city_tier': city_tier,
            'landmarks_analysis': landmarks_analysis,
            'verification_status': "verified" if location_quality == "verified" else "unverified",
            'formatted_address': f"{data.get('address', '')}, {data.get('city', '')}, {data.get('state', '')}, India - {data.get('zip', '')}"
        }
    except Exception as e:
        logger.error(f"Error analyzing location: {str(e)}")
        return {
            'assessment': 'error',
            'completeness_score': 30,
            'location_quality': 'error',
            'city_tier': 'unknown',
            'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
            'verification_status': 'error',
            'formatted_address': '',
            'address_format_valid': False,
            'address_in_city': False,
            'city_in_state': False,
            'state_in_country': False,
            'postal_code_valid': False,
            'postal_code_in_city': False,
            'coordinates_valid': False,
            'coordinates_in_city': False
        }
def calculate_location_completeness(data: Dict[str, Any]) -> int:
    """Return how complete a location record is, as a percentage (0-100).

    Each field that is present with a truthy value contributes a fixed
    number of percentage points; missing fields contribute nothing.

    Args:
        data: Record with any of the keys 'address', 'city', 'state',
            'country', 'zip', 'latitude', 'longitude', 'nearby_landmarks'.
            A non-dict argument is scored as 0.

    Returns:
        Integer percentage of the weighted fields that are filled in.
    """
    # Weights are whole percentage points. Summing integers avoids float
    # accumulation errors that int() would then truncate (e.g. 0.7999... -> 79).
    weights = {
        'address': 25,
        'city': 20,
        'state': 15,
        'country': 5,
        'zip': 10,
        'latitude': 10,
        'longitude': 10,
        'nearby_landmarks': 5,
    }

    if not isinstance(data, dict):
        return 0

    # .get() so an absent key counts as "not provided" instead of raising.
    return sum(weight for field, weight in weights.items() if data.get(field))
|