# myspace134v / modules / server_monitor.py
# rdune71's picture
# Update with enhanced AI Research Assistant - streaming output, 8192 tokens, improved UI
# 001a1f0
import logging
import os
import time
from datetime import datetime, timedelta

import redis
class ServerMonitor:
    """Record request failures/successes in Redis and estimate availability.

    Failures are counted in hourly buckets named
    ``server_failures:<YYYY-MM-DD:HH>``; the timestamps of the most recent
    failure and success are stored under ``last_failure`` and
    ``last_success`` as ISO-8601 strings.

    Monitoring is strictly best-effort: if Redis cannot be reached, the
    instance marks itself as disconnected, the ``report_*`` methods become
    no-ops and the query methods return neutral answers instead of raising.

    Timestamps are naive local times (``datetime.now()``), so all processes
    sharing the same Redis keys are assumed to run with the same local clock.
    """

    # Key names are part of the stored data format; keep them stable.
    _FAILURE_KEY_PREFIX = "server_failures:"
    _LAST_FAILURE_KEY = "last_failure"
    _LAST_SUCCESS_KEY = "last_success"

    # Last-event markers live for 24 hours.
    _EVENT_TTL_SECONDS = 86400
    # Hourly buckets must outlive the 24-hour window summed by
    # get_system_stats(); one extra hour covers the partially elapsed bucket.
    _FAILURE_BUCKET_TTL_SECONDS = 25 * 3600

    _logger = logging.getLogger(__name__)

    def __init__(self):
        """Connect to Redis using the ``REDIS_*`` environment variables.

        Reads ``REDIS_HOST`` (default ``localhost``), ``REDIS_PORT`` (default
        6379), ``REDIS_USERNAME`` and ``REDIS_PASSWORD``. Any error while
        creating the client or pinging the server disables monitoring
        (``connected`` is False and ``redis_client`` is None).
        """
        try:
            self.redis_client = redis.Redis(
                host=os.getenv("REDIS_HOST", "localhost"),
                port=int(os.getenv("REDIS_PORT", 6379)),
                username=os.getenv("REDIS_USERNAME"),
                password=os.getenv("REDIS_PASSWORD"),
                decode_responses=True,
            )
            # Fail fast here so later calls can rely on `connected`.
            self.redis_client.ping()
            self.connected = True
        except Exception as exc:
            self._logger.warning(
                "Redis unavailable, server monitoring disabled: %s", exc
            )
            self.redis_client = None
            self.connected = False

    @classmethod
    def _failure_key(cls, moment):
        """Return the hourly failure-counter key that contains *moment*."""
        return f"{cls._FAILURE_KEY_PREFIX}{moment.strftime('%Y-%m-%d:%H')}"

    def _count_failures(self, now, hours):
        """Sum the failure counters of the *hours* most recent hourly buckets.

        The bucket containing *now* counts as the first one. All buckets are
        fetched with a single MGET instead of one GET per bucket.
        """
        keys = [
            self._failure_key(now - timedelta(hours=offset))
            for offset in range(hours)
        ]
        counts = self.redis_client.mget(keys)
        return sum(int(count) for count in counts if count)

    def report_failure(self):
        """Report a server failure (e.g., 503 error).

        Increments the current hour's failure counter and stores the current
        time as ``last_failure``. Redis errors are logged and swallowed so
        monitoring can never break the main app.
        """
        if not self.connected:
            return
        try:
            now = datetime.now()
            key = self._failure_key(now)
            self.redis_client.incr(key)
            self.redis_client.expire(key, self._FAILURE_BUCKET_TTL_SECONDS)
            # SET with `ex` writes value and TTL in one atomic command.
            self.redis_client.set(
                self._LAST_FAILURE_KEY,
                now.isoformat(),
                ex=self._EVENT_TTL_SECONDS,
            )
        except Exception:
            self._logger.debug("Could not record server failure", exc_info=True)

    def report_success(self):
        """Report a successful request.

        Deletes the current hour's failure counter and stores the current
        time as ``last_success``. Redis errors are logged and swallowed so
        monitoring can never break the main app.
        """
        if not self.connected:
            return
        try:
            now = datetime.now()
            # A success resets the failure count of the current hour only.
            self.redis_client.delete(self._failure_key(now))
            self.redis_client.set(
                self._LAST_SUCCESS_KEY,
                now.isoformat(),
                ex=self._EVENT_TTL_SECONDS,
            )
        except Exception:
            self._logger.debug("Could not record server success", exc_info=True)

    def check_server_status(self):
        """Check if the server is likely available based on recent activity.

        Returns:
            A dict with the keys ``available`` (bool), ``message`` (str) and
            ``estimated_wait`` (int, minutes). When Redis is not connected or
            the check itself fails, the server is assumed to be available.
        """
        if not self.connected:
            return {
                "available": True,
                "message": "Redis not configured, assuming server available",
                "estimated_wait": 0,
            }
        try:
            now = datetime.now()
            # Current and previous hourly bucket.
            recent_failures = self._count_failures(now, 2)

            last_failure_str = self.redis_client.get(self._LAST_FAILURE_KEY)
            last_success_str = self.redis_client.get(self._LAST_SUCCESS_KEY)
            last_failure = (
                datetime.fromisoformat(last_failure_str)
                if last_failure_str
                else None
            )
            last_success = (
                datetime.fromisoformat(last_success_str)
                if last_success_str
                else None
            )

            # Many recent failures: only a success within the last 15 minutes
            # counts as evidence that the server is back.
            if recent_failures > 3:
                if last_success is not None:
                    minutes_since_success = (now - last_success).total_seconds() / 60
                    if minutes_since_success < 15:
                        return {
                            "available": True,
                            "message": "Recent success detected, server likely available",
                            "estimated_wait": 0,
                        }
                # Estimate wait time based on typical warmup.
                return {
                    "available": False,
                    "message": f"High failure rate detected ({recent_failures} failures recently)",
                    "estimated_wait": 5,
                }

            # A failure in the last 5 minutes suggests waiting, unless a
            # success was recorded after it (the server has since recovered).
            recovered = last_success is not None and (
                last_failure is None or last_success > last_failure
            )
            if last_failure is not None and not recovered:
                minutes_since_failure = (now - last_failure).total_seconds() / 60
                if minutes_since_failure < 5:
                    return {
                        "available": False,
                        "message": f"Recent failure {int(minutes_since_failure)} minutes ago",
                        "estimated_wait": max(1, 5 - int(minutes_since_failure)),
                    }

            return {
                "available": True,
                "message": "Server appears to be available",
                "estimated_wait": 0,
            }
        except Exception as e:
            # On any Redis or parsing error, assume the server is available.
            return {
                "available": True,
                "message": f"Monitoring check failed: {str(e)}, assuming server available",
                "estimated_wait": 0,
            }

    def get_system_stats(self):
        """Get detailed system statistics.

        Returns:
            ``{"error": ...}`` when Redis is not connected or the lookup
            fails; otherwise a dict with ``failures_last_24h`` (int),
            ``last_failure`` / ``last_success`` (ISO string or
            ``"None recorded"``) and ``status`` (``"Operational"``,
            ``"Degraded"``, ``"Issues Detected"`` or ``"Unknown"``).
        """
        if not self.connected:
            return {"error": "Redis not configured"}
        try:
            stats = {}
            stats["failures_last_24h"] = self._count_failures(datetime.now(), 24)

            last_failure = self.redis_client.get(self._LAST_FAILURE_KEY)
            last_success = self.redis_client.get(self._LAST_SUCCESS_KEY)
            stats["last_failure"] = last_failure if last_failure else "None recorded"
            stats["last_success"] = last_success if last_success else "None recorded"

            # Derive a coarse status label from whichever event is newer.
            if last_failure and last_success:
                failure_time = datetime.fromisoformat(last_failure)
                success_time = datetime.fromisoformat(last_success)
                if success_time > failure_time:
                    stats["status"] = "Operational"
                else:
                    stats["status"] = "Degraded"
            elif last_success:
                stats["status"] = "Operational"
            elif last_failure:
                stats["status"] = "Issues Detected"
            else:
                stats["status"] = "Unknown"
            return stats
        except Exception as e:
            return {"error": str(e)}