lamhieu committed
Commit 8ad7a2a · 1 parent: b0cb394

chore: using redis sync instead of

Files changed (1)
  1. lightweight_embeddings/analytics.py +138 -60
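The change below swaps the `redis.asyncio` client for the synchronous `redis` client and offloads every blocking call to a worker thread via `loop.run_in_executor`. As a minimal sketch of that pattern, independent of this repository (`do_blocking_io` is a hypothetical stand-in for a synchronous Redis command):

import asyncio
import time
from functools import partial


def do_blocking_io(key: str) -> str:
    # Stands in for a blocking socket round trip (e.g., a sync Redis command).
    time.sleep(0.1)
    return f"value-for-{key}"


async def main():
    loop = asyncio.get_running_loop()
    # partial() binds the arguments up front, since run_in_executor
    # only forwards positional args to the callable it is given.
    result = await loop.run_in_executor(None, partial(do_blocking_io, key="example"))
    print(result)


asyncio.run(main())

Running the blocking call on the default thread-pool executor keeps the event loop responsive, which is the same trade the commit makes throughout `analytics.py`.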
diff --git a/lightweight_embeddings/analytics.py b/lightweight_embeddings/analytics.py
--- a/lightweight_embeddings/analytics.py
+++ b/lightweight_embeddings/analytics.py
@@ -1,56 +1,76 @@
 import logging
 import asyncio
-import redis.asyncio as redis
+import redis
 import redis.exceptions
 from datetime import datetime
 from collections import defaultdict
 from typing import Dict
+from functools import partial
 
 logger = logging.getLogger(__name__)
 
+
 class Analytics:
     def __init__(self, redis_url: str, sync_interval: int = 60, max_retries: int = 5):
         """
-        Initializes the Analytics class with an async Redis connection and sync interval.
+        Initializes the Analytics class with a synchronous Redis client,
+        wrapped in asynchronous methods by using run_in_executor.
 
         Parameters:
-        - redis_url: Redis connection URL (e.g., 'redis://localhost:6379/0')
-        - sync_interval: Interval in seconds for syncing with Redis.
-        - max_retries: Maximum number of retries for reconnecting to Redis.
+        - redis_url (str): Redis connection URL (e.g., 'redis://localhost:6379/0').
+        - sync_interval (int): Interval in seconds for syncing with Redis.
+        - max_retries (int): Maximum number of reconnection attempts to Redis.
         """
         self.redis_url = redis_url
         self.sync_interval = sync_interval
         self.max_retries = max_retries
+
+        # Synchronous Redis client
         self.redis_client = self._create_redis_client()
+
+        # Local buffer stores cumulative data for two-way sync
         self.local_buffer = {
-            "access": defaultdict(
-                lambda: defaultdict(int)
-            ),  # {period: {model_id: access_count}}
-            "tokens": defaultdict(
-                lambda: defaultdict(int)
-            ),  # {period: {model_id: tokens_count}}
+            "access": defaultdict(lambda: defaultdict(int)),
+            "tokens": defaultdict(lambda: defaultdict(int)),
         }
-        self.lock = asyncio.Lock()  # Async lock for thread-safe updates
-        asyncio.create_task(self._start_sync_task())
+
+        # Asynchronous lock to protect shared data
+        self.lock = asyncio.Lock()
+
+        # Initialize data from Redis, then start the periodic sync loop
+        asyncio.create_task(self._initialize())
 
         logger.info("Initialized Analytics with Redis connection: %s", redis_url)
 
     def _create_redis_client(self) -> redis.Redis:
         """
-        Creates and returns a new Redis client.
+        Creates and returns a new synchronous Redis client.
         """
         return redis.from_url(
             self.redis_url,
             decode_responses=True,
             health_check_interval=10,
             socket_connect_timeout=5,
-            retry_on_timeout=True,
             socket_keepalive=True,
         )
 
+    async def _initialize(self):
+        """
+        Fetches existing data from Redis into the local buffer,
+        then starts the periodic synchronization task.
+        """
+        try:
+            await self._sync_from_redis()
+            logger.info("Initial sync from Redis to local buffer completed.")
+        except Exception as e:
+            logger.error("Error during initial sync from Redis: %s", e)
+
+        # Launch the periodic sync task
+        asyncio.create_task(self._start_sync_task())
+
     def _get_period_keys(self) -> tuple:
         """
-        Returns keys for day, week, month, and year based on the current date.
+        Returns day, week, month, and year keys based on the current UTC date.
         """
         now = datetime.utcnow()
         day_key = now.strftime("%Y-%m-%d")
@@ -61,23 +81,23 @@ class Analytics:
 
     async def access(self, model_id: str, tokens: int):
         """
-        Records an access and token usage for a specific model_id.
+        Records an access event and token usage for a specific model.
 
         Parameters:
-        - model_id: The ID of the model being accessed.
-        - tokens: Number of tokens used in this access.
+        - model_id (str): The ID of the accessed model.
+        - tokens (int): Number of tokens used in this access event.
         """
         day_key, week_key, month_key, year_key = self._get_period_keys()
 
         async with self.lock:
-            # Increment access count
+            # Access counts
             self.local_buffer["access"][day_key][model_id] += 1
             self.local_buffer["access"][week_key][model_id] += 1
             self.local_buffer["access"][month_key][model_id] += 1
             self.local_buffer["access"][year_key][model_id] += 1
             self.local_buffer["access"]["total"][model_id] += 1
 
-            # Increment token count
+            # Token usage
             self.local_buffer["tokens"][day_key][model_id] += tokens
             self.local_buffer["tokens"][week_key][model_id] += tokens
             self.local_buffer["tokens"][month_key][model_id] += tokens
@@ -86,10 +106,7 @@ class Analytics:
 
     async def stats(self) -> Dict[str, Dict[str, Dict[str, int]]]:
         """
-        Returns statistics for all models from the local buffer.
-
-        Returns:
-        - A dictionary with access counts and token usage for each period.
+        Returns a copy of current statistics from the local buffer.
         """
         async with self.lock:
             return {
@@ -103,31 +120,87 @@ class Analytics:
             },
         }
 
+    async def _sync_from_redis(self):
+        """
+        Pulls existing analytics data from Redis into the local buffer.
+        Uses run_in_executor to avoid blocking the event loop.
+        """
+        loop = asyncio.get_running_loop()
+
+        async with self.lock:
+            # Scan 'access' keys
+            cursor = 0
+            while True:
+                cursor, keys = await loop.run_in_executor(
+                    None,
+                    partial(
+                        self.redis_client.scan,
+                        cursor=cursor,
+                        match="analytics:access:*",
+                        count=100,
+                    ),
+                )
+                for key in keys:
+                    # key is "analytics:access:<period>"
+                    period = key.replace("analytics:access:", "")
+                    data = await loop.run_in_executor(
+                        None, partial(self.redis_client.hgetall, key)
+                    )
+                    for model_id, count_str in data.items():
+                        self.local_buffer["access"][period][model_id] += int(count_str)
+                if cursor == 0:
+                    break
+
+            # Scan 'tokens' keys
+            cursor = 0
+            while True:
+                cursor, keys = await loop.run_in_executor(
+                    None,
+                    partial(
+                        self.redis_client.scan,
+                        cursor=cursor,
+                        match="analytics:tokens:*",
+                        count=100,
+                    ),
+                )
+                for key in keys:
+                    # key is "analytics:tokens:<period>"
+                    period = key.replace("analytics:tokens:", "")
+                    data = await loop.run_in_executor(
+                        None, partial(self.redis_client.hgetall, key)
+                    )
+                    for model_id, count_str in data.items():
+                        self.local_buffer["tokens"][period][model_id] += int(count_str)
+                if cursor == 0:
+                    break
+
     async def _sync_to_redis(self):
         """
-        Synchronizes local buffer data with Redis.
+        Pushes the local buffer data to Redis (local -> Redis).
+        Uses a pipeline to minimize round trips and run_in_executor to avoid blocking.
         """
+        loop = asyncio.get_running_loop()
+
         async with self.lock:
             try:
-                pipeline = self.redis_client.pipeline()
+                pipeline = self.redis_client.pipeline(transaction=False)
 
-                # Sync access counts
+                # Push 'access' data
                 for period, models in self.local_buffer["access"].items():
+                    redis_key = f"analytics:access:{period}"
                     for model_id, count in models.items():
-                        redis_key = f"analytics:access:{period}"
                         pipeline.hincrby(redis_key, model_id, count)
 
-                # Sync token counts
+                # Push 'tokens' data
                 for period, models in self.local_buffer["tokens"].items():
+                    redis_key = f"analytics:tokens:{period}"
                     for model_id, count in models.items():
-                        redis_key = f"analytics:tokens:{period}"
                         pipeline.hincrby(redis_key, model_id, count)
 
-                pipeline.execute()
-                self.local_buffer["access"].clear()  # Clear access buffer after sync
-                self.local_buffer["tokens"].clear()  # Clear tokens buffer after sync
-                logger.info("Synced analytics data to Redis.")
+                # Execute the pipeline in a separate thread
+                await loop.run_in_executor(None, pipeline.execute)
 
+                logger.info("Analytics data successfully synced to Redis.")
             except redis.exceptions.ConnectionError as e:
                 logger.error("Redis connection error during sync: %s", e)
                 raise e
@@ -137,64 +210,69 @@ class Analytics:
 
     async def _start_sync_task(self):
         """
-        Starts a background task that periodically syncs data to Redis.
-        Implements retry logic with exponential backoff on connection failures.
+        Periodically runs _sync_to_redis at a configurable interval.
+        Also handles reconnections on ConnectionError.
         """
-        retry_delay = 1  # Initial retry delay in seconds
-
         while True:
             await asyncio.sleep(self.sync_interval)
             try:
                 await self._sync_to_redis()
-                retry_delay = 1  # Reset retry delay after successful sync
             except redis.exceptions.ConnectionError as e:
-                logger.error("Redis connection error: %s", e)
+                logger.error("Redis connection error during scheduled sync: %s", e)
                 await self._handle_redis_reconnection()
             except Exception as e:
-                logger.error("Error during sync: %s", e)
-                # Depending on the error, you might want to handle differently
+                logger.error("Error during scheduled sync: %s", e)
+                # Handle other errors as appropriate
 
     async def _handle_redis_reconnection(self):
         """
-        Handles Redis reconnection with exponential backoff.
+        Attempts to reconnect to Redis using exponential backoff.
        """
+        loop = asyncio.get_running_loop()
         retry_count = 0
-        delay = 1  # Start with 1 second delay
+        delay = 1
 
         while retry_count < self.max_retries:
             try:
-                logger.info("Attempting to reconnect to Redis (Attempt %d)...", retry_count + 1)
-                self.redis_client.close()
+                logger.info(
+                    "Attempting to reconnect to Redis (attempt %d)...", retry_count + 1
+                )
+                # Close existing connection
+                await loop.run_in_executor(None, self.redis_client.close)
+                # Create a new client
                 self.redis_client = self._create_redis_client()
-                # Optionally, perform a simple command to check connection
-                self.redis_client.ping()
+                # Test the new connection
+                await loop.run_in_executor(None, self.redis_client.ping)
                 logger.info("Successfully reconnected to Redis.")
                 return
             except redis.exceptions.ConnectionError as e:
                 logger.error("Reconnection attempt %d failed: %s", retry_count + 1, e)
                 retry_count += 1
                 await asyncio.sleep(delay)
-                delay *= 2  # Exponential backoff
+                delay *= 2  # exponential backoff
 
-        logger.critical("Max reconnection attempts reached. Unable to reconnect to Redis.")
-        # Depending on your application's requirements, you might choose to exit or keep retrying indefinitely
-        # For example, to keep retrying:
+        logger.critical(
+            "Max reconnection attempts reached. Unable to reconnect to Redis."
+        )
+
+        # Optional: Keep retrying indefinitely instead of giving up.
         while True:
             try:
                 logger.info("Retrying to reconnect to Redis...")
-                self.redis_client.close()
+                await loop.run_in_executor(None, self.redis_client.close)
                 self.redis_client = self._create_redis_client()
-                self.redis_client.ping()
-                logger.info("Successfully reconnected to Redis.")
+                await loop.run_in_executor(None, self.redis_client.ping)
+                logger.info("Reconnected to Redis after extended retries.")
                 break
             except redis.exceptions.ConnectionError as e:
-                logger.error("Reconnection attempt failed: %s", e)
+                logger.error("Extended reconnection attempt failed: %s", e)
                 await asyncio.sleep(delay)
-                delay = min(delay * 2, 60)  # Cap the delay to 60 seconds
+                delay = min(delay * 2, 60)  # Cap at 60 seconds or choose your own max
 
     async def close(self):
         """
-        Closes the Redis connection gracefully.
+        Closes the Redis client connection. Still wrapped in an async method to avoid blocking.
         """
-        self.redis_client.close()
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, self.redis_client.close)
         logger.info("Closed Redis connection.")