# Persistent Storage Setup for Hugging Face Spaces

This guide explains how to set up and use persistent storage in Hugging Face Spaces for your LMM-Vibes application.

## Overview

Hugging Face Spaces provides persistent storage at the `/data` directory that persists across app restarts and deployments. This storage is perfect for:

- Caching models and datasets
- Storing user uploads and results
- Maintaining application state
- Saving experiment results

## Quick Start

### 1. Automatic Setup (Already Implemented)

Your application automatically detects and configures persistent storage when running in Hugging Face Spaces:

```python
# This is already handled in app.py
if is_persistent_storage_available():
    # Configure HF cache to persistent storage
    hf_home = get_hf_home_dir()
    os.environ.setdefault("HF_HOME", str(hf_home))

    # Set cache directories
    cache_dir = get_cache_dir()
    os.environ.setdefault("TRANSFORMERS_CACHE", str(cache_dir / "transformers"))
    os.environ.setdefault("HF_DATASETS_CACHE", str(cache_dir / "datasets"))
```

### 2. Storage Structure

When persistent storage is available, your data is organized as follows:

```
/data/
├── app_data/          # Main application data
│   ├── experiments/   # Pipeline results and experiments
│   ├── dataframes/    # Saved pandas DataFrames
│   ├── logs/          # Application logs
│   └── uploads/       # User uploaded files
├── .cache/            # Application cache
│   ├── transformers/  # Hugging Face Transformers cache
│   └── datasets/      # Hugging Face Datasets cache
└── .huggingface/      # Hugging Face model cache
```

## Usage Examples

### Saving Data

```python
from lmmvibes.utils.persistent_storage import (
    save_data_to_persistent,
    save_uploaded_file,
)

# Save binary data
data_bytes = b"your binary data"
saved_path = save_data_to_persistent(
    data=data_bytes,
    filename="my_data.bin",
    subdirectory="experiments",
)

# Save an uploaded file from Gradio
def handle_upload(uploaded_file):
    if uploaded_file:
        saved_path = save_uploaded_file(uploaded_file, "user_upload.zip")
        return f"Saved to: {saved_path}"
```

### Loading Data

```python
from lmmvibes.utils.persistent_storage import load_data_from_persistent

# Load binary data
data_bytes = load_data_from_persistent("my_data.bin", "experiments")
if data_bytes:
    # Process the data
    data = data_bytes.decode("utf-8")
```

### Listing Files

```python
from lmmvibes.utils.persistent_storage import list_persistent_files

# List all files
all_files = list_persistent_files()

# List specific types of files
json_files = list_persistent_files(subdirectory="experiments", pattern="*.json")
parquet_files = list_persistent_files(subdirectory="dataframes", pattern="*.parquet")
```

### Checking Storage Status

```python
from lmmvibes.utils.persistent_storage import get_storage_info

info = get_storage_info()
print(f"Persistent storage available: {info['persistent_available']}")
print(f"Data directory: {info['data_dir']}")
print(f"Free space: {info['storage_paths']['free_gb']:.1f}GB")
```

## Integration with Your Application

### 1. Data Loading

Your application already uses persistent storage for loading pipeline results:

```python
# In data_loader.py - automatically uses persistent storage when available
def load_pipeline_results(results_dir: str):
    # The function automatically checks for data in persistent storage and
    # falls back to local storage if persistent storage is not available
    pass
```
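If you write your own loaders, the same fallback pattern is easy to replicate. The sketch below is a minimal illustration, not the actual `data_loader.py` logic: `resolve_results_dir` is a hypothetical helper, and the `/data/app_data/experiments` path simply mirrors the storage layout shown earlier.

```python
from pathlib import Path

from lmmvibes.utils.persistent_storage import is_persistent_storage_available

def resolve_results_dir(results_dir: str) -> Path:
    """Prefer a copy of the results in persistent storage, if one exists."""
    if is_persistent_storage_available():
        # Assumed location, based on the /data layout shown above
        candidate = Path("/data/app_data/experiments") / Path(results_dir).name
        if candidate.exists():
            return candidate
    # Fall back to the path as given (e.g. local development)
    return Path(results_dir)
```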
### 2. Caching

The application automatically caches data in persistent storage:

```python
# In data_loader.py - DataCache uses persistent storage when available
class DataCache:
    _cache: dict = {}

    @classmethod
    def get(cls, key: str):
        # Simplified for this guide: the full implementation also checks
        # persistent storage before falling back to this in-memory cache
        return cls._cache.get(key)
```

### 3. User Uploads

For handling user uploads in Gradio:

```python
import gradio as gr

from lmmvibes.utils.persistent_storage import save_uploaded_file

def handle_file_upload(file):
    if file:
        saved_path = save_uploaded_file(file, "user_upload.zip")
        if saved_path:
            return f"✅ File saved to persistent storage: {saved_path.name}"
        else:
            return "❌ Failed to save - persistent storage not available"
    return "⚠️ No file uploaded"

# In your Gradio interface
with gr.Blocks() as demo:
    file_input = gr.File(label="Upload data")
    upload_btn = gr.Button("Save to persistent storage")
    result = gr.Textbox(label="Status")

    upload_btn.click(handle_file_upload, inputs=[file_input], outputs=[result])
```

## Best Practices

### 1. Check Availability

Always check whether persistent storage is available before trying to use it:

```python
from lmmvibes.utils.persistent_storage import (
    is_persistent_storage_available,
    save_data_to_persistent,
)

if is_persistent_storage_available():
    # Use persistent storage
    save_data_to_persistent(data, "important_data.json")
else:
    # Fall back to local storage or in-memory state
    print("Persistent storage not available")
```

### 2. Organize Data

Use subdirectories to organize your data:

```python
# Save experiments in their own directory
save_data_to_persistent(
    data=experiment_data,
    filename=f"{experiment_name}_results.json",
    subdirectory="experiments",
)

# Save dataframes separately
save_data_to_persistent(
    data=dataframe_bytes,
    filename=f"{dataset_name}_data.parquet",
    subdirectory="dataframes",
)
```
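The snippet above assumes you already have `dataframe_bytes` in hand. One way to produce them is to serialize the DataFrame in memory first. The `save_dataframe` wrapper below is a hypothetical convenience helper, and the parquet step assumes `pyarrow` (or `fastparquet`) is installed:

```python
import io

import pandas as pd

from lmmvibes.utils.persistent_storage import save_data_to_persistent

def save_dataframe(df: pd.DataFrame, dataset_name: str):
    # Serialize the DataFrame to parquet in memory, then hand the
    # raw bytes to the persistent-storage helper
    buffer = io.BytesIO()
    df.to_parquet(buffer, index=False)
    return save_data_to_persistent(
        data=buffer.getvalue(),
        filename=f"{dataset_name}_data.parquet",
        subdirectory="dataframes",
    )
```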
### 3. Handle Errors Gracefully

```python
def safe_save_data(data, filename):
    try:
        saved_path = save_data_to_persistent(data, filename)
        if saved_path:
            return f"✅ Saved to {saved_path}"
        else:
            return "❌ Failed to save - storage not available"
    except Exception as e:
        return f"❌ Error saving data: {e}"
```

### 4. Clean Up Old Data

Periodically clean up old files to manage storage space:

```python
import time

from lmmvibes.utils.persistent_storage import (
    list_persistent_files,
    delete_persistent_file,
)

def cleanup_old_files(days_old=30):
    """Delete files older than the specified number of days."""
    cutoff_time = time.time() - (days_old * 24 * 60 * 60)

    for file in list_persistent_files():
        if file.stat().st_mtime < cutoff_time:
            delete_persistent_file(file.name)
```

## Troubleshooting

### 1. Storage Not Available

If persistent storage is not working, inspect its reported status:

```python
from lmmvibes.utils.persistent_storage import get_storage_info

info = get_storage_info()
print(f"Storage available: {info['persistent_available']}")
print(f"Data directory: {info['data_dir']}")
```

### 2. Permission Issues

If you encounter permission issues:

```python
# The utilities automatically create directories with proper permissions.
# If issues persist, check whether /data exists and is writable.
import os

if os.path.isdir("/data") and os.access("/data", os.W_OK):
    print("✅ Persistent storage is accessible and writable")
else:
    print("❌ Persistent storage not accessible")
```

### 3. Storage Full

Monitor storage usage:

```python
from lmmvibes.utils.persistent_storage import get_storage_info

info = get_storage_info()
if info['storage_paths']:
    usage_pct = (info['storage_paths']['used_gb'] / info['storage_paths']['total_gb']) * 100
    if usage_pct > 90:
        print(f"⚠️ Storage nearly full: {usage_pct:.1f}% used")
        # Implement cleanup logic here
```

## Migration from Local Storage

If you're migrating from local storage to persistent storage:

1. **Back up existing data**: Copy your local `data/` directory to persistent storage (see the sketch below)
2. **Update paths**: Use the persistent storage utilities instead of hardcoded paths
3. **Test thoroughly**: Ensure all functionality works with persistent storage
4. **Monitor usage**: Keep track of storage usage and implement cleanup
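A minimal sketch of the backup step follows. `migrate_local_data` is a hypothetical one-off helper; it assumes your legacy files live in a local `data/` directory and that `/data/app_data` matches the persistent layout shown earlier (`shutil.copytree(..., dirs_exist_ok=True)` requires Python 3.8+):

```python
import shutil
from pathlib import Path

from lmmvibes.utils.persistent_storage import is_persistent_storage_available

def migrate_local_data(local_dir: str = "data"):
    """One-off copy of a legacy local data directory into persistent storage."""
    if not is_persistent_storage_available():
        print("Persistent storage not available - nothing to migrate")
        return

    src = Path(local_dir)
    dst = Path("/data/app_data")  # assumed persistent root, per the layout above
    for item in src.iterdir():
        target = dst / item.name
        if item.is_dir():
            shutil.copytree(item, target, dirs_exist_ok=True)
        else:
            shutil.copy2(item, target)
    print(f"Migrated contents of {src} to {dst}")
```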
## Example: Complete Integration

Here's a complete example of integrating persistent storage into your application:

```python
import json

import gradio as gr

from lmmvibes.utils.persistent_storage import (
    save_data_to_persistent,
    load_data_from_persistent,
    list_persistent_files,
    get_storage_info,
    is_persistent_storage_available,
)

def save_experiment_results(results_data, experiment_name):
    """Save experiment results to persistent storage."""
    if not is_persistent_storage_available():
        return "❌ Persistent storage not available"

    try:
        # The Gradio textbox delivers a string; parse it first so the file
        # stores structured JSON rather than a double-encoded string
        if isinstance(results_data, str):
            results_data = json.loads(results_data)

        results_json = json.dumps(results_data, indent=2)
        results_bytes = results_json.encode("utf-8")

        filename = f"{experiment_name}_results.json"
        saved_path = save_data_to_persistent(
            data=results_bytes,
            filename=filename,
            subdirectory="experiments",
        )

        if saved_path:
            return f"✅ Saved experiment to: {saved_path.name}"
        else:
            return "❌ Failed to save experiment"
    except Exception as e:
        return f"❌ Error: {e}"

def load_experiment_results(experiment_name):
    """Load experiment results from persistent storage."""
    filename = f"{experiment_name}_results.json"
    results_bytes = load_data_from_persistent(
        filename=filename,
        subdirectory="experiments",
    )

    if results_bytes:
        results_data = json.loads(results_bytes.decode("utf-8"))
        return json.dumps(results_data, indent=2)
    else:
        return "No results found"

def get_available_experiments():
    """List all available experiments."""
    experiment_files = list_persistent_files(subdirectory="experiments", pattern="*_results.json")
    if experiment_files:
        return "\n".join([f.name for f in experiment_files])
    else:
        return "No experiments found"

# Gradio interface
with gr.Blocks(title="Persistent Storage Demo") as demo:
    gr.Markdown("# Persistent Storage Demo")

    with gr.Tab("Save Experiment"):
        experiment_name = gr.Textbox(label="Experiment Name")
        results_json = gr.Textbox(label="Results (JSON)", lines=5)
        save_btn = gr.Button("Save Experiment")
        save_result = gr.Textbox(label="Save Result")

        save_btn.click(
            save_experiment_results,
            inputs=[results_json, experiment_name],
            outputs=[save_result],
        )

    with gr.Tab("Load Experiment"):
        load_experiment_name = gr.Textbox(label="Experiment Name")
        load_btn = gr.Button("Load Experiment")
        load_result = gr.Textbox(label="Loaded Results", lines=10)

        load_btn.click(
            load_experiment_results,
            inputs=[load_experiment_name],
            outputs=[load_result],
        )

    with gr.Tab("Storage Info"):
        info_btn = gr.Button("Get Storage Info")
        storage_info = gr.Textbox(label="Storage Information", lines=10)

        def get_info():
            info = get_storage_info()
            return json.dumps(info, indent=2)

        info_btn.click(get_info, outputs=[storage_info])

if __name__ == "__main__":
    demo.launch()
```

This comprehensive setup ensures your application can take full advantage of Hugging Face Spaces' persistent storage capabilities while maintaining backward compatibility with local development.