Spaces:

Alovestocode
/

router-router-zero

Running on Zero

App Files Community

Alovestocode committed 22 days ago

Commit

40a2927

verified ·

1 Parent(s): 7d6ddbd

Refactor: Mount Gradio on FastAPI, use gr.mount_gradio_app for proper integration

Browse files

Files changed (3) hide show

README.md +4 -2
app.py +21 -67
test_api.py +106 -0

README.md CHANGED Viewed

@@ -20,7 +20,7 @@ endpoint via the `HF_ROUTER_API` environment variable.
 | File | Purpose |
 | ---- | ------- |
-| `app.py` | Loads the merged checkpoint on demand (tries `MODEL_REPO` first, then `MODEL_FALLBACKS` or the default Gemma → Llama → Qwen order), exposes a `/v1/generate` API, and serves a small HTML console at `/gradio`. |
 | `requirements.txt` | Minimal dependency set (transformers, bitsandbytes, torch, fastapi, accelerate, sentencepiece, spaces, uvicorn). |
 | `.huggingface/spaces.yml` | Configures the Space for ZeroGPU hardware and disables automatic sleep. |
@@ -75,4 +75,6 @@ that the deployed model returns the expected JSON plan. When running on ZeroGPU
 we recommend keeping `MODEL_LOAD_STRATEGY=8bit` (or `LOAD_IN_8BIT=1`) so the
 weights fit comfortably in the 70GB slice; if that fails the app automatically
 degrades through 4-bit, bf16/fp16, and finally CPU mode. You can inspect the
-active load mode via the `/` healthcheck (`strategy` field).

 | File | Purpose |
 | ---- | ------- |
+| `app.py` | Loads the merged checkpoint on demand (tries `MODEL_REPO` first, then `MODEL_FALLBACKS` or the default Gemma → Llama → Qwen order), exposes a `/v1/generate` API, mounts the Gradio UI at `/gradio`, and keeps a lightweight HTML console at `/console`. |
 | `requirements.txt` | Minimal dependency set (transformers, bitsandbytes, torch, fastapi, accelerate, sentencepiece, spaces, uvicorn). |
 | `.huggingface/spaces.yml` | Configures the Space for ZeroGPU hardware and disables automatic sleep. |
 we recommend keeping `MODEL_LOAD_STRATEGY=8bit` (or `LOAD_IN_8BIT=1`) so the
 weights fit comfortably in the 70GB slice; if that fails the app automatically
 degrades through 4-bit, bf16/fp16, and finally CPU mode. You can inspect the
+active load mode via the `/health` endpoint (`strategy` field). The root path
+(`/`) now redirects to the Gradio UI, while `/console` serves the minimal HTML
+form for quick manual testing.

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import List, Optional, Tuple
 import torch
 from fastapi import FastAPI, HTTPException
-from fastapi.responses import HTMLResponse
 from pydantic import BaseModel
 try:
@@ -297,7 +297,13 @@ def _generate_with_gpu(
 fastapi_app = FastAPI(title="Router Model API", version="1.0.0")
-@fastapi_app.get("/")
 def healthcheck() -> dict[str, str]:
     return {
         "status": "ok",
@@ -329,7 +335,7 @@ def generate_endpoint(payload: GeneratePayload) -> GenerateResponse:
     return GenerateResponse(text=text)
-@fastapi_app.get("/gradio", response_class=HTMLResponse)
 def interactive_ui() -> str:
     return """
     <!doctype html>
@@ -495,8 +501,9 @@ with gr.Blocks(
                 }
                 ```
-                **GET** `/` - Health check
-                **GET** `/gradio` - Interactive UI
                 """)
     # Event handlers
@@ -505,72 +512,19 @@ with gr.Blocks(
         inputs=[prompt_input, max_tokens_input, temp_input, top_p_input],
         outputs=output,
     )
     clear_btn.click(
         fn=lambda: ("", ""),
         outputs=[prompt_input, output],
     )
-    # Add FastAPI routes using Gradio's load event
-    # This ensures routes are added after Gradio is fully initialized
-    def add_api_routes():
-        """Add API routes after Gradio app is loaded."""
-        try:
-            from fastapi.responses import JSONResponse
-            from starlette.routing import Route
-            async def generate_handler(request):
-                """Handle POST /v1/generate requests."""
-                try:
-                    data = await request.json()
-                    payload = GeneratePayload(**data)
-                    text = _generate_with_gpu(
-                        prompt=payload.prompt,
-                        max_new_tokens=payload.max_new_tokens or MAX_NEW_TOKENS,
-                        temperature=payload.temperature or DEFAULT_TEMPERATURE,
-                        top_p=payload.top_p or DEFAULT_TOP_P,
-                    )
-                    return JSONResponse(content={"text": text})
-                except Exception as exc:
-                    from fastapi import HTTPException
-                    raise HTTPException(status_code=500, detail=str(exc))
-            async def healthcheck_handler(request):
-                """Handle GET /api/health requests."""
-                return JSONResponse(content={
-                    "status": "ok",
-                    "model": MODEL_ID,
-                    "strategy": ACTIVE_STRATEGY or "pending",
-                })
-            async def gradio_ui_handler(request):
-                """Handle GET /api/gradio requests."""
-                return HTMLResponse(interactive_ui())
-            # Add routes using Route objects
-            gradio_app.app.router.routes.append(
-                Route("/v1/generate", generate_handler, methods=["POST"])
-            )
-            gradio_app.app.router.routes.append(
-                Route("/api/health", healthcheck_handler, methods=["GET"])
-            )
-            gradio_app.app.router.routes.append(
-                Route("/api/gradio", gradio_ui_handler, methods=["GET"])
-            )
-            gradio_app.app.router.routes.append(
-                Route("/gradio", gradio_ui_handler, methods=["GET"])
-            )
-            print("FastAPI routes added successfully via load event")
-        except Exception as e:
-            print(f"Warning: Could not add FastAPI routes: {e}")
-            import traceback
-            traceback.print_exc()
-    # Use load event to add routes after app initialization
-    gradio_app.load(add_api_routes)
-# Set app to Gradio Blocks for Spaces - ZeroGPU requires Gradio SDK
-app = gradio_app
 if __name__ == "__main__":  # pragma: no cover
-    app.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

 import torch
 from fastapi import FastAPI, HTTPException
+from fastapi.responses import HTMLResponse, RedirectResponse
 from pydantic import BaseModel
 try:
 fastapi_app = FastAPI(title="Router Model API", version="1.0.0")
+@fastapi_app.get("/", response_class=RedirectResponse)
+def root_redirect() -> RedirectResponse:
+    """Redirect root traffic to the Gradio UI for a cleaner Spaces landing.
+
+    Returns:
+        A redirect response with status code 307 whose target is ``/gradio``.
+    """
+    return RedirectResponse(url="/gradio", status_code=307)
+@fastapi_app.get("/health")
 def healthcheck() -> dict[str, str]:
     return {
         "status": "ok",
     return GenerateResponse(text=text)
+@fastapi_app.get("/console", response_class=HTMLResponse)
 def interactive_ui() -> str:
     return """
     <!doctype html>
                 }
                 ```
+                **GET** `/health` - JSON health check
+                **GET** `/gradio` - Full Gradio UI
+                **GET** `/console` - Minimal HTML console
                 """)
     # Event handlers
         inputs=[prompt_input, max_tokens_input, temp_input, top_p_input],
         outputs=output,
     )
     clear_btn.click(
         fn=lambda: ("", ""),
         outputs=[prompt_input, output],
     )
+# Enable queued execution so ZeroGPU can schedule GPU work reliably
+gradio_app.queue(max_size=8)
+# Mount the Gradio UI onto the FastAPI app (served under /gradio)
+app = gr.mount_gradio_app(fastapi_app, gradio_app, path="/gradio")
 if __name__ == "__main__":  # pragma: no cover
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))

test_api.py ADDED Viewed

	@@ -0,0 +1,106 @@

+#!/usr/bin/env python3
+"""Test script for Router API endpoints."""
+import requests
+import json
+import time
+import sys
+BASE_URL = "https://Alovestocode-router-router-zero.hf.space"
+def test_healthcheck():
+    """Test the health check endpoint.
+
+    Sends ``GET {BASE_URL}/health`` with a 10-second timeout and prints the
+    status code plus either the pretty-printed JSON body or the raw error text.
+
+    Returns:
+        True when the response status is 200 and its body decodes as JSON;
+        False for any other status or when any exception is raised.
+    """
+    print("Testing GET /health...")
+    try:
+        response = requests.get(f"{BASE_URL}/health", timeout=10)
+        print(f"Status: {response.status_code}")
+        if response.status_code == 200:
+            # response.json() runs inside the try block, so a non-JSON body
+            # falls through to the except branch and counts as a failure.
+            print(f"Response: {json.dumps(response.json(), indent=2)}")
+            return True
+        else:
+            print(f"Error: {response.text}")
+            return False
+    except Exception as e:
+        # Any failure (request error, timeout, JSON decoding) is reported
+        # on stdout and turned into a failed check instead of a traceback.
+        print(f"Exception: {e}")
+        return False
+def test_generate():
+    """Test the generate endpoint.
+
+    Sends ``POST {BASE_URL}/v1/generate`` with a fixed JSON payload (prompt,
+    ``max_new_tokens``, ``temperature``, ``top_p``) and a 60-second timeout,
+    then prints the status code and a summary of the response.
+
+    Returns:
+        True when the response status is 200 and its body decodes as JSON,
+        whether or not the body contains a ``"text"`` key; False for any
+        other status or when any exception is raised.
+    """
+    print("\nTesting POST /v1/generate...")
+    try:
+        payload = {
+            "prompt": "You are a router agent. User query: What is 2+2?",
+            "max_new_tokens": 100,
+            "temperature": 0.2,
+            "top_p": 0.9
+        }
+        response = requests.post(
+            f"{BASE_URL}/v1/generate",
+            json=payload,
+            headers={"Content-Type": "application/json"},
+            timeout=60  # Longer timeout for model loading
+        )
+        print(f"Status: {response.status_code}")
+        if response.status_code == 200:
+            result = response.json()
+            print(f"Response keys: {list(result.keys())}")
+            if "text" in result:
+                # Only a 200-character preview is printed to keep output short.
+                print(f"Generated text (first 200 chars): {result['text'][:200]}...")
+            else:
+                # A missing "text" key is only reported; the check still passes.
+                print(f"Full response: {json.dumps(result, indent=2)}")
+            return True
+        else:
+            print(f"Error: {response.text}")
+            return False
+    except Exception as e:
+        # Any failure (request error, timeout, JSON decoding) is reported
+        # on stdout and turned into a failed check instead of a traceback.
+        print(f"Exception: {e}")
+        return False
+def test_gradio_ui():
+    """Test the Gradio UI endpoint.
+
+    Sends ``GET {BASE_URL}/gradio`` with a 10-second timeout and prints the
+    status code, the body length and the ``content-type`` header.
+
+    Returns:
+        True when the response status is 200; False for any other status or
+        when any exception is raised.
+    """
+    print("\nTesting GET /gradio (UI redirect target)...")
+    try:
+        response = requests.get(f"{BASE_URL}/gradio", timeout=10)
+        print(f"Status: {response.status_code}")
+        if response.status_code == 200:
+            print(f"Response length: {len(response.text)} chars")
+            print(f"Response type: {response.headers.get('content-type', 'unknown')}")
+            return True
+        else:
+            # Only the first 200 characters of the error body are printed.
+            print(f"Error: {response.text[:200]}")
+            return False
+    except Exception as e:
+        # Any failure is reported on stdout and turned into a failed check.
+        print(f"Exception: {e}")
+        return False
+def main():
+    """Run all API tests.
+
+    Prints a banner, sleeps for 5 seconds, runs the health, generate and
+    Gradio UI checks in that order, and prints a pass/fail line for each.
+    Always terminates the process via ``sys.exit``: exit code 0 when every
+    check returned a truthy value, 1 otherwise.
+    """
+    print("=" * 60)
+    print("Router API Test Suite")
+    print("=" * 60)
+    print(f"Base URL: {BASE_URL}\n")
+    # Wait a moment for Space to be ready
+    print("Waiting 5 seconds for Space to be ready...")
+    time.sleep(5)
+    # Each entry is a (display name, passed) pair, kept in execution order.
+    results = []
+    # Test endpoints
+    results.append(("Health Check", test_healthcheck()))
+    results.append(("Generate", test_generate()))
+    results.append(("Gradio UI", test_gradio_ui()))
+    # Summary
+    print("\n" + "=" * 60)
+    print("Test Summary")
+    print("=" * 60)
+    for name, passed in results:
+        status = "✅ PASS" if passed else "❌ FAIL"
+        print(f"{name}: {status}")
+    # The exit code mirrors the overall result so callers can detect failure.
+    all_passed = all(result[1] for result in results)
+    sys.exit(0 if all_passed else 1)
+if __name__ == "__main__":
+    main()