#!/usr/bin/env python3
"""
Technical Documentation RAG System - Streamlit Interface
A professional web interface for the RAG system with answer generation,
optimized for technical documentation Q&A.
"""
import streamlit as st
import sys
from pathlib import Path
import time
import traceback
from typing import Dict, Any
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
# Import directly since we're in the project directory
sys.path.insert(0, str(Path(__file__).parent))
from src.rag_with_generation import RAGWithGeneration
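# Typically launched with `streamlit run` pointing at this file; the sys.path inserts above
# keep the src/ package importable regardless of the working directory.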
# Page configuration
st.set_page_config(
page_title="Technical Documentation RAG Assistant",
page_icon="🔍",
layout="wide",
initial_sidebar_state="expanded",
)
# Custom CSS for professional styling
st.markdown(
"""
""",
unsafe_allow_html=True,
)
@st.cache_resource
def initialize_rag_system():
"""Initialize the RAG system with HuggingFace API."""
try:
import os
# Try multiple common token names
api_token = (
os.getenv("HUGGINGFACE_API_TOKEN")
or os.getenv("HF_TOKEN")
or os.getenv("HF_API_TOKEN")
)
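        # If none of these variables are set, api_token stays None; the generator is then
        # expected to fall back to unauthenticated (rate-limited) API access.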
# Debug logging (will show in Spaces logs) - force to stderr for visibility
if api_token:
print(
f"✅ Found HF token (starts with: {api_token[:8]}...)",
file=sys.stderr,
flush=True,
)
else:
print(
"⚠️ No HF token found in environment variables",
file=sys.stderr,
flush=True,
)
print(
f"Available env vars: {list(os.environ.keys())}",
file=sys.stderr,
flush=True,
)
# Check if we're running locally or in HuggingFace Spaces
is_hf_spaces = os.getenv("SPACE_ID") is not None # HF Spaces sets SPACE_ID
use_ollama = os.getenv("USE_OLLAMA", "false").lower() == "true"
use_inference_providers = os.getenv("USE_INFERENCE_PROVIDERS", "false").lower() == "true"
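        # When both flags are enabled, the Inference Providers path takes precedence
        # (see the if/elif chains below).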
if is_hf_spaces:
print("🚀 Running in HuggingFace Spaces", file=sys.stderr, flush=True)
if use_inference_providers:
print("🚀 Using Inference Providers API in HuggingFace Spaces", file=sys.stderr, flush=True)
elif use_ollama:
print("🦙 Ollama enabled in HuggingFace Spaces", file=sys.stderr, flush=True)
else:
print("🤗 Using classic HuggingFace API in Spaces", file=sys.stderr, flush=True)
else:
print("💻 Running locally", file=sys.stderr, flush=True)
if use_inference_providers:
print("🚀 Using Inference Providers API locally", file=sys.stderr, flush=True)
elif use_ollama:
print("🦙 Using local Ollama", file=sys.stderr, flush=True)
else:
print("🤗 Using classic HuggingFace API locally", file=sys.stderr, flush=True)
ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434")
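        # Default assumes a local Ollama server; override OLLAMA_URL when the server runs
        # elsewhere (e.g., in another container or on a remote host).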
if use_inference_providers:
model_name = os.getenv("INFERENCE_PROVIDERS_MODEL", "microsoft/DialoGPT-medium")
print(
f"🚀 Configured for Inference Providers API with model: {model_name}",
file=sys.stderr,
flush=True,
)
elif use_ollama:
model_name = os.getenv("OLLAMA_MODEL", "llama3.2:1b")
print(
f"🦙 Configured for local Ollama with model: {model_name}",
file=sys.stderr,
flush=True,
)
else:
model_name = "sshleifer/distilbart-cnn-12-6" # Confirmed working HF model
print(
f"🤗 Configured for classic HuggingFace API with model: {model_name}",
file=sys.stderr,
flush=True,
)
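        # Temperature and max token limits are fixed here; the same values are echoed in the
        # sidebar status panel, so update both places if you change them.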
rag = RAGWithGeneration(
model_name=model_name,
api_token=api_token,
temperature=0.3,
max_tokens=512,
use_ollama=use_ollama,
ollama_url=ollama_url,
use_inference_providers=use_inference_providers,
)
return rag, None
except Exception as e:
return None, str(e)
def display_header():
"""Display the main header and description."""
    st.markdown(
        "<h1>🔍 Technical Documentation RAG Assistant</h1>",
        unsafe_allow_html=True,
    )
st.markdown(
"""
**Intelligent Q&A System for Technical Documentation**
This system uses hybrid search (semantic + keyword matching) combined with LLM generation
(local Ollama or the HuggingFace API) to provide accurate, cited answers from your technical documentation.
**Features:**
- 🚀 Hybrid retrieval for optimal relevance
- 📚 Automatic citation and source attribution
- 🎯 Confidence scoring for answer quality
- 🦙 Local Ollama LLM for privacy and speed
- 🔧 Advanced prompt engineering with domain expertise
"""
)
# Add deployment status info
import os
if os.getenv("SPACE_ID"):
st.info("🌟 **Running on HuggingFace Spaces** - Local Ollama model with containerized inference")
else:
st.info("💻 **Running locally** - Connect to your local Ollama server")
def display_system_status(rag_system):
"""Display system status and metrics."""
if rag_system is None:
return
with st.sidebar:
st.header("📊 System Status")
# Check if documents are indexed
if hasattr(rag_system, "chunks") and rag_system.chunks:
st.success(f"✅ {len(rag_system.chunks)} chunks indexed")
# Show document sources
sources = set(chunk.get("source", "unknown") for chunk in rag_system.chunks)
st.info(f"📄 {len(sources)} documents loaded")
with st.expander("Document Details"):
for source in sorted(sources):
source_name = Path(source).name if source != "unknown" else source
chunk_count = len(
[c for c in rag_system.chunks if c.get("source") == source]
)
st.write(f"• {source_name}: {chunk_count} chunks")
else:
st.warning("⚠️ No documents indexed")
st.info("Upload a PDF to get started")
# Model information and status
st.header("🤖 Model Status")
if hasattr(rag_system, "answer_generator"):
model_name = getattr(
rag_system.answer_generator, "model_name", "gpt2-medium"
)
st.write(f"**Model:** {model_name}")
# Get detailed generator info
if hasattr(rag_system, 'get_generator_info'):
generator_info = rag_system.get_generator_info()
st.write(f"**Generator:** {generator_info['generator_type']}")
st.write(f"**Using Ollama:** {generator_info['using_ollama']}")
st.write(f"**Using Inference Providers:** {generator_info['using_inference_providers']}")
if generator_info['base_url']:
st.write(f"**Base URL:** {generator_info['base_url']}")
# Show status based on generator type
if getattr(rag_system, '_using_inference_providers', False):
st.success("🚀 **Inference Providers API Connected**")
st.info("⚡ **Fast Responses**\nExpected response time: 2-5 seconds for most queries")
# Add helpful tips
with st.expander("💡 Performance Tips"):
st.markdown("""
- **Response time**: 2-5 seconds (much faster than Ollama)
- **Reliability**: Enterprise-grade infrastructure with automatic failover
- **Models**: Latest instruction-tuned models optimized for Q&A
- **Rate limits**: Free tier available, PRO tier for higher limits
""")
elif getattr(rag_system, '_using_ollama', False):
st.success("🦙 **Ollama Connected**")
st.warning("⏱️ **First Query Notice**\nFirst query may take 30-60s for model warmup. Subsequent queries will be much faster!")
# Add helpful tips
with st.expander("💡 Performance Tips"):
st.markdown("""
- **First query**: 30-60 seconds (warmup)
- **Subsequent queries**: 10-20 seconds
- **Best practice**: Wait for first query to complete before trying another
- **If timeout occurs**: Simply retry the same query
""")
else:
st.success("🤗 **Classic HuggingFace API Ready**")
st.info("📊 Using traditional Inference API with model-specific parsing")
else:
st.write(f"**Model:** gpt2-medium (HuggingFace API)")
st.success("🤗 **HuggingFace API Ready**")
st.write(f"**Temperature:** 0.3")
st.write(f"**Max Tokens:** 512")
def handle_document_upload(rag_system):
"""Handle PDF document upload and indexing."""
st.header("📄 Document Management")
uploaded_file = st.file_uploader(
"Upload PDF Document",
type="pdf",
help="Upload a technical PDF document to add to the knowledge base",
)
if uploaded_file is not None:
if st.button("Index Document", type="primary"):
try:
with st.spinner("Processing document..."):
# Save uploaded file temporarily
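                    # NOTE: writing to /tmp assumes a POSIX filesystem (true on Linux containers
                    # such as HF Spaces); tempfile.NamedTemporaryFile would be more portable.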
temp_path = Path(f"/tmp/{uploaded_file.name}")
with open(temp_path, "wb") as f:
f.write(uploaded_file.getvalue())
# Index the document
start_time = time.time()
chunk_count = rag_system.index_document(temp_path)
processing_time = time.time() - start_time
# Clean up temp file
temp_path.unlink()
st.markdown(
f"""
✅ Document indexed successfully!
📊 {chunk_count} chunks created in {processing_time:.2f}s
📄 Ready for queries
""",
unsafe_allow_html=True,
)
# Refresh the page to update sidebar
st.rerun()
except Exception as e:
st.markdown(
f"""
❌ Error processing document:
{str(e)}
""",
unsafe_allow_html=True,
)
def handle_query_interface(rag_system):
"""Handle the main query interface."""
st.header("🤔 Ask Your Question")
# Check if documents are available
if not hasattr(rag_system, "chunks") or not rag_system.chunks:
st.warning("Please upload and index a document first to ask questions.")
return
# Query input
query = st.text_input(
"Enter your question:",
placeholder="e.g., What is RISC-V? How does instruction encoding work?",
help="Ask any question about the uploaded technical documentation",
)
# Advanced options
with st.expander("⚙️ Advanced Options"):
col1, col2 = st.columns(2)
with col1:
use_hybrid = st.checkbox(
"Use Hybrid Search",
value=True,
help="Combine semantic and keyword search",
)
dense_weight = st.slider(
"Semantic Weight",
0.0,
1.0,
0.7,
0.1,
help="Weight for semantic search (vs keyword)",
)
with col2:
top_k = st.slider(
"Number of Sources",
1,
10,
5,
help="Number of source chunks to retrieve",
)
similarity_threshold = st.slider(
"Similarity Threshold",
0.0,
1.0,
0.3,
0.05,
help="Minimum similarity to include results (higher = more strict)",
)
use_fallback_llm = st.checkbox(
"Use Fallback Model",
value=False,
help="Use larger model for complex queries",
)
# Query processing
if query and st.button("Get Answer", type="primary"):
try:
# Check if this might be a first query with Ollama
is_ollama = hasattr(rag_system, 'answer_generator') and hasattr(rag_system.answer_generator, 'base_url')
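            # Heuristic: only the Ollama-backed generator is assumed to expose a base_url
            # attribute; adjust this check if other generator types gain one.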
if is_ollama:
# Show special loading message for potential warmup
with st.spinner("🔥 Initializing model (first query may take 30-60s for warmup)..."):
start_time = time.time()
# Add progress indicator for warmup
progress_placeholder = st.empty()
progress_placeholder.info("⏳ Model warming up... This is normal for the first query and won't happen again.")
else:
with st.spinner("Searching and generating answer..."):
start_time = time.time()
progress_placeholder = None
# Debug: Check if documents are actually indexed
print(
f"🔍 Debug: Chunks available: {len(getattr(rag_system, 'chunks', []))}",
file=sys.stderr,
flush=True,
)
if hasattr(rag_system, "chunks") and rag_system.chunks:
print(
f"🔍 Debug: First chunk preview: {rag_system.chunks[0].get('text', '')[:100]}...",
file=sys.stderr,
flush=True,
)
# Get answer
result = rag_system.query_with_answer(
question=query,
top_k=top_k,
use_hybrid=use_hybrid,
dense_weight=dense_weight,
use_fallback_llm=use_fallback_llm,
return_context=True,
similarity_threshold=similarity_threshold,
)
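            # return_context=True includes the raw retrieved chunks in the result so they can
            # be inspected via the "Show Retrieved Context" option below.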
# Clear progress indicator if it was shown
if is_ollama and progress_placeholder:
progress_placeholder.empty()
# Debug: Check what was retrieved
print(
f"🔍 Debug: Retrieved chunks: {len(result.get('context', []))}",
file=sys.stderr,
flush=True,
)
print(
f"🔍 Debug: Citations: {len(result.get('citations', []))}",
file=sys.stderr,
flush=True,
)
print(
f"🔍 Debug: Answer preview: {result.get('answer', '')[:100]}...",
file=sys.stderr,
flush=True,
)
total_time = time.time() - start_time
# Display results
display_answer_results(result, total_time)
except Exception as e:
# Clear progress indicator if it was shown
if is_ollama and progress_placeholder:
progress_placeholder.empty()
# Distinguish between different error types
error_message = str(e)
if "timeout" in error_message.lower() or "read timed out" in error_message.lower():
# Likely warmup timeout
st.markdown(
f"""
⏱️ Model Warmup Timeout
The first query timed out during model initialization. This is normal behavior.
What to do:
• Wait a moment and try the same query again
• Subsequent queries should complete much faster (10-20 seconds)
• The model is now warmed up and ready
""",
unsafe_allow_html=True,
)
elif "connection" in error_message.lower() or "host" in error_message.lower():
# Connection issues
st.markdown(
f"""
🔌 Connection Error
Unable to connect to the model service.
What to do:
• Wait a moment for the service to start up
• Try your query again
• Check if the container is still initializing
""",
unsafe_allow_html=True,
)
else:
# Generic error
st.markdown(
f"""
❌ Error generating answer:
{str(e)}
Suggestion: Try rephrasing your question or wait a moment and retry.
""",
unsafe_allow_html=True,
)
# Show detailed error in expander for debugging
with st.expander("🔍 Technical Details"):
st.code(traceback.format_exc())
def display_answer_results(result: Dict[str, Any], total_time: float):
"""Display the answer results in a formatted way."""
# Main answer
st.markdown(
f"""
📝 Answer
{result['answer']}
""",
unsafe_allow_html=True,
)
# Check if this is a rejection/out-of-scope answer
answer_text = result['answer'].lower()
is_rejection = any(phrase in answer_text for phrase in [
"not available in the context",
"cannot answer",
"not found in the documentation",
"outside the scope",
"not covered in the provided",
"no information about",
"cannot provide information",
"doesn't contain information"
])
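    # Substring heuristic over common refusal phrasings; differently worded rejections
    # will not be detected.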
# Metrics row
col1, col2, col3, col4 = st.columns(4)
with col1:
# Streamlit only accepts 'normal', 'inverse', or 'off' for delta_color
confidence_color = "normal" if result["confidence"] > 0.6 else "inverse"
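        # Note: delta_color only styles a delta value; since st.metric is called without a
        # delta here, it has no visible effect.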
st.metric(
"Confidence", f"{result['confidence']:.1%}", delta_color=confidence_color
)
with col2:
# Show different metric for rejections
if is_rejection:
st.metric("Status", "Out of scope")
else:
st.metric("Sources", len(result["citations"]))
with col3:
st.metric("Total Time", f"{total_time:.2f}s")
with col4:
retrieval_method = result.get("retrieval_stats", {}).get("method", "unknown")
st.metric("Method", retrieval_method)
# Citations - only show for valid answers
if result["citations"] and not is_rejection:
st.markdown("### 📚 Sources")
for i, citation in enumerate(result["citations"], 1):
st.markdown(
f"""
{i}. {citation['source']} (Page {citation['page']})
Relevance: {citation['relevance']:.1%}
"{citation['snippet']}"
""",
unsafe_allow_html=True,
)
elif is_rejection:
st.info("💡 **Tip**: This question appears to be outside the scope of the uploaded documentation. Try asking about topics covered in the indexed documents.")
# Detailed metrics
with st.expander("📊 Detailed Metrics"):
col1, col2 = st.columns(2)
with col1:
st.subheader("Retrieval Stats")
retrieval_stats = result.get("retrieval_stats", {})
st.json(retrieval_stats)
with col2:
st.subheader("Generation Stats")
generation_stats = result.get("generation_stats", {})
st.json(generation_stats)
# Context chunks (for debugging)
if "context" in result and st.checkbox("Show Retrieved Context"):
st.subheader("🔍 Retrieved Context")
for i, chunk in enumerate(result["context"], 1):
with st.expander(
f"Chunk {i} - {Path(chunk.get('source', 'unknown')).name} (Page {chunk.get('page', 'unknown')})"
):
st.write(
f"**Score:** {chunk.get('hybrid_score', chunk.get('similarity_score', 0)):.3f}"
)
st.write(f"**Text:** {chunk.get('text', '')[:500]}...")
def display_sample_queries():
"""Display sample queries for user guidance."""
st.header("💡 Sample Queries")
sample_queries = [
"What is RISC-V and what are its main features?",
"How does the RISC-V instruction encoding work?",
"What are the differences between RV32I and RV64I?",
"Explain the RISC-V register model and naming conventions",
"How does RISC-V handle memory ordering and consistency?",
"What are the RISC-V privileged instruction set features?",
"How do atomic instructions work in RISC-V?",
"What is the RISC-V calling convention?",
]
for query in sample_queries:
if st.button(f"📌 {query}", key=f"sample_{hash(query)}"):
st.session_state["sample_query"] = query
st.rerun()
def main():
"""Main Streamlit application."""
# Initialize session state
if "rag_system" not in st.session_state:
st.session_state["rag_system"] = None
st.session_state["init_error"] = None
# Display header
display_header()
# Initialize RAG system
if st.session_state["rag_system"] is None:
with st.spinner("Initializing RAG system..."):
rag_system, error = initialize_rag_system()
st.session_state["rag_system"] = rag_system
st.session_state["init_error"] = error
rag_system = st.session_state["rag_system"]
init_error = st.session_state["init_error"]
# Check for initialization errors
if init_error:
st.markdown(
f"""
❌ Failed to initialize RAG system:
{init_error}
System uses HuggingFace Inference API
• For better models, add your HF Pro token as HUGGINGFACE_API_TOKEN
• Check internet connection for API access
""",
unsafe_allow_html=True,
)
return
if rag_system is None:
st.error("Failed to initialize RAG system. Please check the logs.")
return
# Display system status in sidebar
display_system_status(rag_system)
# Main interface
tab1, tab2, tab3 = st.tabs(
["🤔 Ask Questions", "📄 Manage Documents", "💡 Examples"]
)
with tab1:
# Handle sample query selection
if "sample_query" in st.session_state:
st.text_input(
"Enter your question:",
value=st.session_state["sample_query"],
key="main_query",
)
del st.session_state["sample_query"]
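        # NOTE: this pre-filled input uses key="main_query", while handle_query_interface()
        # creates its own unkeyed text_input, so the sample text is shown but not
        # automatically submitted as the active query.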
handle_query_interface(rag_system)
with tab2:
handle_document_upload(rag_system)
# Option to load test document
st.subheader("📖 Test Document")
test_pdf_path = Path("data/test/riscv-base-instructions.pdf")
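        # Relative path: resolved against the current working directory, so the app is
        # expected to be started from the project root.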
if test_pdf_path.exists():
if st.button("Load RISC-V Test Document"):
try:
with st.spinner("Loading test document..."):
chunk_count = rag_system.index_document(test_pdf_path)
st.success(
f"✅ Test document loaded! {chunk_count} chunks indexed."
)
st.rerun()
except Exception as e:
st.error(f"Failed to load test document: {e}")
else:
st.info("Test document not found at data/test/riscv-base-instructions.pdf")
with tab3:
display_sample_queries()
# Footer
st.markdown("---")
st.markdown(
"""
Technical Documentation RAG Assistant | Powered by HuggingFace API & RISC-V Documentation
Built for ML Engineer Portfolio | Swiss Tech Market Focus
""",
unsafe_allow_html=True,
)
if __name__ == "__main__":
main()