# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import gradio as gr

# # Display loading message
# print("Loading model... this may take a few minutes")

# # Model information
# MODEL_ID = "SamuelJaja/llama-3.1-8b-construction-merged"

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# # Load model with 8-bit quantization to fit in the L4 GPU memory
# from transformers import BitsAndBytesConfig

# # Define quantization config
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True,
#     bnb_8bit_quant_type="nf4",
#     bnb_8bit_compute_dtype=torch.float16
# )

# # Load model with quantization config
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     quantization_config=bnb_config,
#     device_map="auto"
# )

# print("Model loaded successfully!")

# # Format prompt according to Llama 3.1 chat template
# def format_prompt(message):
#     """Format the prompt according to Llama 3.1 chat template"""
#     if "[INST]" not in message:
#         return f"[INST] {message} [/INST]"
#     return message

# # Define the prediction function
# def predict(prompt, temperature=0.1, max_tokens=256, top_p=0.9):
#     """Generate text from the model with specified parameters"""
#     # Format the prompt
#     formatted_prompt = format_prompt(prompt)
    
#     # Tokenize
#     inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    
#     # Generate
#     with torch.no_grad():
#         outputs = model.generate(
#             inputs.input_ids,
#             attention_mask=inputs.attention_mask,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             do_sample=True,
#             pad_token_id=tokenizer.eos_token_id
#         )
    
#     # Decode
#     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     # Remove the prompt from the response
#     if formatted_prompt in generated_text:
#         response = generated_text.replace(formatted_prompt, "").strip()
#     else:
#         response = generated_text
    
#     return response

# # Create Gradio interface - simpler version focused on API functionality
# demo = gr.Interface(
#     fn=predict,
#     inputs=[
#         gr.Textbox(label="Prompt", placeholder="Enter your question about UK building regulations..."),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.1, step=0.1, label="Temperature"),
#         gr.Slider(minimum=64, maximum=2048, value=256, step=64, label="Max Tokens"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top-p")
#     ],
#     outputs=gr.Textbox(label="Response"),
#     title="UK Building Regulations Assistant",
#     description="Ask questions about UK building regulations and construction standards.",
#     examples=[
#         ["What are the fire safety requirements for commercial buildings in the UK?", 0.1, 256, 0.9],
#         ["What are the minimum ceiling heights for residential buildings?", 0.1, 256, 0.9],
#         ["Explain the requirements for emergency exits in office buildings.", 0.1, 256, 0.9]
#     ],
# )

# # Launch the app with API explicitly enabled
# demo.launch(share=False, server_name="0.0.0.0", server_port=7860)


# import torch
# import time
# import os
# import gc
# import logging
# from functools import lru_cache
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# # Configure logging
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

# # Set environment variables for better performance
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# torch.backends.cudnn.benchmark = True

# # Display GPU information
# try:
#     logger.info(f"CUDA available: {torch.cuda.is_available()}")
#     if torch.cuda.is_available():
#         logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
#         logger.info(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
# except Exception as e:
#     logger.warning(f"Unable to get GPU info: {e}")

# # Model configuration
# MODEL_ID = "SamuelJaja/llama-3.1-8b-construction-merged"
# logger.info(f"Loading model: {MODEL_ID}")

# # Optimized 4-bit quantization configuration
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_use_double_quant=True  # Nested quantization for better memory efficiency
# )

# # Load tokenizer with optimization flags
# logger.info("Loading tokenizer...")
# tokenizer = AutoTokenizer.from_pretrained(
#     MODEL_ID,
#     use_fast=True,  # Use the fast Rust-based tokenizer
# )
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"
# logger.info("Tokenizer loaded successfully")

# # Load model with optimized settings
# logger.info("Loading model with 4-bit quantization...")
# start_load_time = time.time()

# try:
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_ID,
#         quantization_config=bnb_config,
#         device_map="auto",
#         torch_dtype=torch.float16,
#         low_cpu_mem_usage=True,
#     )
#     # Force model evaluation mode
#     model.eval()
#     logger.info(f"Model loaded in {time.time() - start_load_time:.2f} seconds")
    
# except Exception as e:
#     logger.error(f"Error loading model with optimized settings: {e}")
#     logger.info("Falling back to 8-bit quantization...")
    
#     # Fallback to 8-bit quantization
#     bnb_config = BitsAndBytesConfig(
#         load_in_8bit=True,
#         bnb_8bit_quant_type="nf4",
#         bnb_8bit_compute_dtype=torch.float16
#     )
    
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_ID,
#         quantization_config=bnb_config,
#         device_map="auto"
#     )
#     model.eval()
#     logger.info(f"Model loaded with fallback settings in {time.time() - start_load_time:.2f} seconds")

# # Perform initial warmup to compile any lazy modules
# logger.info("Performing model warmup...")
# try:
#     dummy_input = tokenizer("Hello", return_tensors="pt").to(model.device)
#     with torch.no_grad(), torch.inference_mode():
#         _ = model.generate(**dummy_input, max_new_tokens=1)
#     logger.info("Model warmup completed successfully")
# except Exception as e:
#     logger.warning(f"Model warmup encountered an issue: {e}")

# # Format prompt according to Llama 3.1 chat template
# def format_prompt(message):
#     """Format the prompt according to Llama 3.1 chat template"""
#     if "[INST]" not in message:
#         return f"[INST] {message} [/INST]"
#     return message

# # Cached tokenization to avoid repeating work
# @lru_cache(maxsize=128)
# def cached_tokenize(prompt):
#     """Tokenize with caching to avoid redundant work"""
#     return tokenizer(prompt, return_tensors="pt")

# # Response cache for frequently asked questions
# response_cache = {}

# # Periodically clean CUDA cache
# def clean_memory():
#     """Clean up CUDA memory"""
#     gc.collect()
#     torch.cuda.empty_cache()
#     logger.info("Memory cleaned")

# # Define the optimized prediction function
# def predict(prompt, temperature=0.1, max_tokens=256, top_p=0.9):
#     """Generate text from the model with specified parameters and performance tracking"""
#     start_time = time.time()
    
#     # Check cache for deterministic queries (when temperature=0)
#     cache_key = None
#     if temperature == 0:
#         cache_key = (prompt, max_tokens, top_p)
#         if cache_key in response_cache:
#             logger.info(f"Cache hit! Returning cached response in {time.time() - start_time:.4f}s")
#             return response_cache[cache_key]
    
#     # Format the prompt
#     formatted_prompt = format_prompt(prompt)
    
#     # Tokenize with potential cache hit
#     inputs = cached_tokenize(formatted_prompt).to(model.device)
    
#     # Log token count
#     input_token_count = inputs.input_ids.shape[1]
#     logger.info(f"Input length: {input_token_count} tokens")
    
#     # Generate with optimized settings
#     try:
#         with torch.no_grad(), torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
#             outputs = model.generate(
#                 inputs.input_ids,
#                 attention_mask=inputs.attention_mask,
#                 max_new_tokens=max_tokens,
#                 temperature=temperature,
#                 top_p=top_p,
#                 do_sample=True if temperature > 0 else False,
#                 pad_token_id=tokenizer.eos_token_id,
#                 use_cache=True,       # Enable KV caching
#                 num_beams=1,          # Disable beam search for faster generation
#                 repetition_penalty=1.1 # Slight penalty to avoid repetition
#             )
#     except RuntimeError as e:
#         if "CUDA out of memory" in str(e):
#             logger.error(f"CUDA OOM error: {e}")
#             clean_memory()
#             return "Sorry, the model ran out of memory. Please try again with a shorter prompt or fewer output tokens."
#         else:
#             logger.error(f"Generation error: {e}")
#             return f"An error occurred during generation: {str(e)}"
    
#     # Decode the generated text
#     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     # Remove the prompt from the response
#     if formatted_prompt in generated_text:
#         response = generated_text.replace(formatted_prompt, "").strip()
#     else:
#         response = generated_text
    
#     # Cache the response if deterministic
#     if cache_key is not None:
#         response_cache[cache_key] = response
        
#     # Calculate and log performance metrics
#     end_time = time.time()
#     total_time = end_time - start_time
#     output_token_count = len(outputs[0]) - input_token_count
#     tokens_per_second = output_token_count / total_time if total_time > 0 else 0
    
#     logger.info(f"Generated {output_token_count} tokens in {total_time:.2f}s ({tokens_per_second:.2f} tokens/sec)")
    
#     # Clean memory occasionally (every 10 requests)
#     if len(response_cache) % 10 == 0:
#         clean_memory()
    
#     return response

# # Create Gradio interface for web UI
# import gradio as gr

# # Create the Gradio UI
# def create_ui():
#     with gr.Blocks(title="UK Building Regulations Assistant") as demo:
#         gr.Markdown("# UK Building Regulations Assistant")
#         gr.Markdown("Ask questions about UK building regulations and construction standards.")
        
#         with gr.Row():
#             with gr.Column(scale=4):
#                 prompt_input = gr.Textbox(
#                     label="Your question", 
#                     placeholder="Enter your question about UK building regulations...",
#                     lines=3
#                 )
            
#         with gr.Row():
#             with gr.Column(scale=1):
#                 temp_slider = gr.Slider(
#                     minimum=0.0, 
#                     maximum=1.0, 
#                     value=0.1, 
#                     step=0.1, 
#                     label="Temperature"
#                 )
#             with gr.Column(scale=1):
#                 tokens_slider = gr.Slider(
#                     minimum=64, 
#                     maximum=2048, 
#                     value=512, 
#                     step=64, 
#                     label="Max Output Tokens"
#                 )
#             with gr.Column(scale=1):
#                 top_p_slider = gr.Slider(
#                     minimum=0.1, 
#                     maximum=1.0, 
#                     value=0.9, 
#                     step=0.1, 
#                     label="Top-p"
#                 )
                
#         submit_btn = gr.Button("Submit", variant="primary")
#         output = gr.Textbox(label="Response", lines=12)
        
#         # Add performance metrics display
#         with gr.Accordion("Performance Metrics", open=False):
#             performance_info = gr.Markdown("")
        
#         # Example questions
#         examples = gr.Examples(
#             examples=[
#                 ["What are the fire safety requirements for commercial buildings in the UK?", 0.1, 512, 0.9],
#                 ["What are the minimum ceiling heights for residential buildings?", 0.0, 512, 0.9],
#                 ["Explain the requirements for emergency exits in office buildings.", 0.1, 512, 0.9],
#                 ["What is the required insulation U-value for external walls in new buildings?", 0.1, 512, 0.9],
#                 ["Describe the accessibility requirements for public bathrooms.", 0.1, 512, 0.9]
#             ],
#             inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider]
#         )
        
#         # Define prediction function with timing
#         def timed_predict(prompt, temp, max_tokens, top_p):
#             start = time.time()
#             response = predict(prompt, temp, max_tokens, top_p)
#             end = time.time()
            
#             # Update performance metrics
#             perf_text = f"""
#             - Time to generate: {end - start:.2f} seconds
#             - Input length: {len(tokenizer.encode(format_prompt(prompt)))} tokens
#             - Output length: {len(tokenizer.encode(response))} tokens
#             - Speed: {len(tokenizer.encode(response)) / (end - start):.2f} tokens/second
#             """
            
#             return response, perf_text
        
#         # Connect the UI elements
#         submit_btn.click(
#             timed_predict, 
#             inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider], 
#             outputs=[output, performance_info]
#         )
        
#         # Also allow Enter key to submit
#         prompt_input.submit(
#             timed_predict, 
#             inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider], 
#             outputs=[output, performance_info]
#         )
        
#     return demo

# # Launch the app with queue enabled
# if __name__ == "__main__":
#     logger.info("Creating UI...")
#     demo = create_ui()
#     logger.info("Launching app...")
#     demo.queue(max_size=10)  # Removed concurrency_count parameter
#     demo.launch(share=False, server_name="0.0.0.0", server_port=7860)


import torch
import time
import os
import gc
import re
import logging
from functools import lru_cache
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set environment variables for better performance
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.backends.cudnn.benchmark = True

# Display GPU information
try:
    logger.info(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
        logger.info(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
except Exception as e:
    logger.warning(f"Unable to get GPU info: {e}")

# Model configuration - UPDATED to use the fine-tuned Instruct model
MODEL_ID = "SamuelJaja/llama-3.1-8b-instruct-construction-merged"
logger.info(f"Loading model: {MODEL_ID}")

# Optimized 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True  # Nested quantization for better memory efficiency
)

# Load tokenizer with optimization flags
logger.info("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,  # Use the fast Rust-based tokenizer
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
logger.info("Tokenizer loaded successfully")

# Load model with optimized settings
logger.info("Loading model with 4-bit quantization...")
start_load_time = time.time()

try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    # Force model evaluation mode
    model.eval()
    logger.info(f"Model loaded in {time.time() - start_load_time:.2f} seconds")
    
except Exception as e:
    logger.error(f"Error loading model with optimized settings: {e}")
    logger.info("Falling back to 8-bit quantization...")
    
    # Fallback to 8-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_quant_type="nf4",
        bnb_8bit_compute_dtype=torch.float16
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto"
    )
    model.eval()
    logger.info(f"Model loaded with fallback settings in {time.time() - start_load_time:.2f} seconds")

# Perform initial warmup to compile any lazy modules
logger.info("Performing model warmup...")
try:
    dummy_input = tokenizer("Hello", return_tensors="pt").to(model.device)
    with torch.no_grad(), torch.inference_mode():
        _ = model.generate(**dummy_input, max_new_tokens=1)
    logger.info("Model warmup completed successfully")
except Exception as e:
    logger.warning(f"Model warmup encountered an issue: {e}")

# IMPROVED: Format prompt according to Llama 3.1 chat template
def format_prompt(message):
    """Format the prompt according to Llama 3.1 chat template"""
    if "[INST]" not in message:
        return f"[INST] {message} [/INST]"
    return message

# IMPROVED: Better response cleaning
def clean_response(text):
    """Remove any instruction tags from the response"""
    return re.sub(r'\[/?INST\]', '', text).strip()

  
# Cached tokenization to avoid repeating work
@lru_cache(maxsize=128)
def cached_tokenize(prompt):
    """Tokenize with caching to avoid redundant work"""
    return tokenizer(prompt, return_tensors="pt")

# Response cache for frequently asked questions
response_cache = {}

# Periodically clean CUDA cache
def clean_memory():
    """Clean up CUDA memory"""
    gc.collect()
    torch.cuda.empty_cache()
    logger.info("Memory cleaned")

# OPTIMIZED: Define the improved prediction function with faster inference settings
def predict(prompt, temperature=0.1, max_tokens=256, top_p=0.9):
    """Generate text from the model with specified parameters and performance tracking"""
    start_time = time.time()
    
    # Check cache for deterministic queries (when temperature=0)
    cache_key = None
    if temperature == 0:
        cache_key = (prompt, max_tokens, top_p)
        if cache_key in response_cache:
            logger.info(f"Cache hit! Returning cached response in {time.time() - start_time:.4f}s")
            return response_cache[cache_key]
    
    # Format the prompt
    formatted_prompt = format_prompt(prompt)
    
    # Tokenize with potential cache hit
    inputs = cached_tokenize(formatted_prompt).to(model.device)
    
    # Log token count
    input_token_count = inputs.input_ids.shape[1]
    logger.info(f"Input length: {input_token_count} tokens")
    
    # Generate with optimized settings - UPDATED FOR SPEED
    try:
        with torch.no_grad(), torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                do_sample=False,  # Deterministic generation for speed
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,       # Enable KV caching
                num_beams=1,          # Disable beam search for faster generation
                repetition_penalty=1.1 # Slight penalty to avoid repetition
            )
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            logger.error(f"CUDA OOM error: {e}")
            clean_memory()
            return "Sorry, the model ran out of memory. Please try again with a shorter prompt or fewer output tokens."
        else:
            logger.error(f"Generation error: {e}")
            return f"An error occurred during generation: {str(e)}"
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # IMPROVED: Better response cleaning
    # First remove the prompt from the response
    if formatted_prompt in generated_text:
        response = generated_text.replace(formatted_prompt, "").strip()
    else:
        response = generated_text
    
    # Then clean any remaining instruction tags
    response = clean_response(response)

        # Trim at any follow-up questions
    question_patterns = [
        # Original patterns from your test_generation function
        r'\s+What\s+is', r'\s+How\s+does', r'\s+Why\s+is', 
        r'\s+When\s+should', r'\s+Where\s+can',
        
        # Additional necessary patterns
        r'\s+What\s+are', r'\s+How\s+can', r'\s+Why\s+are',
        r'\s+How\s+should', r'\s+Where\s+are', r'\s+When\s+can', 
        
        # Catch questions after completed sentences
        r'\?[\s\n]+[A-Z]',
        
        # Catch anything after a closing instruction tag
        r'</INST>.*'
    ]
    for pattern in question_patterns:
        parts = re.split(pattern, response, 1)
        if len(parts) > 1:
            response = parts[0].strip()  
    
    # Cache the response if deterministic
    if cache_key is not None:
        response_cache[cache_key] = response
        
    # Calculate and log performance metrics
    end_time = time.time()
    total_time = end_time - start_time
    output_token_count = len(outputs[0]) - input_token_count
    tokens_per_second = output_token_count / total_time if total_time > 0 else 0
    
    logger.info(f"Generated {output_token_count} tokens in {total_time:.2f}s ({tokens_per_second:.2f} tokens/sec)")
    
    # Clean memory occasionally (every 10 requests)
    if len(response_cache) % 10 == 0:
        clean_memory()
    
    return response

# Create Gradio interface for web UI
import gradio as gr

# Create the Gradio UI
def create_ui():
    with gr.Blocks(title="UK Building Regulations Assistant") as demo:
        gr.Markdown("# UK Building Regulations Assistant")
        gr.Markdown("Ask questions about UK building regulations and construction standards.")
        
        with gr.Row():
            with gr.Column(scale=4):
                prompt_input = gr.Textbox(
                    label="Your question", 
                    placeholder="Enter your question about UK building regulations...",
                    lines=3
                )
            
        with gr.Row():
            with gr.Column(scale=1):
                temp_slider = gr.Slider(
                    minimum=0.0, 
                    maximum=1.0, 
                    value=0.1, 
                    step=0.1, 
                    label="Temperature"
                )
            with gr.Column(scale=1):
                tokens_slider = gr.Slider(
                    minimum=64, 
                    maximum=2048, 
                    value=256,  # UPDATED: Lower default value for speed
                    step=64, 
                    label="Max Output Tokens"
                )
            with gr.Column(scale=1):
                top_p_slider = gr.Slider(
                    minimum=0.1, 
                    maximum=1.0, 
                    value=0.9, 
                    step=0.1, 
                    label="Top-p"
                )
                
        submit_btn = gr.Button("Submit", variant="primary")
        output = gr.Textbox(label="Response", lines=12)
        
        # Add performance metrics display
        with gr.Accordion("Performance Metrics", open=False):
            performance_info = gr.Markdown("")
        
        # Example questions
        examples = gr.Examples(
            examples=[
                ["What are the fire safety requirements for commercial buildings in the UK?", 0.1, 256, 0.9],
                ["What are the minimum ceiling heights for residential buildings?", 0.0, 256, 0.9],
                ["Explain the requirements for emergency exits in office buildings.", 0.1, 256, 0.9],
                ["What is the required insulation U-value for external walls in new buildings?", 0.1, 256, 0.9],
                ["Describe the accessibility requirements for public bathrooms.", 0.1, 256, 0.9]
            ],
            inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider]
        )
        
        # Define prediction function with timing
        def timed_predict(prompt, temp, max_tokens, top_p):
            start = time.time()
            response = predict(prompt, temp, max_tokens, top_p)
            end = time.time()
            
            # Update performance metrics
            perf_text = f"""
            - Time to generate: {end - start:.2f} seconds
            - Input length: {len(tokenizer.encode(format_prompt(prompt)))} tokens
            - Output length: {len(tokenizer.encode(response))} tokens
            - Speed: {len(tokenizer.encode(response)) / (end - start):.2f} tokens/second
            """
            
            return response, perf_text
        
        # Connect the UI elements
        submit_btn.click(
            timed_predict, 
            inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider], 
            outputs=[output, performance_info]
        )
        
        # Also allow Enter key to submit
        prompt_input.submit(
            timed_predict, 
            inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider], 
            outputs=[output, performance_info]
        )
        
    return demo

# Launch the app with queue enabled
if __name__ == "__main__":
    logger.info("Creating UI...")
    demo = create_ui()
    logger.info("Launching app...")
    demo.queue(max_size=10)
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)