["Explain the requirements for emergency exits in office buildings.", 0.1, 512, 0.9], # ["What is the required insulation U-value for external walls in new buildings?", 0.1, 512, 0.9], # ["Describe the accessibility requirements for public bathrooms.", 0.1, 512, 0.9] # ], # inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider] # ) # # Define prediction function with timing # def timed_predict(prompt, temp, max_tokens, top_p): # start = time.time() # response = predict(prompt, temp, max_tokens, top_p) # end = time.time() # # Update performance metrics # perf_text = f""" # - Time to generate: {end - start:.2f} seconds # - Input length: {len(tokenizer.encode(format_prompt(prompt)))} tokens # - Output length: {len(tokenizer.encode(response))} tokens # - Speed: {len(tokenizer.encode(response)) / (end - start):.2f} tokens/second # """ # return response, perf_text # # Connect the UI elements # submit_btn.click( # timed_predict, # inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider], # outputs=[output, performance_info] # ) # # Also allow Enter key to submit # prompt_input.submit( # timed_predict, # inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider], # outputs=[output, performance_info] # ) # return demo # # Launch the app with queue enabled # if __name__ == "__main__": # logger.info("Creating UI...") # demo = create_ui() # logger.info("Launching app...") # demo.queue(max_size=10) # Removed concurrency_count parameter # demo.launch(share=False, server_name="0.0.0.0", server_port=7860) import torch import time import os import gc import re import logging from functools import lru_cache from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Set environment variables for better performance os.environ["TOKENIZERS_PARALLELISM"] = "false" torch.backends.cudnn.benchmark = True # Display GPU information try: logger.info(f"CUDA available: {torch.cuda.is_available()}") if torch.cuda.is_available(): logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}") logger.info(f"CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") except Exception as e: logger.warning(f"Unable to get GPU info: {e}") # Model configuration - UPDATED to use the fine-tuned Instruct model MODEL_ID = "SamuelJaja/llama-3.1-8b-instruct-construction-merged" logger.info(f"Loading model: {MODEL_ID}") # Optimized 4-bit quantization configuration bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True # Nested quantization for better memory efficiency ) # Load tokenizer with optimization flags logger.info("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained( MODEL_ID, use_fast=True, # Use the fast Rust-based tokenizer ) tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "right" logger.info("Tokenizer loaded successfully") # Load model with optimized settings logger.info("Loading model with 4-bit quantization...") start_load_time = time.time() try: model = AutoModelForCausalLM.from_pretrained( MODEL_ID, quantization_config=bnb_config, device_map="auto", torch_dtype=torch.float16, low_cpu_mem_usage=True, ) # Force model evaluation mode model.eval() logger.info(f"Model loaded in {time.time() - start_load_time:.2f} seconds") except Exception as e: logger.error(f"Error loading model with optimized 
settings: {e}") logger.info("Falling back to 8-bit quantization...") # Fallback to 8-bit quantization bnb_config = BitsAndBytesConfig( load_in_8bit=True, bnb_8bit_quant_type="nf4", bnb_8bit_compute_dtype=torch.float16 ) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, quantization_config=bnb_config, device_map="auto" ) model.eval() logger.info(f"Model loaded with fallback settings in {time.time() - start_load_time:.2f} seconds") # Perform initial warmup to compile any lazy modules logger.info("Performing model warmup...") try: dummy_input = tokenizer("Hello", return_tensors="pt").to(model.device) with torch.no_grad(), torch.inference_mode(): _ = model.generate(**dummy_input, max_new_tokens=1) logger.info("Model warmup completed successfully") except Exception as e: logger.warning(f"Model warmup encountered an issue: {e}") # IMPROVED: Format prompt according to Llama 3.1 chat template def format_prompt(message): """Format the prompt according to Llama 3.1 chat template""" if "[INST]" not in message: return f"[INST] {message} [/INST]" return message # IMPROVED: Better response cleaning def clean_response(text): """Remove any instruction tags from the response""" return re.sub(r'\[/?INST\]', '', text).strip() # Cached tokenization to avoid repeating work @lru_cache(maxsize=128) def cached_tokenize(prompt): """Tokenize with caching to avoid redundant work""" return tokenizer(prompt, return_tensors="pt") # Response cache for frequently asked questions response_cache = {} # Periodically clean CUDA cache def clean_memory(): """Clean up CUDA memory""" gc.collect() torch.cuda.empty_cache() logger.info("Memory cleaned") # OPTIMIZED: Define the improved prediction function with faster inference settings def predict(prompt, temperature=0.1, max_tokens=256, top_p=0.9): """Generate text from the model with specified parameters and performance tracking""" start_time = time.time() # Check cache for deterministic queries (when temperature=0) cache_key = None if temperature == 0: cache_key = (prompt, max_tokens, top_p) if cache_key in response_cache: logger.info(f"Cache hit! Returning cached response in {time.time() - start_time:.4f}s") return response_cache[cache_key] # Format the prompt formatted_prompt = format_prompt(prompt) # Tokenize with potential cache hit inputs = cached_tokenize(formatted_prompt).to(model.device) # Log token count input_token_count = inputs.input_ids.shape[1] logger.info(f"Input length: {input_token_count} tokens") # Generate with optimized settings - UPDATED FOR SPEED try: with torch.no_grad(), torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16): outputs = model.generate( inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p, do_sample=False, # Deterministic generation for speed pad_token_id=tokenizer.eos_token_id, use_cache=True, # Enable KV caching num_beams=1, # Disable beam search for faster generation repetition_penalty=1.1 # Slight penalty to avoid repetition ) except RuntimeError as e: if "CUDA out of memory" in str(e): logger.error(f"CUDA OOM error: {e}") clean_memory() return "Sorry, the model ran out of memory. Please try again with a shorter prompt or fewer output tokens." 
else: logger.error(f"Generation error: {e}") return f"An error occurred during generation: {str(e)}" # Decode the generated text generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) # IMPROVED: Better response cleaning # First remove the prompt from the response if formatted_prompt in generated_text: response = generated_text.replace(formatted_prompt, "").strip() else: response = generated_text # Then clean any remaining instruction tags response = clean_response(response) # Trim at any follow-up questions question_patterns = [ # Original patterns from your test_generation function r'\s+What\s+is', r'\s+How\s+does', r'\s+Why\s+is', r'\s+When\s+should', r'\s+Where\s+can', # Additional necessary patterns r'\s+What\s+are', r'\s+How\s+can', r'\s+Why\s+are', r'\s+How\s+should', r'\s+Where\s+are', r'\s+When\s+can', # Catch questions after completed sentences r'\?[\s\n]+[A-Z]', # Catch anything after a closing instruction tag r'.*' ] for pattern in question_patterns: parts = re.split(pattern, response, 1) if len(parts) > 1: response = parts[0].strip() # Cache the response if deterministic if cache_key is not None: response_cache[cache_key] = response # Calculate and log performance metrics end_time = time.time() total_time = end_time - start_time output_token_count = len(outputs[0]) - input_token_count tokens_per_second = output_token_count / total_time if total_time > 0 else 0 logger.info(f"Generated {output_token_count} tokens in {total_time:.2f}s ({tokens_per_second:.2f} tokens/sec)") # Clean memory occasionally (every 10 requests) if len(response_cache) % 10 == 0: clean_memory() return response # Create Gradio interface for web UI import gradio as gr # Create the Gradio UI def create_ui(): with gr.Blocks(title="UK Building Regulations Assistant") as demo: gr.Markdown("# UK Building Regulations Assistant") gr.Markdown("Ask questions about UK building regulations and construction standards.") with gr.Row(): with gr.Column(scale=4): prompt_input = gr.Textbox( label="Your question", placeholder="Enter your question about UK building regulations...", lines=3 ) with gr.Row(): with gr.Column(scale=1): temp_slider = gr.Slider( minimum=0.0, maximum=1.0, value=0.1, step=0.1, label="Temperature" ) with gr.Column(scale=1): tokens_slider = gr.Slider( minimum=64, maximum=2048, value=256, # UPDATED: Lower default value for speed step=64, label="Max Output Tokens" ) with gr.Column(scale=1): top_p_slider = gr.Slider( minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top-p" ) submit_btn = gr.Button("Submit", variant="primary") output = gr.Textbox(label="Response", lines=12) # Add performance metrics display with gr.Accordion("Performance Metrics", open=False): performance_info = gr.Markdown("") # Example questions examples = gr.Examples( examples=[ ["What are the fire safety requirements for commercial buildings in the UK?", 0.1, 256, 0.9], ["What are the minimum ceiling heights for residential buildings?", 0.0, 256, 0.9], ["Explain the requirements for emergency exits in office buildings.", 0.1, 256, 0.9], ["What is the required insulation U-value for external walls in new buildings?", 0.1, 256, 0.9], ["Describe the accessibility requirements for public bathrooms.", 0.1, 256, 0.9] ], inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider] ) # Define prediction function with timing def timed_predict(prompt, temp, max_tokens, top_p): start = time.time() response = predict(prompt, temp, max_tokens, top_p) end = time.time() # Update performance metrics perf_text = f""" - Time to 
            perf_text = f"""
- Time to generate: {end - start:.2f} seconds
- Input length: {len(tokenizer.encode(format_prompt(prompt)))} tokens
- Output length: {len(tokenizer.encode(response))} tokens
- Speed: {len(tokenizer.encode(response)) / (end - start):.2f} tokens/second
"""
            return response, perf_text

        # Connect the UI elements
        submit_btn.click(
            timed_predict,
            inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider],
            outputs=[output, performance_info]
        )

        # Also allow Enter key to submit
        prompt_input.submit(
            timed_predict,
            inputs=[prompt_input, temp_slider, tokens_slider, top_p_slider],
            outputs=[output, performance_info]
        )

    return demo

# Launch the app with queue enabled
if __name__ == "__main__":
    logger.info("Creating UI...")
    demo = create_ui()
    logger.info("Launching app...")
    demo.queue(max_size=10)
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
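
# --- Optional client-side usage sketch (illustrative only; not executed by this app) ---
# A minimal sketch of how the running app could be queried programmatically with the
# `gradio_client` package, assuming it is installed and the server is reachable on
# port 7860. The endpoint name "/timed_predict" is an assumption based on the handler's
# function name; `client.view_api()` lists the names Gradio actually registered.
#
# from gradio_client import Client
#
# client = Client("http://127.0.0.1:7860/")
# print(client.view_api())  # inspect available endpoints and their signatures
# answer, perf = client.predict(
#     "What are the fire safety requirements for commercial buildings in the UK?",
#     0.1,   # temperature
#     256,   # max output tokens
#     0.9,   # top-p
#     api_name="/timed_predict",  # assumed endpoint name; verify with view_api()
# )
# print(answer)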