import streamlit as st
from llama_cpp import Llama
import os


def main():
    # Direct download URL for the GGUF file (kept for reference; the app expects
    # the model to already exist at model_path, e.g. baked into the Docker image).
    direct_url = "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf"
    model_path = "model/mistral-7b-v0.1.Q4_K_M.gguf"

    # Check if the model file exists
    if not os.path.exists(model_path):
        st.error(f"Model file {model_path} not found! Please ensure the model is included in the Docker image.")
        return

    # Load the model once and cache it across Streamlit reruns
    @st.cache_resource
    def load_model():
        return Llama(
            model_path=model_path,
            n_ctx=4096,
            n_gpu_layers=0,  # CPU only
            verbose=False,
        )

    llm = load_model()

    def process_query(query: str) -> str:
        MAX_ATTEMPTS = 5
        for attempt in range(MAX_ATTEMPTS):
            try:
                response = llm(
                    query,
                    max_tokens=1024,
                    temperature=0.4,
                    top_p=0.95,
                    echo=False,
                    stop=["Question:", "\n\n"],
                )
                answer = response['choices'][0]['text'].strip()

                # Retry if the response is empty or too short
                if not answer or len(answer) < 2:
                    print(f"Got empty or too short response: '{answer}'. Retrying...")
                    continue
                else:
                    return answer
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {str(e)}")
                continue

        return "I apologize, but after multiple attempts, I was unable to generate a satisfactory response. Please try rephrasing your question."

    # Streamlit UI
    st.title("llama_cpp GGUF Model Inference")
    user_input = st.text_input("Enter your prompt:")

    if st.button("Generate"):
        if user_input:
            with st.spinner("Generating response..."):
                output = process_query(user_input)
            st.success("Response generated!")
            st.write(output)
        else:
            st.error("Please enter a prompt.")


if __name__ == "__main__":
    main()
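The app only checks that the GGUF file exists and errors out otherwise, and the `direct_url` variable is never used. Below is a minimal sketch of one way that URL could be used to fetch the model before building the Docker image or starting the app; the `download_model.py` script and its helper name are hypothetical and not part of the original app.

    # download_model.py - hypothetical helper (not part of the original app).
    # Fetches the GGUF file from the direct_url defined in the app so that
    # model_path exists before the Streamlit app starts.
    import os
    import urllib.request

    DIRECT_URL = "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf"
    MODEL_PATH = "model/mistral-7b-v0.1.Q4_K_M.gguf"


    def download_model() -> None:
        """Download the GGUF model to the path the Streamlit app expects."""
        if os.path.exists(MODEL_PATH):
            print(f"{MODEL_PATH} already exists, skipping download.")
            return
        os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
        print(f"Downloading model to {MODEL_PATH} (several GB, this may take a while)...")
        urllib.request.urlretrieve(DIRECT_URL, MODEL_PATH)
        print("Download complete.")


    if __name__ == "__main__":
        download_model()

Running `python download_model.py` once (locally or in a Dockerfile build step) would place the file where the app's existence check looks for it.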