import streamlit as st
from llama_cpp import Llama
import requests
import os
from tqdm import tqdm

direct_url = "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf"
model_path = "model/mistral-7b-v0.1.Q4_K_M.gguf"


def main():
    def download_file_with_progress(url: str, filename: str):
        """Download a file with a progress bar using requests."""
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Fail early rather than saving an HTTP error page as the model file
        total_size = int(response.headers.get('content-length', 0))

        with open(filename, 'wb') as file, tqdm(
            desc=f"Downloading {filename}",
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for data in response.iter_content(chunk_size=1024):
                size = file.write(data)
                progress_bar.update(size)

    # Download the model (cached so Streamlit reruns don't retrigger it)
    @st.cache_resource
    def download_model():
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        download_file_with_progress(direct_url, model_path)

    # Ensure the model is downloaded
    if not os.path.exists(model_path):
        st.info("Model file not found. Downloading...")
        download_model()

    if not os.path.exists(model_path):
        st.error(f"Model file {model_path} not found after download!")
        return

    # Load the model once and cache it: Streamlit reruns this script on every
    # interaction, and reloading a multi-GB GGUF each time is wasteful.
    @st.cache_resource
    def load_model() -> Llama:
        return Llama(
            model_path=model_path,
            n_ctx=4096,
            n_gpu_layers=0,  # CPU only
            verbose=False,
        )

    llm = load_model()

    def process_query(query: str) -> str:
        MAX_ATTEMPTS = 5
        for attempt in range(MAX_ATTEMPTS):
            try:
                response = llm(
                    query,
                    max_tokens=1024,
                    temperature=0.4,
                    top_p=0.95,
                    echo=False,
                    stop=["Question:", "\n\n"],
                )
                answer = response['choices'][0]['text'].strip()

                # Retry if the response is empty or too short
                if not answer or len(answer) < 2:
                    print(f"Got empty or too short response: '{answer}'. Retrying...")
                    continue
                return answer
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {str(e)}")
                continue

        return "I apologize, but after multiple attempts, I was unable to generate a satisfactory response. Please try rephrasing your question."

    # Streamlit UI
    st.title("LLama_cpp GGUF Model Inference")
    user_input = st.text_input("Enter your prompt:")

    if st.button("Generate"):
        if user_input:
            with st.spinner("Generating response..."):
                output = process_query(user_input)
            st.success("Response generated!")
            st.write(output)
        else:
            st.error("Please enter a prompt.")


if __name__ == "__main__":
    main()
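
# How to run (a minimal sketch; the filename app.py is illustrative, not from the source):
#   pip install streamlit llama-cpp-python requests tqdm
#   streamlit run app.py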