import streamlit as st
from llama_cpp import Llama
import os
def main():
    # Direct download URL for the GGUF weights (kept for reference; the app
    # itself expects the file to already be present at model_path)
    direct_url = "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf"
    model_path = "model/mistral-7b-v0.1.Q4_K_M.gguf"

    # Check that the model file exists before trying to load it
    if not os.path.exists(model_path):
        st.error(f"Model file {model_path} not found! Please ensure the model is included in the Docker image.")
        return
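    # Not part of the original app: a minimal sketch of a download fallback
    # using only the standard library (assumes network access and roughly
    # 4 GB of free disk space). If enabled, it would replace the error/return
    # above by fetching the weights from direct_url when they are missing:
    #
    #     if not os.path.exists(model_path):
    #         from urllib.request import urlretrieve
    #         os.makedirs(os.path.dirname(model_path), exist_ok=True)
    #         with st.spinner("Downloading model weights..."):
    #             urlretrieve(direct_url, model_path)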
    # Load the model once; st.cache_resource keeps the Llama instance alive
    # across Streamlit reruns instead of reloading the weights on every click
    @st.cache_resource
    def load_model():
        return Llama(
            model_path=model_path,
            n_ctx=4096,
            n_gpu_layers=0,  # CPU only
            verbose=False,
        )

    llm = load_model()

    def process_query(query: str) -> str:
        MAX_ATTEMPTS = 5
        for attempt in range(MAX_ATTEMPTS):
            try:
                response = llm(
                    query,
                    max_tokens=1024,
                    temperature=0.4,
                    top_p=0.95,
                    echo=False,
                    stop=["Question:", "\n\n"]  # stop at the next question or blank line
                )
                answer = response['choices'][0]['text'].strip()

                # Retry if the response is empty or too short
                if not answer or len(answer) < 2:
                    print(f"Got empty or too short response: '{answer}'. Retrying...")
                    continue
                return answer
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {str(e)}")
                continue
        return "I apologize, but after multiple attempts, I was unable to generate a satisfactory response. Please try rephrasing your question."

    # Streamlit UI
    st.title("llama_cpp GGUF Model Inference")
    user_input = st.text_input("Enter your prompt:")

    if st.button("Generate"):
        if user_input:
            with st.spinner("Generating response..."):
                output = process_query(user_input)
            st.success("Response generated!")
            st.write(output)
        else:
            st.error("Please enter a prompt.")
if __name__ == "__main__":
    main()
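# To run locally (assumed usage, not stated in the file):
#   pip install streamlit llama-cpp-python
#   streamlit run app.py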