Spaces: Runtime error
import os

import streamlit as st
from llama_cpp import Llama

def main():
    # Direct download URL for the GGUF weights (unused below; a fallback
    # download sketch follows the listing)
    direct_url = "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf"
    model_path = "model/mistral-7b-v0.1.Q4_K_M.gguf"

    # Check if the model file exists
    if not os.path.exists(model_path):
        st.error(f"Model file {model_path} not found! Please ensure the model is included in the Docker image.")
        return

    # Load the model once; st.cache_resource keeps the Llama instance alive
    # across Streamlit reruns instead of reloading the multi-gigabyte GGUF
    # file on every widget interaction
    @st.cache_resource
    def load_model():
        return Llama(
            model_path=model_path,
            n_ctx=4096,
            n_gpu_layers=0,  # CPU only
            verbose=False,
        )

    llm = load_model()

    def process_query(query: str) -> str:
        MAX_ATTEMPTS = 5
        for attempt in range(MAX_ATTEMPTS):
            try:
                response = llm(
                    query,
                    max_tokens=1024,
                    temperature=0.4,
                    top_p=0.95,
                    echo=False,
                    stop=["Question:", "\n\n"],
                )
                answer = response["choices"][0]["text"].strip()
                # Retry if the response is empty or too short
                if not answer or len(answer) < 2:
                    print(f"Got empty or too short response: '{answer}'. Retrying...")
                    continue
                return answer
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {e}")
                continue
        return "I apologize, but after multiple attempts, I was unable to generate a satisfactory response. Please try rephrasing your question."

    # Streamlit UI
    st.title("llama-cpp GGUF Model Inference")
    user_input = st.text_input("Enter your prompt:")

    if st.button("Generate"):
        if user_input:
            with st.spinner("Generating response..."):
                output = process_query(user_input)
            st.success("Response generated!")
            st.write(output)
        else:
            st.error("Please enter a prompt.")


if __name__ == "__main__":
    main()
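
Since direct_url is never used above, one common fix for the missing-model runtime error is to fetch the GGUF file at startup instead of baking it into the Docker image. A minimal sketch, assuming huggingface_hub is installed and the Space has outbound network access; the repo_id and filename below simply mirror direct_url, and ensure_model is a hypothetical helper, not part of the app above:

import os

from huggingface_hub import hf_hub_download


MODEL_DIR = "model"
MODEL_FILE = "mistral-7b-v0.1.Q4_K_M.gguf"


def ensure_model() -> str:
    """Download the GGUF file once if it is not already present."""
    path = os.path.join(MODEL_DIR, MODEL_FILE)
    if not os.path.exists(path):
        # Resolves to the same file as direct_url, with caching and
        # resumable downloads handled by huggingface_hub
        hf_hub_download(
            repo_id="TheBloke/Mistral-7B-v0.1-GGUF",
            filename=MODEL_FILE,
            local_dir=MODEL_DIR,
        )
    return path

Calling ensure_model() before load_model(), and pointing model_path at its return value, would replace the hard st.error exit with a one-time download.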
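
Separately, process_query blocks until the whole completion is ready; llama-cpp-python also supports token streaming, which keeps the UI responsive during long CPU generations. A hedged sketch of a streaming variant, assuming a recent Streamlit that provides st.write_stream; stream_query is a hypothetical helper, not part of the app above:

def stream_query(llm, query: str):
    # stream=True makes llama-cpp-python yield partial completion chunks
    for chunk in llm(
        query,
        max_tokens=1024,
        temperature=0.4,
        top_p=0.95,
        stream=True,
        stop=["Question:", "\n\n"],
    ):
        yield chunk["choices"][0]["text"]

# Inside the Streamlit UI, instead of st.write(output):
# st.write_stream(stream_query(llm, user_input))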