import streamlit as st
from llama_cpp import Llama
import requests
import os
from tqdm import tqdm

direct_url = "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_K_M.gguf"
model_path = "model/mistral-7b-v0.1.Q4_K_M.gguf"


def main():
    def download_file_with_progress(url: str, filename: str):
        """Download a file with a progress bar using requests."""
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Fail early rather than saving an HTTP error page as the model file
        total_size = int(response.headers.get('content-length', 0))

        with open(filename, 'wb') as file, tqdm(
            desc=f"Downloading {filename}",
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as progress_bar:
            for data in response.iter_content(chunk_size=1024):
                size = file.write(data)
                progress_bar.update(size)

    # Download the model (cached so Streamlit reruns don't retrigger it)
    @st.cache_resource
    def download_model():
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        download_file_with_progress(direct_url, model_path)

    # Ensure the model is downloaded
    if not os.path.exists(model_path):
        st.info("Model file not found. Downloading...")
        download_model()

    if not os.path.exists(model_path):
        st.error(f"Model file {model_path} not found after download!")
        return

    # Load the model once and cache it: Streamlit reruns this script on every
    # interaction, and reloading a multi-GB GGUF each time is wasteful.
    @st.cache_resource
    def load_model() -> Llama:
        return Llama(
            model_path=model_path,
            n_ctx=4096,
            n_gpu_layers=0,  # CPU only
            verbose=False,
        )

    llm = load_model()

    def process_query(query: str) -> str:
        MAX_ATTEMPTS = 5
        for attempt in range(MAX_ATTEMPTS):
            try:
                response = llm(
                    query,
                    max_tokens=1024,
                    temperature=0.4,
                    top_p=0.95,
                    echo=False,
                    stop=["Question:", "\n\n"],
                )
                answer = response['choices'][0]['text'].strip()

                # Retry if the response is empty or too short
                if not answer or len(answer) < 2:
                    print(f"Got empty or too short response: '{answer}'. Retrying...")
                    continue
                return answer
            except Exception as e:
                print(f"Error on attempt {attempt + 1}: {str(e)}")
                continue

        return "I apologize, but after multiple attempts, I was unable to generate a satisfactory response. Please try rephrasing your question."

    # Streamlit UI
    st.title("LLama_cpp GGUF Model Inference")
    user_input = st.text_input("Enter your prompt:")

    if st.button("Generate"):
        if user_input:
            with st.spinner("Generating response..."):
                output = process_query(user_input)
            st.success("Response generated!")
            st.write(output)
        else:
            st.error("Please enter a prompt.")


if __name__ == "__main__":
    main()
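
# How to run (a minimal sketch; the filename app.py is illustrative, not from the source):
#   pip install streamlit llama-cpp-python requests tqdm
#   streamlit run app.py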