import transformers  # provides transformers.pipeline used below
from transformers import AutoModelForCausalLM, AutoTokenizer
from llama_index.llm_predictor import LLMPredictor
from llama_index import (
    PromptHelper,
    StorageContext,
    ServiceContext,
    load_index_from_storage,
    SimpleDirectoryReader,
    GPTVectorStoreIndex,
)
from langchain.llms import HuggingFacePipeline
import torch
import gradio as gr
from ratelimit import limits, sleep_and_retry

# Configure device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def create_llm_pipeline():
    # Load the DeepSeek-R1 model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/DeepSeek-R1",
        trust_remote_code=True,
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")

    # Create the text-generation pipeline. Device placement is already handled
    # by device_map="auto" above, so no explicit device index is passed here
    # (passing both would raise an error in recent transformers versions).
    pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )

    return HuggingFacePipeline(pipeline=pipeline)


# Allow at most RATE_LIMIT calls per second to the decorated functions
RATE_LIMIT = 3


@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def create_service_context():
    # Constraint parameters
    max_input_size = 4096
    num_outputs = 2048  # maximum number of generated tokens
    chunk_size_limit = 600

    # Create prompt helper
    prompt_helper = PromptHelper(
        max_input_size,
        num_outputs,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=chunk_size_limit,
    )

    # Create LLM predictor backed by the DeepSeek pipeline
    llm = create_llm_pipeline()
    llm_predictor = LLMPredictor(llm=llm)

    # Create service context
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        prompt_helper=prompt_helper,
    )

    return service_context


@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def data_ingestion_indexing(directory_path):
    # Load documents
    documents = SimpleDirectoryReader(directory_path).load_data()

    # Create index
    index = GPTVectorStoreIndex.from_documents(
        documents,
        service_context=create_service_context(),
    )

    # Persist index (defaults to the ./storage directory)
    index.storage_context.persist()

    return index


def data_querying(input_text):
    # Load the stored index
    storage_context = StorageContext.from_defaults(persist_dir="./storage")
    index = load_index_from_storage(
        storage_context,
        service_context=create_service_context(),
    )

    # Query the index
    response = index.as_query_engine().query(input_text)

    return response.response


# Create Gradio interface
iface = gr.Interface(
    fn=data_querying,
    inputs=gr.components.Textbox(
        lines=20,
        label="Enter your question",
    ),
    outputs=gr.components.Textbox(
        lines=25,
        label="Response",
    ),
    title="Philosophy QA - Aristotle Complete Works (Using DeepSeek-R1)",
)

# Initialize the system
if __name__ == "__main__":
    # Build the initial index from the "books" directory
    index = data_ingestion_indexing("books")

    # Launch the interface
    iface.launch()
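

# Usage sketch (assumptions beyond the code above: the script is saved as app.py,
# the filename being arbitrary, and a legacy llama-index release that still ships
# ServiceContext / LLMPredictor / GPTVectorStoreIndex is installed, i.e. pre-0.10):
#
#   1. Place the source documents (e.g. the Aristotle texts) in a local "books/" directory.
#   2. Run `python app.py`. The first run builds the vector index, persists it to
#      "./storage", and then launches the Gradio interface at its default local URL.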