import transformers  # provides transformers.pipeline used below
from transformers import AutoModelForCausalLM, AutoTokenizer
from llama_index.llm_predictor import LLMPredictor
from llama_index import (
    PromptHelper,
    StorageContext,
    ServiceContext,
    load_index_from_storage,
    SimpleDirectoryReader,
    GPTVectorStoreIndex,
)
from langchain.llms import HuggingFacePipeline
import torch
import gradio as gr
from ratelimit import limits, sleep_and_retry

# Configure device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def create_llm_pipeline():
    # Load the DeepSeek-R1 model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        "deepseek-ai/DeepSeek-R1",
        trust_remote_code=True,
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1")

    # Create the text-generation pipeline. Device placement is already handled
    # by device_map="auto" above, so no explicit device index is passed here
    # (passing both would raise an error in recent transformers versions).
    pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
    )

    return HuggingFacePipeline(pipeline=pipeline)


# Allow at most RATE_LIMIT calls per second to the decorated functions
RATE_LIMIT = 3


@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def create_service_context():
    # Constraint parameters
    max_input_size = 4096
    num_outputs = 2048  # maximum number of generated tokens
    chunk_size_limit = 600

    # Create prompt helper
    prompt_helper = PromptHelper(
        max_input_size,
        num_outputs,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=chunk_size_limit,
    )

    # Create LLM predictor backed by the DeepSeek pipeline
    llm = create_llm_pipeline()
    llm_predictor = LLMPredictor(llm=llm)

    # Create service context
    service_context = ServiceContext.from_defaults(
        llm_predictor=llm_predictor,
        prompt_helper=prompt_helper,
    )

    return service_context


@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
def data_ingestion_indexing(directory_path):
    # Load documents
    documents = SimpleDirectoryReader(directory_path).load_data()

    # Create index
    index = GPTVectorStoreIndex.from_documents(
        documents,
        service_context=create_service_context(),
    )

    # Persist index (defaults to the ./storage directory)
    index.storage_context.persist()

    return index


def data_querying(input_text):
    # Load the stored index
    storage_context = StorageContext.from_defaults(persist_dir="./storage")
    index = load_index_from_storage(
        storage_context,
        service_context=create_service_context(),
    )

    # Query the index
    response = index.as_query_engine().query(input_text)

    return response.response


# Create Gradio interface
iface = gr.Interface(
    fn=data_querying,
    inputs=gr.components.Textbox(
        lines=20,
        label="Enter your question",
    ),
    outputs=gr.components.Textbox(
        lines=25,
        label="Response",
    ),
    title="Philosophy QA - Aristotle Complete Works (Using DeepSeek-R1)",
)

# Initialize the system
if __name__ == "__main__":
    # Build the initial index from the "books" directory
    index = data_ingestion_indexing("books")

    # Launch the interface
    iface.launch()
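

# Usage sketch (assumptions beyond the code above: the script is saved as app.py,
# the filename being arbitrary, and a legacy llama-index release that still ships
# ServiceContext / LLMPredictor / GPTVectorStoreIndex is installed, i.e. pre-0.10):
#
#   1. Place the source documents (e.g. the Aristotle texts) in a local "books/" directory.
#   2. Run `python app.py`. The first run builds the vector index, persists it to
#      "./storage", and then launches the Gradio interface at its default local URL.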