import json
import os
from datetime import datetime
from pathlib import Path
from threading import Thread

import gradio as gr
import spaces
import torch
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaInMemoryUpload
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# Removed deprecated BetterTransformer import

# Model configuration - updated to a faster, more accurate model
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"  # Much faster than 24B, excellent quality
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = 32768  # Qwen2.5 supports up to 128k context
EMOJI = "🌪️"
DESCRIPTION = f"Chat with my Digital Twin, powered by {MODEL_NAME}"

DEFAULT_SYSTEM_MESSAGE = (
    "You are a helpful assistant. First recognize the user request and then "
    "reply carefully with thinking."
)


def get_logs_directory():
    """
    Get the directory where chat logs should be stored.

    Creates a 'chat_logs' directory in the current working directory.
    """
    # Always use the current working directory for consistency
    base_dir = Path.cwd()

    # Create the logs directory if it does not exist
    logs_dir = base_dir / "chat_logs"
    logs_dir.mkdir(parents=True, exist_ok=True)

    print(f"Saving chat logs to: {logs_dir}")
    return logs_dir


def save_conversation_to_drive(message, response, history):
    """
    Save the conversation to a text file in Google Drive.
    """
    try:
        google_creds_json = os.getenv("GOOGLE_CREDS_JSON", "")
        if not google_creds_json:
            print("Warning: No Google credentials found. Cannot save conversation.")
            return

        creds_info = json.loads(google_creds_json)
        creds = Credentials.from_service_account_info(
            creds_info,
            scopes=["https://www.googleapis.com/auth/drive.file"],
        )

        # Create the Drive API service
        drive_service = build("drive", "v3", credentials=creds)

        # Prepare the conversation content
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        conversation_log = {
            "timestamp": datetime.now().isoformat(),
            "history": history,
            "last_message": message,
            "last_response": response,
        }

        # Convert to formatted text content
        text_content = f"Conversation Log - {timestamp}\n\n"
        text_content += "Full Conversation History:\n"
        for i, (user, assistant) in enumerate(history):
            text_content += f"\nUser {i + 1}: {user}\n"
            text_content += f"Assistant: {assistant}\n"
        text_content += f"\nLast Message: {message}\n"
        text_content += f"Final Response: {response}\n"

        # Create the file metadata; only include 'parents' when a folder is configured
        folder_id = os.getenv("GOOGLE_DRIVE_FOLDER_ID", "")  # You'll need to set this
        file_metadata = {
            "name": f"chat_log_{timestamp}.txt",
            "mimeType": "text/plain",
        }
        if folder_id:
            file_metadata["parents"] = [folder_id]

        # Create the file in Drive
        file = drive_service.files().create(
            body=file_metadata,
            media_body=MediaInMemoryUpload(
                text_content.encode("utf-8"),
                mimetype="text/plain",
                resumable=True,
            ),
        ).execute()

        print(f"Conversation saved to Google Drive with file ID: {file.get('id')}")

    except Exception as e:
        print(f"Error saving conversation to Google Drive: {e}")


def load_system_message():
    """
    Load the system prompt text from a private Google Doc.
    """
    doc_id = os.getenv("GOOGLE_DOC_ID", "")
    if not doc_id:
        print("Warning: No GOOGLE_DOC_ID found. Using default system message.")
        return DEFAULT_SYSTEM_MESSAGE

    google_creds_json = os.getenv("GOOGLE_CREDS_JSON", "")
    if not google_creds_json:
        print("Warning: No GOOGLE_CREDS_JSON in environment. Using default message.")
        return DEFAULT_SYSTEM_MESSAGE
    try:
        creds_info = json.loads(google_creds_json)
        creds = Credentials.from_service_account_info(
            creds_info,
            scopes=["https://www.googleapis.com/auth/documents.readonly"],
        )
        service = build("docs", "v1", credentials=creds)
        doc = service.documents().get(documentId=doc_id).execute()

        # Concatenate all text runs from the document body
        paragraphs = []
        for element in doc.get("body", {}).get("content", []):
            paragraph_elements = element.get("paragraph", {}).get("elements", [])
            for run in paragraph_elements:
                text_run = run.get("textRun", {})
                if text_run.get("content"):
                    paragraphs.append(text_run["content"])

        system_message = "".join(paragraphs).strip()
        if not system_message:
            print("Warning: Doc is empty. Using default system message.")
            return DEFAULT_SYSTEM_MESSAGE

        return system_message

    except Exception as e:
        print(f"Error loading system message from Google Doc: {e}")
        return DEFAULT_SYSTEM_MESSAGE


SYSTEM_MESSAGE = load_system_message()


@spaces.GPU()
def predict(message, history):
    # Build the message list from the chat history and apply Qwen's chat template
    messages = [{"role": "system", "content": SYSTEM_MESSAGE}]
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    enc = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids, attention_mask = enc.input_ids, enc.attention_mask

    # Keep only the most recent tokens if the prompt exceeds the context window
    if input_ids.shape[1] > CONTEXT_LENGTH:
        input_ids = input_ids[:, -CONTEXT_LENGTH:]
        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

    # Optimized generation parameters for Qwen2.5
    generate_kwargs = dict(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        streamer=streamer,
        do_sample=True,
        temperature=0.5,          # Slightly higher for more natural responses
        max_new_tokens=800,       # Increased for more detailed responses
        top_k=40,                 # Optimized for Qwen
        repetition_penalty=1.05,  # Lower penalty for Qwen
        top_p=0.8,                # Optimized for Qwen
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Run generation in a background thread and stream tokens as they arrive
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    try:
        for new_token in streamer:
            outputs.append(new_token)
            current_response = "".join(outputs)
            yield current_response

        # Save the complete conversation after generation is done
        final_response = "".join(outputs)
        save_conversation_to_drive(message, final_response, history)
    except Exception as e:
        print(f"Error in predict: {e}")
        yield "An error occurred while generating the response."
# Load the model with optimized settings for Qwen2.5-7B
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # Fixed parameter name
    bnb_4bit_quant_type="nf4",
)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the model with optimizations
try:
    print("Loading model with optimizations...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
    )

    # Removed deprecated BetterTransformer optimization.
    # Apply torch.compile instead (more modern and still supported).
    print("Applying torch compile optimization...")
    model = torch.compile(model, mode="reduce-overhead")
    print("Model loading and optimization complete!")
except Exception as e:
    print(f"Warning: Could not apply all optimizations: {e}")
    # Fall back to basic model loading if the optimizations fail
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
    )

# Custom CSS
CSS = """
#title {
    text-align: center !important;
}
#disclaimer-container {
    display: flex !important;
    justify-content: center !important;
    width: 100% !important;
    margin: 0 auto !important;
}
#disclaimer {
    text-align: center !important;
    color: rgba(153, 153, 153, 0.5) !important;
    font-size: 0.7em !important;
    margin: 20px auto !important;
    padding-bottom: 20px !important;
    max-width: 600px !important;
    opacity: 0.4 !important;
    line-height: 1.4 !important;
    font-weight: 250 !important;
    font-style: italic !important;
}
"""

# Create the Gradio interface
with gr.Blocks(css=CSS) as demo:
    gr.Markdown(
        """
        # Chat with my Digital Twin!
        """,
        elem_id="title",
    )

    chat = gr.ChatInterface(
        fn=predict,
        chatbot=gr.Chatbot(height=400),
        examples=[
            ["Tell me the story of your life, the choices you have made and why you made them."],
            ["What are some of your favorite books or ideas?"],
            ["What are some significant technical projects you have led during your career?"],
            ["What mental models have you developed and found useful?"],
            ["When have you applied your mental models?"],
            ["What are your thoughts on proprietary vs open source projects?"],
        ],
        fill_height=True,
        theme="Nymbo/Alyx_Theme",
        title=None,
        description=None,
    )

    with gr.Row(elem_id="disclaimer-container"):
        gr.Markdown(
            f"""
            *Powered by {MODEL_NAME}. Output may not always reflect my beliefs or be
            completely accurate. Additionally, my viewpoints may change over time.*
            """,
            elem_id="disclaimer",
        )

if __name__ == "__main__":
    demo.queue().launch()