import json
import os
from datetime import datetime
from pathlib import Path
from threading import Thread

import gradio as gr
import spaces
import torch
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaInMemoryUpload
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)

# Removed deprecated BetterTransformer import

# Model configuration - updated to a faster, more accurate model
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"  # Much faster than 24B, excellent quality
MODEL_NAME = MODEL_ID.split("/")[-1]
CONTEXT_LENGTH = 32768  # Qwen2.5 supports up to 128k context
EMOJI = "🌪️"
DESCRIPTION = f"Chat with my Digital Twin, powered by {MODEL_NAME}"

DEFAULT_SYSTEM_MESSAGE = (
    "You are a helpful assistant. First recognize the user request and then "
    "reply carefully with thinking."
)


def get_logs_directory():
    """
    Get the directory where chat logs should be stored.

    Creates a 'chat_logs' directory in the current working directory.
    """
    # Always use the current working directory for consistency
    base_dir = Path.cwd()

    # Create the logs directory if it does not exist
    logs_dir = base_dir / "chat_logs"
    logs_dir.mkdir(parents=True, exist_ok=True)

    print(f"Saving chat logs to: {logs_dir}")
    return logs_dir


def save_conversation_to_drive(message, response, history):
    """
    Save the conversation to a text file in Google Drive.
    """
    try:
        google_creds_json = os.getenv("GOOGLE_CREDS_JSON", "")
        if not google_creds_json:
            print("Warning: No Google credentials found. Cannot save conversation.")
            return

        creds_info = json.loads(google_creds_json)
        creds = Credentials.from_service_account_info(
            creds_info,
            scopes=["https://www.googleapis.com/auth/drive.file"],
        )

        # Create the Drive API service
        drive_service = build("drive", "v3", credentials=creds)

        # Prepare the conversation content
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        conversation_log = {
            "timestamp": datetime.now().isoformat(),
            "history": history,
            "last_message": message,
            "last_response": response,
        }

        # Convert to formatted text content
        text_content = f"Conversation Log - {timestamp}\n\n"
        text_content += "Full Conversation History:\n"
        for i, (user, assistant) in enumerate(history):
            text_content += f"\nUser {i + 1}: {user}\n"
            text_content += f"Assistant: {assistant}\n"
        text_content += f"\nLast Message: {message}\n"
        text_content += f"Final Response: {response}\n"

        # Create the file metadata; only include 'parents' when a folder is configured
        folder_id = os.getenv("GOOGLE_DRIVE_FOLDER_ID", "")  # You'll need to set this
        file_metadata = {
            "name": f"chat_log_{timestamp}.txt",
            "mimeType": "text/plain",
        }
        if folder_id:
            file_metadata["parents"] = [folder_id]

        # Create the file in Drive
        file = drive_service.files().create(
            body=file_metadata,
            media_body=MediaInMemoryUpload(
                text_content.encode("utf-8"),
                mimetype="text/plain",
                resumable=True,
            ),
        ).execute()

        print(f"Conversation saved to Google Drive with file ID: {file.get('id')}")

    except Exception as e:
        print(f"Error saving conversation to Google Drive: {e}")


def load_system_message():
    """
    Load the system prompt text from a private Google Doc.
    """
    doc_id = os.getenv("GOOGLE_DOC_ID", "")
    if not doc_id:
        print("Warning: No GOOGLE_DOC_ID found. Using default system message.")
        return DEFAULT_SYSTEM_MESSAGE

    google_creds_json = os.getenv("GOOGLE_CREDS_JSON", "")
    if not google_creds_json:
        print("Warning: No GOOGLE_CREDS_JSON in environment. Using default message.")
        return DEFAULT_SYSTEM_MESSAGE
    try:
        creds_info = json.loads(google_creds_json)
        creds = Credentials.from_service_account_info(
            creds_info,
            scopes=["https://www.googleapis.com/auth/documents.readonly"],
        )
        service = build("docs", "v1", credentials=creds)
        doc = service.documents().get(documentId=doc_id).execute()

        # Concatenate all text runs from the document body
        paragraphs = []
        for element in doc.get("body", {}).get("content", []):
            paragraph_elements = element.get("paragraph", {}).get("elements", [])
            for run in paragraph_elements:
                text_run = run.get("textRun", {})
                if text_run.get("content"):
                    paragraphs.append(text_run["content"])

        system_message = "".join(paragraphs).strip()
        if not system_message:
            print("Warning: Doc is empty. Using default system message.")
            return DEFAULT_SYSTEM_MESSAGE

        return system_message

    except Exception as e:
        print(f"Error loading system message from Google Doc: {e}")
        return DEFAULT_SYSTEM_MESSAGE


SYSTEM_MESSAGE = load_system_message()


@spaces.GPU()
def predict(message, history):
    # Build the message list from the chat history and apply Qwen's chat template
    messages = [{"role": "system", "content": SYSTEM_MESSAGE}]
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    enc = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids, attention_mask = enc.input_ids, enc.attention_mask

    # Keep only the most recent tokens if the prompt exceeds the context window
    if input_ids.shape[1] > CONTEXT_LENGTH:
        input_ids = input_ids[:, -CONTEXT_LENGTH:]
        attention_mask = attention_mask[:, -CONTEXT_LENGTH:]

    # Optimized generation parameters for Qwen2.5
    generate_kwargs = dict(
        input_ids=input_ids.to(device),
        attention_mask=attention_mask.to(device),
        streamer=streamer,
        do_sample=True,
        temperature=0.5,          # Slightly higher for more natural responses
        max_new_tokens=800,       # Increased for more detailed responses
        top_k=40,                 # Optimized for Qwen
        repetition_penalty=1.05,  # Lower penalty for Qwen
        top_p=0.8,                # Optimized for Qwen
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Run generation in a background thread and stream tokens as they arrive
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    try:
        for new_token in streamer:
            outputs.append(new_token)
            current_response = "".join(outputs)
            yield current_response

        # Save the complete conversation after generation is done
        final_response = "".join(outputs)
        save_conversation_to_drive(message, final_response, history)
    except Exception as e:
        print(f"Error in predict: {e}")
        yield "An error occurred while generating the response."
# Load the model with optimized settings for Qwen2.5-7B
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # Fixed parameter name
    bnb_4bit_quant_type="nf4",
)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Load the model with optimizations
try:
    print("Loading model with optimizations...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
    )

    # Removed deprecated BetterTransformer optimization.
    # Apply torch.compile instead (more modern and still supported).
    print("Applying torch compile optimization...")
    model = torch.compile(model, mode="reduce-overhead")
    print("Model loading and optimization complete!")
except Exception as e:
    print(f"Warning: Could not apply all optimizations: {e}")
    # Fall back to basic model loading if the optimizations fail
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=quantization_config,
        torch_dtype=torch.bfloat16,
    )

# Custom CSS
CSS = """
#title {
    text-align: center !important;
}
#disclaimer-container {
    display: flex !important;
    justify-content: center !important;
    width: 100% !important;
    margin: 0 auto !important;
}
#disclaimer {
    text-align: center !important;
    color: rgba(153, 153, 153, 0.5) !important;
    font-size: 0.7em !important;
    margin: 20px auto !important;
    padding-bottom: 20px !important;
    max-width: 600px !important;
    opacity: 0.4 !important;
    line-height: 1.4 !important;
    font-weight: 250 !important;
    font-style: italic !important;
}
"""

# Create the Gradio interface
with gr.Blocks(css=CSS) as demo:
    gr.Markdown(
        """
        # Chat with my Digital Twin!
        """,
        elem_id="title",
    )

    chat = gr.ChatInterface(
        fn=predict,
        chatbot=gr.Chatbot(height=400),
        examples=[
            ["Tell me the story of your life, the choices you have made and why you made them."],
            ["What are some of your favorite books or ideas?"],
            ["What are some significant technical projects you have led during your career?"],
            ["What mental models have you developed and found useful?"],
            ["When have you applied your mental models?"],
            ["What are your thoughts on proprietary vs open source projects?"],
        ],
        fill_height=True,
        theme="Nymbo/Alyx_Theme",
        title=None,
        description=None,
    )

    with gr.Row(elem_id="disclaimer-container"):
        gr.Markdown(
            f"""
            *Powered by {MODEL_NAME}. Output may not always reflect my beliefs or be
            completely accurate. Additionally, my viewpoints may change over time.*
            """,
            elem_id="disclaimer",
        )

if __name__ == "__main__":
    demo.queue().launch()