boning123 committed
Commit f040238 · verified
1 Parent(s): da33fb2

Update app.py

Files changed (1)
  1. app.py +99 -120
app.py CHANGED
@@ -8,7 +8,7 @@ from transformers import (
  import torch
  import gradio as gr
  import re
- from duckduckgo_search import DDGS
+ from duckduckgo_search import DDGS # Import DuckDuckGo Search

  # Dictionary of available models
  AVAILABLE_MODELS = {
@@ -24,6 +24,7 @@ tokenizer = None
  model = None

  # Initialize tokenizer and model globally for the first run
+ # This ensures they are loaded when the script starts
  print(f"Initializing model: {DEFAULT_MODEL}...")
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
  model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype="auto", device_map="auto")
@@ -99,18 +100,13 @@ def load_model(selected_model_name):
      return f"✅ Loaded model: {selected_model_name}"

  def respond(message, chat_history, use_reasoning):
-     # Start by appending the user's message and an empty string for the bot's response
-     # This creates the new chat bubble for the user's input.
-     chat_history.append([message, ""])
-     yield chat_history, chat_history # Yield immediately to show the user's message
-
      # Gradio's chat_history is a list of [user_message, bot_message] pairs.
      # We need to convert it to the format expected by the model's chat template.
      messages_for_template = [{"role": "system", "content": SYSTEM_PROMPT}]
-     for user_msg, bot_msg in chat_history[:-1]: # Exclude the current, incomplete turn
-         if user_msg:
+     for user_msg, bot_msg in chat_history:
+         if user_msg: # Only add if not empty
              messages_for_template.append({"role": "user", "content": user_msg})
-         if bot_msg:
+         if bot_msg: # Only add if not empty
              messages_for_template.append({"role": "assistant", "content": bot_msg})

      # Add the current user message with the appropriate prefix
@@ -122,10 +118,17 @@ def respond(message, chat_history, use_reasoning):
      inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

      # --- First Generation Pass (for potential search query) ---
-     generated_text_buffer = ""
+     full_response_parts = []
+     current_chat_history_for_yield = list(chat_history) # Create a copy for yielding
+
+     # Ensure streamer is correctly set up for this generation
+     # For Gradio streaming, we need to manually yield tokens.
+     # TextStreamer is for console output; here we'll collect and yield.
      with torch.no_grad():
-         # Iterate over tokens from the model
-         for output_ids in model.generate(
+         # Use an iterable for token-by-token generation for Gradio
+         # This is a common pattern for streaming outputs in Gradio
+         # We will collect the full response to check for search tags.
+         generated_ids = model.generate(
              input_ids=inputs["input_ids"],
              attention_mask=inputs.get("attention_mask"),
              max_new_tokens=gen_kwargs["max_new_tokens"],
@@ -137,67 +140,57 @@ def respond(message, chat_history, use_reasoning):
              eos_token_id=gen_kwargs["eos_token_id"],
              pad_token_id=gen_kwargs["pad_token_id"],
              stopping_criteria=stop_criteria,
-             return_dict_in_generate=True,
-             output_scores=True,
-             # We don't use streamer directly here for Gradio output,
-             # but for internal token generation and decoding.
-         ).sequences.tolist():
-             # Decode the newly generated token
-             new_token_text = tokenizer.decode(output_ids[len(inputs["input_ids"][0]):], skip_special_tokens=True)
-             generated_text_buffer += new_token_text
-
-             # Update the last assistant message in chat_history
-             chat_history[-1][1] = generated_text_buffer
-             yield chat_history, chat_history # Yield to update the Gradio UI
-
-             # Check for early stop condition if the full response isn't needed yet
-             if "<search>" in generated_text_buffer and "</search>" in generated_text_buffer:
-                 break # Stop generating if a full search tag is found
-
-     # After the first pass, check for search query in the full generated buffer
-     model_response_content_first_pass = generated_text_buffer.strip()
-
-     # Try to extract content after 'content:' if in thinking mode
-     if use_reasoning:
-         try:
-             thinking_content_start_index = model_response_content_first_pass.find("thinking content:")
-             if thinking_content_start_index != -1:
-                 content_start_index = model_response_content_first_pass.rindex("content:", thinking_content_start_index)
-                 response_to_process = model_response_content_first_pass[content_start_index + len("content:"):].strip()
-             else:
-                 response_to_process = model_response_content_first_pass
-         except ValueError:
-             response_to_process = model_response_content_first_pass
-     else:
-         response_to_process = model_response_content_first_pass
-
-     search_match = re.search(r'<search>(.*?)</search>', response_to_process, re.DOTALL)
-
-     if search_match:
-         search_query = search_match.group(1).strip()
+             streamer=TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) # For console debugging
+         )

-         # Update the current bubble to show the search action
-         chat_history[-1][1] = f"SAM: Detecting search query: '{search_query}'...\nSearching the web..."
-         yield chat_history, chat_history
+     # Decode the full generated output
+     full_generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)

-         search_results = search_duckduckgo(search_query)
+     # The actual response from the model (excluding the prompt)
+     model_response_content = full_generated_text[len(prompt):].strip()
+
+     # Try to extract content after 'content:' if in thinking mode
+     if use_reasoning:
+         try:
+             # Find the last occurrence of "content:" after "thinking content:"
+             thinking_content_start_index = model_response_content.find("thinking content:")
+             if thinking_content_start_index != -1:
+                 content_start_index = model_response_content.rindex("content:", thinking_content_start_index)
+                 response_to_display = model_response_content[content_start_index + len("content:"):].strip()
+             else:
+                 response_to_display = model_response_content
+         except ValueError:
+             response_to_display = model_response_content
+     else:
+         response_to_display = model_response_content

-         # Now, prepare for the second generation pass with search results
-         # Append model's response (with search tag) and tool response to history for the model's context
-         messages_for_template.append({"role": "assistant", "content": response_to_process}) # Model's initial response with search tag
-         messages_for_template.append({"role": "tool_response", "content": f"Search results for \"{search_query}\": {search_results}"})
+     # Check for search query in the model's response
+     search_match = re.search(r'<search>(.*?)</search>', response_to_display, re.DOTALL)

-         prompt_with_search_results = tokenizer.apply_chat_template(messages_for_template, tokenize=False, add_generation_prompt=True)
-         inputs_with_search = tokenizer([prompt_with_search_results], return_tensors="pt").to(model.device)
+     if search_match:
+         search_query = search_match.group(1).strip()
+
+         # Update history with the model's initial response containing the search query
+         # For display purposes in Gradio, we'll show the search request, then results, then final answer.
+         current_chat_history_for_yield.append([message, f"SAM: Detecting search query: '{search_query}'...\nSearching the web..."])
+         yield current_chat_history_for_yield, current_chat_history_for_yield # Yield interim state
+
+         search_results = search_duckduckgo(search_query)
+
+         # Now, prepare for the second generation pass with search results
+         # Append model's response (with search tag) and tool response to history
+         messages_for_template.append({"role": "assistant", "content": response_to_display}) # Model's initial response with search tag
+         messages_for_template.append({"role": "tool_response", "content": f"Search results for \"{search_query}\": {search_results}"})
+
+         prompt_with_search_results = tokenizer.apply_chat_template(messages_for_template, tokenize=False, add_generation_prompt=True)
+         inputs_with_search = tokenizer([prompt_with_search_results], return_tensors="pt").to(model.device)

-         # Update the current bubble again to indicate thinking with results
-         chat_history[-1][1] = f"SAM: Search results for \"{search_query}\": {search_results}\nSAM: Thinking with results..."
-         yield chat_history, chat_history
+         current_chat_history_for_yield[-1][1] += f"\nSearch results for \"{search_query}\": {search_results}\nSAM: Thinking with results..."
+         yield current_chat_history_for_yield, current_chat_history_for_yield # Yield interim state

-         # --- Second Generation Pass (with search results) ---
-         final_generated_text_buffer = ""
-         with torch.no_grad():
-             for output_ids_search in model.generate(
+         # --- Second Generation Pass (with search results) ---
+         final_response_parts = []
+         final_generated_ids = model.generate(
              input_ids=inputs_with_search["input_ids"],
              attention_mask=inputs_with_search.get("attention_mask"),
              max_new_tokens=gen_kwargs["max_new_tokens"],
@@ -209,64 +202,50 @@ def respond(message, chat_history, use_reasoning):
              eos_token_id=gen_kwargs["eos_token_id"],
              pad_token_id=gen_kwargs["pad_token_id"],
              stopping_criteria=stop_criteria,
-             return_dict_in_generate=True,
-             output_scores=True,
-             ).sequences.tolist():
-                 new_token_text_search = tokenizer.decode(output_ids_search[len(inputs_with_search["input_ids"][0]):], skip_special_tokens=True)
-                 final_generated_text_buffer += new_token_text_search
-
-                 # Update the *same* last message in chat_history
-                 chat_history[-1][1] = f"SAM: Search results for \"{search_query}\": {search_results}\nSAM: Thinking with results...\n" + final_generated_text_buffer
-                 yield chat_history, chat_history
-
-         final_model_response_content = final_generated_text_buffer.strip()
-
-         # Extract content after 'content:' for the final response
-         if use_reasoning:
-             try:
-                 thinking_content_start_index = final_model_response_content.find("thinking content:")
-                 if thinking_content_start_index != -1:
-                     content_start_index = final_model_response_content.rindex("content:", thinking_content_start_index)
-                     final_response_to_display = final_model_response_content[content_start_index + len("content:"):].strip()
-                 else:
+             streamer=TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) # For console debugging
+         )
+
+         final_full_generated_text = tokenizer.decode(final_generated_ids[0], skip_special_tokens=False)
+         final_model_response_content = final_full_generated_text[len(prompt_with_search_results):].strip()
+
+         # Extract content after 'content:' for the final response
+         if use_reasoning:
+             try:
+                 thinking_content_start_index = final_model_response_content.find("thinking content:")
+                 if thinking_content_start_index != -1:
+                     content_start_index = final_model_response_content.rindex("content:", thinking_content_start_index)
+                     final_response_to_display = final_model_response_content[content_start_index + len("content:"):].strip()
+                 else:
+                     final_response_to_display = final_model_response_content
+             except ValueError:
                  final_response_to_display = final_model_response_content
-             except ValueError:
+         else:
              final_response_to_display = final_model_response_content
-         else:
-             final_response_to_display = final_model_response_content
-
-         # Final update to the chat history for the completed response
-         chat_history[-1][1] = final_response_to_display
-         yield chat_history, chat_history
-
-     else: # No search query detected in the first pass
-         # The generated_text_buffer already holds the full response from the first pass
-         # This part handles the case where no search was needed.
-         # Ensure the last message in chat_history is the fully generated one.
-
-         # Extract content after 'content:' for the direct response
-         if use_reasoning:
-             try:
-                 thinking_content_start_index = model_response_content_first_pass.find("thinking content:")
-                 if thinking_content_start_index != -1:
-                     content_start_index = model_response_content_first_pass.rindex("content:", thinking_content_start_index)
-                     direct_response_to_display = model_response_content_first_pass[content_start_index + len("content:"):].strip()
-                 else:
-                     direct_response_to_display = model_response_content_first_pass
-             except ValueError:
-                 direct_response_to_display = model_response_content_first_pass
-         else:
-             direct_response_to_display = model_response_content_first_pass
-
-         # Ensure the last message in chat_history is the fully generated one.
-         chat_history[-1][1] = direct_response_to_display
-         yield chat_history, chat_history
+
+         # Yield token by token for the final response
+         for char in final_response_to_display:
+             final_response_parts.append(char)
+             # Update the last message in chat_history for streaming in Gradio
+             current_chat_history_for_yield[-1][1] = "SAM: Thinking with results...\n" + "".join(final_response_parts)
+             yield current_chat_history_for_yield, current_chat_history_for_yield
+
+         # After streaming, update the actual chat_history for the next turn
+         chat_history.append((message, final_response_to_display))
+
+     else: # No search query detected in the first pass
+         # Yield token by token for the direct response
+         for char in response_to_display:
+             full_response_parts.append(char)
+             # Update the last message in chat_history for streaming in Gradio
+             current_chat_history_for_yield.append([message, "".join(full_response_parts)])
+             yield current_chat_history_for_yield, current_chat_history_for_yield

-     # The final `yield` at the end of the function ensures the state is updated
-     # for the next turn in Gradio.
+         # After streaming, update the actual chat_history for the next turn
+         chat_history.append((message, response_to_display))
+
+     # Return the final chat history for the state
      return chat_history, chat_history

-
  with gr.Blocks() as demo:
      gr.Markdown("## 🤖 Sam - SmilyAI Assistant")
      gr.Markdown("Chat with **Sam**, an AI assistant built by [SmilyAI Labs](https://smily.ai). Toggle reasoning mode or choose a model below.")