Spaces:

onisj
/

jarvis_gaia_agent

Starting

App Files Files Community

onisj committed 13 days ago

Commit

c6951f4

1 Parent(s): 4701375

Rewrite app.py and search.py with multi-hop LLM refinement

Browse files

Files changed (6) hide show

app.py +310 -251
requirements.txt +6 -1
result.txt +0 -0
state.py +29 -4
test.py +7 -0
tools/search.py +99 -41

app.py CHANGED Viewed

@@ -14,6 +14,7 @@ from sentence_transformers import SentenceTransformer
 import gradio as gr
 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient
 from state import JARVISState
 from tools import (
     search_tool, multi_hop_search_tool, file_parser_tool, image_parser_tool,
@@ -33,27 +34,68 @@ load_dotenv()
 SPACE_ID = os.getenv("SPACE_ID", "onisj/jarvis_gaia_agent")
 GAIA_API_URL = "https://agents-course-unit4-scoring.hf.space"
 GAIA_FILE_URL = f"{GAIA_API_URL}/files/"
-HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 # Verify environment variables
 if not SPACE_ID:
     raise ValueError("SPACE_ID not set")
-if not HF_TOKEN:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN not set")
 logger.info(f"SPACE_ID: {SPACE_ID}")
-# Initialize models
-try:
-    llm = InferenceClient(
-        model="meta-llama/Meta-Llama-3-8B-Instruct",
-        token=HF_TOKEN,
-        timeout=30
-    )
-    logger.info("Hugging Face Inference LLM initialized")
-except Exception as e:
-    logger.error(f"Failed to initialize LLM: {e}")
-    llm = None
 try:
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
     logger.info("Sentence transformer initialized")
@@ -61,40 +103,41 @@ except Exception as e:
     logger.error(f"Failed to initialize embedder: {e}")
     embedder = None
-# --- Helper Functions ---
-async def test_gaia_api(task_id: str, file_type: str = "txt") -> tuple[bool, str | None]:
-    """Test if a file exists for the task ID."""
     try:
-        for ext in [file_type, "txt", "csv", "xlsx", "jpg", "pdf"]:
-            async with aiohttp.ClientSession() as session:
-                async with session.get(f"{GAIA_FILE_URL}{task_id}.{ext}", timeout=5) as resp:
-                    logger.info(f"GAIA API test for task {task_id} with .{ext}: HTTP {resp.status}")
-                    if resp.status == 200:
-                        file_path = f"temp_{task_id}.{ext}"
-                        with open(file_path, "wb") as f:
-                            f.write(await resp.read())
-                        return True, ext
-        logger.info(f"No file found for task {task_id}")
-        return False, None
     except Exception as e:
-        logger.warning(f"GAIA API test failed: {str(e)}")
-        return False, None
-# --- Node Functions ---
-async def parse_question(state: Dict[str, Any]) -> Dict[str, Any]:
-    """Parse the question to select appropriate tools."""
     try:
         question = state["question"]
         task_id = state["task_id"]
         tools_needed = ["search_tool"]
-        if llm:
             prompt = ChatPromptTemplate.from_messages([
                 SystemMessage(content="""Select tools from: ['search_tool', 'multi_hop_search_tool', 'file_parser_tool', 'image_parser_tool', 'calculator_tool', 'document_retriever_tool', 'duckduckgo_search_tool', 'weather_info_tool', 'hub_stats_tool', 'guest_info_retriever_tool'].
                 Return JSON list, e.g., ["search_tool", "file_parser_tool"].
                 Rules:
                 - Always include "search_tool" unless purely computational.
-                - Use "multi_hop_search_tool" for complex queries (over 20 words).
                 - Use "file_parser_tool" for data, tables, or Excel.
                 - Use "image_parser_tool" for images/videos.
                 - Use "calculator_tool" for math calculations.
@@ -107,15 +150,27 @@ async def parse_question(state: Dict[str, Any]) -> Dict[str, Any]:
                 HumanMessage(content=f"Query: {question}")
             ])
             try:
-                response = llm.chat_completion(
-                    messages=[
-                        {"role": "system", "content": prompt[0].content},
-                        {"role": "user", "content": prompt[1].content}
-                    ],
-                    max_tokens=512,
-                    temperature=0.7
-                )
-                tools_needed = json.loads(response["choices"][0]["message"]["content"].strip())
                 valid_tools = {
                     "search_tool", "multi_hop_search_tool", "file_parser_tool", "image_parser_tool",
                     "calculator_tool", "document_retriever_tool", "duckduckgo_search_tool",
@@ -123,165 +178,192 @@ async def parse_question(state: Dict[str, Any]) -> Dict[str, Any]:
                 }
                 tools_needed = [tool for tool in tools_needed if tool in valid_tools]
             except Exception as e:
-                logger.warning(f"Task {task_id} failed: JSON parse error: {e}")
-                tools_needed = ["search_tool"]
         # Keyword-based fallback
         question_lower = question.lower()
-        if any(word in question_lower for word in ["image", "video"]):
             tools_needed.append("image_parser_tool")
-        if any(word in question_lower for word in ["data", "table", "excel"]):
             tools_needed.append("file_parser_tool")
-        if any(word in question_lower for word in ["calculate", "math"]):
             tools_needed.append("calculator_tool")
-        if any(word in question_lower for word in ["document", "pdf"]):
             tools_needed.append("document_retriever_tool")
-        if any(word in question_lower for word in ["weather"]):
             tools_needed.append("weather_info_tool")
-        if any(word in question_lower for word in ["model", "huggingface"]):
             tools_needed.append("hub_stats_tool")
-        if any(word in question_lower for word in ["guest", "name", "relation"]):
             tools_needed.append("guest_info_retriever_tool")
-        if len(question.split()) > 20:
             tools_needed.append("multi_hop_search_tool")
-        file_available, file_ext = await test_gaia_api(task_id)
-        if file_available:
-            if "file_parser_tool" not in tools_needed and any(word in question_lower for word in ["data", "table", "excel"]):
-                tools_needed.append("file_parser_tool")
-            if "image_parser_tool" not in tools_needed and "image" in question_lower:
-                tools_needed.append("image_parser_tool")
-            if "document_retriever_tool" not in tools_needed and file_ext == "pdf":
-                tools_needed.append("document_retriever_tool")
-        else:
-            tools_needed = [tool for tool in tools_needed if tool not in ["file_parser_tool", "image_parser_tool", "document_retriever_tool"]]
-        state["tools_needed"] = list(set(tools_needed))  # Remove duplicates
         logger.info(f"Task {task_id}: Selected tools: {tools_needed}")
         return state
     except Exception as e:
         logger.error(f"Error parsing task {task_id}: {e}")
         state["tools_needed"] = ["search_tool"]
         return state
 async def tool_dispatcher(state: JARVISState) -> JARVISState:
-    """Dispatch selected tools to process the state."""
     try:
         updated_state = state.copy()
         file_type = "jpg" if "image" in state["question"].lower() else "txt"
-        if "menu" in state["question"].lower() or "report" in state["question"].lower():
             file_type = "pdf"
         elif "data" in state["question"].lower():
             file_type = "xlsx"
-        can_download, file_ext = await test_gaia_api(updated_state["task_id"], file_type)
         for tool in updated_state["tools_needed"]:
             try:
                 if tool == "search_tool":
-                    result = await search_tool.ainvoke({"query": updated_state["question"]})
-                    updated_state["web_results"].extend([r["content"] for r in result])
                 elif tool == "multi_hop_search_tool":
-                    result = await multi_hop_search_tool.ainvoke({"query": updated_state["question"], "steps": 3})
-                    updated_state["web_results"].extend([r["content"] for r in result])
-                    await asyncio.sleep(2)  # Rate limit
-                elif tool == "file_parser_tool" and can_download:
-                    result = await file_parser_tool.ainvoke({"task_id": updated_state["task_id"], "file_type": file_ext})
-                    updated_state["file_results"] = str(result)
-                elif tool == "image_parser_tool" and can_download:
-                    result = await image_parser_tool.ainvoke({
-                        "file_path": f"temp_{updated_state['task_id']}.{file_ext}",
-                        "task": "describe"
-                    })
-                    updated_state["image_results"] = str(result)
                 elif tool == "calculator_tool":
-                    result = await calculator_tool.ainvoke({"expression": updated_state.get("question", "")})
                     updated_state["calculation_results"] = str(result)
-                elif tool == "document_retriever_tool" and can_download:
-                    result = await document_retriever_tool.ainvoke({
-                        "task_id": updated_state["task_id"],
-                        "query": updated_state["question"],
-                        "file_type": file_ext
-                    })
-                    updated_state["document_results"] = str(result)
                 elif tool == "duckduckgo_search_tool":
-                    result = await duckduckgo_search_tool.run(updated_state["question"])
                     updated_state["web_results"].append(str(result))
                 elif tool == "weather_info_tool":
                     location = updated_state["question"].split("weather in ")[1].split()[0] if "weather in" in updated_state["question"].lower() else "Unknown"
-                    result = await weather_info_tool.ainvoke({"location": location})
                     updated_state["web_results"].append(str(result))
                 elif tool == "hub_stats_tool":
                     author = updated_state["question"].split("by ")[1].split()[0] if "by" in updated_state["question"].lower() else "Unknown"
-                    result = await hub_stats_tool.ainvoke({"author": author})
                     updated_state["web_results"].append(str(result))
                 elif tool == "guest_info_retriever_tool":
                     query = updated_state["question"].split("about ")[1] if "about" in updated_state["question"].lower() else updated_state["question"]
-                    result = await guest_info_retriever_tool.ainvoke({"query": query})
                     updated_state["web_results"].append(str(result))
             except Exception as e:
                 logger.warning(f"Error in tool {tool} for task {updated_state['task_id']}: {str(e)}")
-                updated_state[f"{tool}_results"] = f"Error: {str(e)}"
         logger.info(f"Task {updated_state['task_id']}: Tool results: {updated_state}")
         return updated_state
     except Exception as e:
         logger.error(f"Tool dispatch failed for task {state['task_id']}: {e}")
-        return state
 async def reasoning(state: JARVISState) -> Dict[str, Any]:
-    """Generate exact-match answer with specific formatting."""
     try:
-        if not llm:
-            return {"answer": "LLM unavailable"}
         prompt = ChatPromptTemplate.from_messages([
             SystemMessage(content="""Provide ONLY the exact answer (e.g., '90', 'HUE'). For USD, use two decimal places (e.g., '1234.00'). For lists, use comma-separated values (e.g., 'Smith, Lee'). For IOC codes, use three-letter codes (e.g., 'ARG'). No explanations or conversational text."""),
-            HumanMessage(content="""Question: {question}
 Web results: {web_results}
 File results: {file_results}
 Image results: {image_results}
 Calculation results: {calculation_results}
 Document results: {document_results}""")
         ])
-        response = llm.chat_completion(
-            messages=[
-                {"role": "system", "content": prompt[0].content},
-                {"role": "user", "content": prompt[1].content.format(
-                    question=state["question"],
-                    web_results="\n".join(state["web_results"]),
-                    file_results=state["file_results"],
-                    image_results=state["image_results"],
-                    calculation_results=state["calculation_results"],
-                    document_results=state["document_results"]
-                )}
-            ],
-            max_tokens=512,
-            temperature=0.7
-        )
-        answer = response["choices"][0]["message"]["content"].strip()
-        # Clean answer for specific formats
-        if "USD" in state["question"].lower():
             try:
-                answer = f"{float(answer):.2f}"
-            except ValueError:
-                pass
-        if "before and after" in state["question"].lower():
-            answer = answer.replace(" and ", ", ")
-        elif "IOC code" in state["question"].lower():
-            answer = answer.upper()[:3]
-        logger.info(f"Task {state['task_id']}: Answer: {answer}")
-        return {"answer": answer}
     except Exception as e:
         logger.error(f"Reasoning failed for task {state['task_id']}: {e}")
         return {"answer": f"Error: {str(e)}"}
 def router(state: JARVISState) -> str:
-    """Route based on tools needed."""
     if state["tools_needed"]:
         return "tool_dispatcher"
     return "reasoning"
-# --- Define StateGraph ---
 workflow = StateGraph(JARVISState)
 workflow.add_node("parse", parse_question)
 workflow.add_node("tool_dispatcher", tool_dispatcher)
@@ -299,33 +381,29 @@ workflow.add_edge("tool_dispatcher", "reasoning")
 workflow.add_edge("reasoning", END)
 graph = workflow.compile()
-# --- Basic Agent ---
-class BasicAgent:
     def __init__(self):
-        logger.info("BasicAgent initialized.")
     async def process_question(self, task_id: str, question: str) -> str:
-        """Process a single question with file handling."""
-        file_type = "jpg" if "image" in question.lower() else "txt"
-        if "menu" in question.lower() or "report" in question.lower():
-            file_type = "pdf"
-        elif "data" in question.lower():
-            file_type = "xlsx"
-        file_path = f"temp_{task_id}.{file_type}"
-        file_available, file_ext = await test_gaia_api(task_id, file_type)
-        if file_available:
-            try:
-                async with aiohttp.ClientSession() as session:
-                    async with session.get(f"{GAIA_FILE_URL}{task_id}.{file_ext}") as resp:
-                        if resp.status == 200:
-                            with open(file_path, "wb") as f:
-                                f.write(await resp.read())
-                        else:
-                            logger.warning(f"Failed to fetch file for {task_id}: HTTP {resp.status}")
-            except Exception as e:
-                logger.error(f"Error downloading file for task {task_id}: {str(e)}")
         state = JARVISState(
             task_id=task_id,
             question=question,
@@ -335,116 +413,98 @@ class BasicAgent:
             image_results="",
             calculation_results="",
             document_results="",
             messages=[HumanMessage(content=question)],
-            answer=""
         )
         try:
             result = await graph.ainvoke(state)
             answer = result["answer"] or "Unknown"
-            logger.info(f"Task {task_id}: Final answer generated: {answer}")
             return answer
         except Exception as e:
             logger.error(f"Error processing task {task_id}: {e}")
             return f"Error: {str(e)}"
         finally:
             for ext in ["txt", "csv", "xlsx", "jpg", "pdf"]:
-                file_path = f"temp_{task_id}.{ext}"
                 if os.path.exists(file_path):
                     try:
                         os.remove(file_path)
                     except Exception as e:
                         logger.error(f"Error removing file {file_path}: {e}")
-    async def async_call(self, question: str, task_id: str) -> str:
-        return await self.process_question(question, task_id)
-    def __call__(self, question: str, task_id: str = None) -> str:
-        logger.info(f"Processing question: {question[:50]}...")
-        if task_id is None:
-            task_id = "unknown_task_id"
-        try:
-            loop = asyncio.get_event_loop()
-        except RuntimeError:
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-        return loop.run_until_complete(self.async_call(question, task_id))
-# --- Evaluation and Submission ---
-def run_and_submit_all(profile: gr.OAuthProfile | None):
-    """Run evaluation and submit answers to GAIA API."""
-    if not profile:
-        logger.error("User not logged in.")
-        return "Please Login to Hugging Face.", None
-    username = f"{profile.username}"
-    logger.info(f"User logged in: {username}")
-    questions_url = f"{GAIA_API_URL}/questions"
-    submit_url = f"{GAIA_API_URL}/submit"
-    agent_code = f"https://huggingface.co/spaces/{SPACE_ID}/tree/main"
-    try:
-        agent = BasicAgent()
-    except Exception as e:
-        logger.error(f"Agent initialization failed: {e}")
-        return f"Error initializing agent: {e}", None
-    logger.info(f"Fetching questions from: {questions_url}")
-    try:
-        response = requests.get(questions_url, timeout=15)
-        response.raise_for_status()
-        questions_data = response.json()
-        if not questions_data:
-            logger.error("Empty questions list.")
-            return "No questions fetched.", None
-        logger.info(f"Fetched {len(questions_data)} questions.")
-    except Exception as e:
-        logger.error(f"Error fetching questions: {e}")
-        return f"Error fetching questions: {e}", None
-    results_log = []
-    answers_payload = []
-    logger.info(f"Processing {len(questions_data)} questions...")
-    for item in questions_data:
-        task_id = item.get("task_id")
-        question_text = item.get("question")
-        if not task_id or question_text is None:
-            logger.warning(f"Skipping invalid item: {item}")
-            continue
         try:
-            submitted_answer = agent(question_text, task_id)
-            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
-            logger.error(f"Error for task {task_id}: {e}")
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
-    if not answers_payload:
-        logger.error("No answers generated.")
-        return "No answers to submit.", pd.DataFrame(results_log)
-    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
-    logger.info(f"Submitting {len(answers_payload)} answers to: {submit_url}")
-    try:
-        response = requests.post(submit_url, json=submission_data, timeout=120)
-        response.raise_for_status()
-        result_data = response.json()
-        final_status = (
-            f"Submission Successful!\n"
-            f"User: {result_data.get('username')}\n"
-            f"Overall Score: {result_data.get('score', 'N/A')}% "
-            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
-            f"Message: {result_data.get('message', 'No message received.')}"
-        )
-        results_df = pd.DataFrame(results_log)
-        return final_status, results_df
-    except Exception as e:
-        logger.error(f"Submission failed: {e}")
-        results_df = pd.DataFrame(results_log)
-        return f"Submission Failed: {e}", results_df
-# --- Gradio Interface ---
 with gr.Blocks() as demo:
-    gr.Markdown("# Evolved JARVIS Agent Evaluation")
     gr.Markdown(
         """
         **Instructions:**
@@ -454,23 +514,22 @@ with gr.Blocks() as demo:
         ---
         **Disclaimers:**
-        Uses Hugging Face Inference, SERPAPI, and OpenWeatherMap for GAIA benchmark.
         """
     )
-    gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
-    results_table = gr.DataFrame(label="Questions and Answers", wrap=True)
     run_button.click(
-        fn=run_and_submit_all,
-        outputs=[status_output, results_table]
     )
-# --- Main ---
 if __name__ == "__main__":
     logger.info("\n" + "-"*30 + " App Starting " + "-"*30)
     logger.info(f"SPACE_ID: {SPACE_ID}")

 import gradio as gr
 from dotenv import load_dotenv
 from huggingface_hub import InferenceClient
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from state import JARVISState
 from tools import (
     search_tool, multi_hop_search_tool, file_parser_tool, image_parser_tool,
 SPACE_ID = os.getenv("SPACE_ID", "onisj/jarvis_gaia_agent")
 GAIA_API_URL = "https://agents-course-unit4-scoring.hf.space"
 GAIA_FILE_URL = f"{GAIA_API_URL}/files/"
+TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")
+HF_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
 # Verify environment variables
 if not SPACE_ID:
     raise ValueError("SPACE_ID not set")
+if not HF_API_TOKEN:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN not set")
+if not TOGETHER_API_KEY:
+    raise ValueError("TOGETHER_API_KEY not set")
 logger.info(f"SPACE_ID: {SPACE_ID}")
+# Model configuration
+TOGETHER_MODELS = [
+    "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+    "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+]
+HF_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
+# Initialize LLM clients
+def initialize_llm():
+    """Pick the first usable LLM backend, preferring hosted ones over local.
+
+    Backends are tried in this order: every model in TOGETHER_MODELS via the
+    Together AI endpoint, then HF_MODEL via the Hugging Face Inference API,
+    then HF_MODEL loaded locally with transformers. A hosted backend is only
+    accepted after a short probe completion succeeds.
+
+    Returns:
+        A ``(client, kind)`` pair where ``kind`` is ``"together"``,
+        ``"hf_api"`` or ``"hf_local"``. For ``"hf_local"`` the first element
+        is a ``(model, tokenizer)`` tuple rather than an ``InferenceClient``.
+
+    Raises:
+        RuntimeError: If none of the backends could be initialized.
+    """
+    probe_messages = [{"role": "user", "content": "Test"}]
+    # NOTE(review): some huggingface_hub releases reject `model` combined with
+    # `base_url` in the InferenceClient constructor -- confirm against the
+    # pinned version, otherwise every Together AI attempt fails right here.
+    for model_name in TOGETHER_MODELS:
+        try:
+            client = InferenceClient(
+                model=model_name,
+                api_key=TOGETHER_API_KEY,
+                base_url="https://api.together.ai/v1",
+                timeout=30
+            )
+            client.chat.completions.create(
+                model=model_name,
+                messages=probe_messages,
+                max_tokens=10,
+            )
+            logger.info(f"Initialized Together AI model: {model_name}")
+            return client, "together"
+        except Exception as e:
+            logger.warning(f"Failed to initialize {model_name}: {e}")
+    try:
+        client = InferenceClient(
+            model=HF_MODEL,
+            token=HF_API_TOKEN,
+            timeout=30
+        )
+        # Probe like the Together AI branch: without a real request a bad
+        # token or unavailable model would go unnoticed and the local
+        # fallback below would never be attempted.
+        client.chat.completions.create(
+            model=HF_MODEL,
+            messages=probe_messages,
+            max_tokens=10,
+        )
+        logger.info(f"Initialized Hugging Face Inference API model: {HF_MODEL}")
+        return client, "hf_api"
+    except Exception as e:
+        logger.warning(f"Failed to initialize HF Inference API: {e}")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(HF_MODEL, token=HF_API_TOKEN)
+        # NOTE(review): "mps" targets Apple-silicon GPUs only; on other hosts
+        # this load is expected to fail -- confirm the deployment target.
+        local_model = AutoModelForCausalLM.from_pretrained(HF_MODEL, token=HF_API_TOKEN, device_map="mps")
+        logger.info(f"Initialized local Hugging Face model: {HF_MODEL}")
+        return (local_model, tokenizer), "hf_local"
+    except Exception as e:
+        logger.error(f"Failed to initialize local HF model: {e}")
+        # Chain the cause so the startup traceback shows why the last
+        # backend failed, not just that nothing was available.
+        raise RuntimeError("No LLM could be initialized") from e
+llm_client, llm_type = initialize_llm()
+# Initialize embedder
 try:
     embedder = SentenceTransformer("all-MiniLM-L6-v2")
     logger.info("Sentence transformer initialized")
     logger.error(f"Failed to initialize embedder: {e}")
     embedder = None
+# Download file with local fallback
+async def download_file(task_id: str, ext: str) -> str | None:
+    """Fetch the GAIA attachment for a task, falling back to a local copy.
+
+    Args:
+        task_id: GAIA task identifier; used as both the remote and the local
+            file stem.
+        ext: File extension without the leading dot (e.g. ``"pdf"``).
+
+    Returns:
+        The path ``temp/<task_id>.<ext>`` if the download succeeded or a
+        file already exists at that path, otherwise ``None``.
+    """
+    local_path = f"temp/{task_id}.{ext}"
     try:
+        url = f"{GAIA_FILE_URL}{task_id}.{ext}"
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
+                logger.info(f"GAIA API test for task {task_id} with .{ext}: HTTP {resp.status}")
+                if resp.status == 200:
+                    # Read the whole body before opening the file so a failed
+                    # transfer cannot leave an empty file that the fallback
+                    # below would mistake for a valid local copy.
+                    payload = await resp.read()
+                    os.makedirs("temp", exist_ok=True)
+                    with open(local_path, "wb") as f:
+                        f.write(payload)
+                    return local_path
     except Exception as e:
+        logger.warning(f"File download failed for {task_id}.{ext}: {e}")
+    # Best-effort fallback: reuse a file left at the same path earlier.
+    if os.path.exists(local_path):
+        logger.info(f"Using local file: {local_path}")
+        return local_path
+    return None
+# Parse question to select tools
+async def parse_question(state: JARVISState) -> JARVISState:
     try:
         question = state["question"]
         task_id = state["task_id"]
         tools_needed = ["search_tool"]
+        if llm_client:
             prompt = ChatPromptTemplate.from_messages([
                 SystemMessage(content="""Select tools from: ['search_tool', 'multi_hop_search_tool', 'file_parser_tool', 'image_parser_tool', 'calculator_tool', 'document_retriever_tool', 'duckduckgo_search_tool', 'weather_info_tool', 'hub_stats_tool', 'guest_info_retriever_tool'].
                 Return JSON list, e.g., ["search_tool", "file_parser_tool"].
                 Rules:
                 - Always include "search_tool" unless purely computational.
+                - Use "multi_hop_search_tool" for complex queries (over 20 words or requiring multiple steps).
                 - Use "file_parser_tool" for data, tables, or Excel.
                 - Use "image_parser_tool" for images/videos.
                 - Use "calculator_tool" for math calculations.
                 HumanMessage(content=f"Query: {question}")
             ])
             try:
+                if llm_type == "hf_local":
+                    model, tokenizer = llm_client
+                    inputs = tokenizer.apply_chat_template(
+                        [{"role": "system", "content": prompt[0].content}, {"role": "user", "content": prompt[1].content}],
+                        return_tensors="pt"
+                    ).to("mps")
+                    outputs = model.generate(inputs, max_new_tokens=512, temperature=0.7)
+                    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+                    tools_needed = json.loads(response.strip())
+                else:
+                    response = llm_client.chat.completions.create(
+                        model=llm_client.model if llm_type == "together" else HF_MODEL,
+                        messages=[
+                            {"role": "system", "content": prompt[0].content},
+                            {"role": "user", "content": prompt[1].content}
+                        ],
+                        max_tokens=512,
+                        temperature=0.7
+                    )
+                    tools_needed = json.loads(response.choices[0].message.content.strip())
                 valid_tools = {
                     "search_tool", "multi_hop_search_tool", "file_parser_tool", "image_parser_tool",
                     "calculator_tool", "document_retriever_tool", "duckduckgo_search_tool",
                 }
                 tools_needed = [tool for tool in tools_needed if tool in valid_tools]
             except Exception as e:
+                logger.warning(f"Task {task_id} tool selection failed: {e}")
+                state["error"] = f"Tool selection failed: {str(e)}"
         # Keyword-based fallback
         question_lower = question.lower()
+        if any(word in question_lower for word in ["image", "video", "picture"]):
             tools_needed.append("image_parser_tool")
+        if any(word in question_lower for word in ["data", "table", "excel", ".txt", ".csv", ".xlsx"]):
             tools_needed.append("file_parser_tool")
+        if any(word in question_lower for word in ["calculate", "math", "sum", "average", "total"]):
             tools_needed.append("calculator_tool")
+        if any(word in question_lower for word in ["document", "pdf", "report", "menu"]):
             tools_needed.append("document_retriever_tool")
+        if any(word in question_lower for word in ["weather", "temperature"]):
             tools_needed.append("weather_info_tool")
+        if any(word in question_lower for word in ["model", "huggingface", "dataset"]):
             tools_needed.append("hub_stats_tool")
+        if any(word in question_lower for word in ["guest", "name", "relation", "person"]):
             tools_needed.append("guest_info_retriever_tool")
+        if len(question.split()) > 20 or "multiple" in question_lower:
             tools_needed.append("multi_hop_search_tool")
+        if any(word in question_lower for word in ["search", "wikipedia", "online"]):
+            tools_needed.append("duckduckgo_search_tool")
+        # Check file availability
+        for ext in ["txt", "csv", "xlsx", "jpg", "pdf"]:
+            file_path = await download_file(task_id, ext)
+            if file_path:
+                if ext in ["txt", "csv", "xlsx"] and "file_parser_tool" not in tools_needed:
+                    tools_needed.append("file_parser_tool")
+                if ext == "jpg" and "image_parser_tool" not in tools_needed:
+                    tools_needed.append("image_parser_tool")
+                if ext == "pdf" and "document_retriever_tool" not in tools_needed:
+                    tools_needed.append("document_retriever_tool")
+                state["metadata"] = state.get("metadata", {}) | {"file_ext": ext, "file_path": file_path}
+                break
+        state["tools_needed"] = list(set(tools_needed))
         logger.info(f"Task {task_id}: Selected tools: {tools_needed}")
         return state
     except Exception as e:
         logger.error(f"Error parsing task {task_id}: {e}")
+        state["error"] = f"Parse question failed: {str(e)}"
         state["tools_needed"] = ["search_tool"]
         return state
+# Tool dispatcher
 def _extract_after(question: str, keyword: str, first_word_only: bool = True) -> str | None:
     """Return the text that follows ``keyword`` in ``question`` (case-insensitive).

     Args:
         question: Full question text.
         keyword: Word or phrase to look for; matched on a word boundary so that
             e.g. "by" does not match inside "baby".
         first_word_only: When True, return only the first whitespace-delimited
             token after the keyword; otherwise return the rest of the question.

     Returns:
         The extracted text, or None when the keyword is absent or nothing follows it.
     """
     import re  # function-scope import keeps this block independent of the file header

     tail = r"(\S+)" if first_word_only else r"(.+)"
     match = re.search(
         rf"\b{re.escape(keyword)}\s+{tail}", question, flags=re.IGNORECASE | re.DOTALL
     )
     return match.group(1).strip() if match else None


 async def tool_dispatcher(state: JARVISState) -> JARVISState:
     """Run every tool listed in ``state["tools_needed"]`` and collect the outputs.

     Each tool runs inside its own try/except so that one failing tool does not
     prevent the remaining tools from running. Successes and failures are
     recorded under ``metadata`` as ``"<tool>_executed"`` / ``"<tool>_error"``.

     Args:
         state: Current graph state; ``task_id``, ``question`` and
             ``tools_needed`` are read from it.

     Returns:
         A copy of ``state`` with the result fields, ``metadata`` and (on
         failure) ``error`` filled in. The input state's lists are not mutated.
     """
     # Copy before the try block so the outer error handler can always use it.
     updated_state = state.copy()
     try:
         # state.copy() is shallow: rebuild the list fields so that appends below
         # never leak into the caller's state, and so a missing list is tolerated.
         updated_state["web_results"] = list(updated_state.get("web_results") or [])
         updated_state["multi_hop_results"] = list(updated_state.get("multi_hop_results") or [])
         question = updated_state["question"]
         task_id = updated_state["task_id"]
         for tool in updated_state["tools_needed"]:
             try:
                 if tool == "search_tool":
                     result = search_tool(question)
                     updated_state["web_results"].extend([str(r) for r in result])
                 elif tool == "multi_hop_search_tool":
                     result = await multi_hop_search_tool.ainvoke({"query": question, "steps": 3, "llm_client": llm_client, "llm_type": llm_type})
                     updated_state["multi_hop_results"].extend([r["content"] for r in result])
                     await asyncio.sleep(2)
                 elif tool == "file_parser_tool":
                     # Try each tabular/text extension and stop at the first file that downloads.
                     for ext in ["txt", "csv", "xlsx"]:
                         file_path = await download_file(task_id, ext)
                         if file_path:
                             result = file_parser_tool(file_path)
                             updated_state["file_results"] = str(result)
                             break
                 elif tool == "image_parser_tool":
                     file_path = await download_file(task_id, "jpg")
                     if file_path:
                         result = image_parser_tool(file_path)
                         updated_state["image_results"] = str(result)
                 elif tool == "calculator_tool":
                     result = calculator_tool(question)
                     updated_state["calculation_results"] = str(result)
                 elif tool == "document_retriever_tool":
                     file_path = await download_file(task_id, "pdf")
                     if file_path:
                         result = document_retriever_tool({"task_id": task_id, "query": question, "file_type": "pdf"})
                         updated_state["document_results"] = str(result)
                 elif tool == "duckduckgo_search_tool":
                     result = duckduckgo_search_tool(question)
                     updated_state["web_results"].append(str(result))
                 elif tool == "weather_info_tool":
                     # Case-insensitive extraction: "Weather in Paris" must work too.
                     location = _extract_after(question, "weather in") or "Unknown"
                     result = weather_info_tool({"location": location})
                     updated_state["web_results"].append(str(result))
                 elif tool == "hub_stats_tool":
                     author = _extract_after(question, "by") or "Unknown"
                     result = hub_stats_tool({"author": author})
                     updated_state["web_results"].append(str(result))
                 elif tool == "guest_info_retriever_tool":
                     query = _extract_after(question, "about", first_word_only=False) or question
                     result = guest_info_retriever_tool({"query": query})
                     updated_state["web_results"].append(str(result))
                 # metadata is Optional, so fall back to an empty dict before merging.
                 updated_state["metadata"] = (updated_state.get("metadata") or {}) | {f"{tool}_executed": True}
             except Exception as e:
                 logger.warning(f"Error in tool {tool} for task {task_id}: {str(e)}")
                 updated_state["error"] = f"Tool {tool} failed: {str(e)}"
                 updated_state["metadata"] = (updated_state.get("metadata") or {}) | {f"{tool}_error": str(e)}
         logger.info(f"Task {task_id}: Tool results: {updated_state}")
         return updated_state
     except Exception as e:
         logger.error(f"Tool dispatch failed for task {state.get('task_id')}: {e}")
         updated_state["error"] = f"Tool dispatch failed: {str(e)}"
         return updated_state
+# Reasoning
 def _format_answer(question: str, answer: str) -> str:
     """Apply the answer-format rules that are triggered by keywords in the question.

     Keywords are written in lower case because they are compared against the
     lower-cased question text.

     Args:
         question: Original question text.
         answer: Raw answer text produced by the LLM.

     Returns:
         The answer after the USD, "before and after" and IOC-code rules were applied.
     """
     lowered = question.lower()
     if "usd" in lowered:
         try:
             answer = f"{float(answer):.2f}"
         except ValueError:
             pass  # not a bare number; keep the model's text unchanged
     if "before and after" in lowered:
         answer = answer.replace(" and ", ", ")
     if "ioc code" in lowered:
         answer = answer.upper()[:3]
     return answer


 async def reasoning(state: JARVISState) -> Dict[str, Any]:
     """Ask the LLM for the final answer using all tool results gathered in ``state``.

     The call is attempted up to three times with a 2-second pause after each
     failure. ``llm_client`` and ``llm_type`` are names defined elsewhere in
     this module.

     Args:
         state: Graph state holding the question and the per-tool result fields.

     Returns:
         ``{"answer": <text>}``; on failure the text starts with ``"Error:"``
         and ``state["error"]`` is set.
     """
     try:
         prompt = ChatPromptTemplate.from_messages([
             SystemMessage(content="""Provide ONLY the exact answer (e.g., '90', 'HUE'). For USD, use two decimal places (e.g., '1234.00'). For lists, use comma-separated values (e.g., 'Smith, Lee'). For IOC codes, use three-letter codes (e.g., 'ARG'). No explanations or conversational text."""),
             HumanMessage(content="""Task: {task_id}
 Question: {question}
 Web results: {web_results}
 Multi-hop results: {multi_hop_results}
 File results: {file_results}
 Image results: {image_results}
 Calculation results: {calculation_results}
 Document results: {document_results}""")
         ])
         messages = [
             {"role": "system", "content": prompt[0].content},
             {"role": "user", "content": prompt[1].content.format(
                 task_id=state["task_id"],
                 question=state["question"],
                 web_results="\n".join(state["web_results"]),
                 multi_hop_results="\n".join(state["multi_hop_results"]),
                 file_results=state["file_results"],
                 image_results=state["image_results"],
                 calculation_results=state["calculation_results"],
                 document_results=state["document_results"]
             )}
         ]
         for attempt in range(3):
             try:
                 if llm_type == "hf_local":
                     model, tokenizer = llm_client
                     inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("mps")
                     outputs = model.generate(inputs, max_new_tokens=512, temperature=0.7)
                     # generate() returns prompt tokens followed by the completion;
                     # decode only the newly generated part so the prompt is not
                     # echoed back as the answer.
                     answer = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
                 else:
                     response = llm_client.chat.completions.create(
                         model=llm_client.model if llm_type == "together" else HF_MODEL,
                         messages=messages,
                         max_tokens=512,
                         temperature=0.7
                     )
                     answer = response.choices[0].message.content.strip()
                 answer = _format_answer(state["question"], answer)
                 logger.info(f"Task {state['task_id']}: Answer: {answer}")
                 return {"answer": answer}
             except Exception as e:
                 logger.warning(f"LLM retry {attempt + 1}/3 for task {state['task_id']}: {e}")
                 await asyncio.sleep(2)
         state["error"] = "LLM failed after retries"
         return {"answer": "Error: LLM failed after retries"}
     except Exception as e:
         logger.error(f"Reasoning failed for task {state['task_id']}: {e}")
         state["error"] = f"Reasoning failed: {str(e)}"
         return {"answer": f"Error: {str(e)}"}
+# Router
 def router(state: JARVISState) -> str:
     """Pick the next graph node: the dispatcher when tools are pending, else reasoning."""
     return "tool_dispatcher" if state["tools_needed"] else "reasoning"
+# Define StateGraph
 workflow = StateGraph(JARVISState)
 workflow.add_node("parse", parse_question)
 workflow.add_node("tool_dispatcher", tool_dispatcher)
 workflow.add_edge("reasoning", END)
 graph = workflow.compile()
+# Agent class
+class JARVISAgent:
     def __init__(self):
+        self.state = JARVISState(
+            task_id="",
+            question="",
+            tools_needed=[],
+            web_results=[],
+            file_results="",
+            image_results="",
+            calculation_results="",
+            document_results="",
+            multi_hop_results=[],
+            messages=[],
+            answer="",
+            results_table=[],
+            status_output="",
+            error=None,
+            metadata={}
+        )
+        logger.info("JARVISAgent initialized.")
     async def process_question(self, task_id: str, question: str) -> str:
         state = JARVISState(
             task_id=task_id,
             question=question,
             image_results="",
             calculation_results="",
             document_results="",
+            multi_hop_results=[],
             messages=[HumanMessage(content=question)],
+            answer="",
+            results_table=[],
+            status_output="",
+            error=None,
+            metadata={}
         )
         try:
             result = await graph.ainvoke(state)
             answer = result["answer"] or "Unknown"
+            logger.info(f"Task {task_id}: Final answer: {answer}")
+            self.state.results_table.append({"Task ID": task_id, "Question": question, "Answer": answer})
+            self.state.metadata = self.state.get("metadata", {}) | {"last_task": task_id, "answer": answer}
             return answer
         except Exception as e:
             logger.error(f"Error processing task {task_id}: {e}")
+            self.state.results_table.append({"Task ID": task_id, "Question": question, "Answer": f"Error: {e}"})
+            self.state.error = f"Task {task_id} failed: {str(e)}"
             return f"Error: {str(e)}"
         finally:
             for ext in ["txt", "csv", "xlsx", "jpg", "pdf"]:
+                file_path = f"temp/{task_id}.{ext}"
                 if os.path.exists(file_path):
                     try:
                         os.remove(file_path)
+                        logger.info(f"Removed temp file: {file_path}")
                     except Exception as e:
                         logger.error(f"Error removing file {file_path}: {e}")
+    async def process_all_questions(self, profile: gr.OAuthProfile | None):
+        if not profile:
+            logger.error("User not logged in.")
+            self.state.status_output = "Please Login to Hugging Face."
+            return pd.DataFrame(self.state.results_table), self.state.status_output
+        username = f"{profile.username}"
+        logger.info(f"User logged in: {username}")
+        questions_url = f"{GAIA_API_URL}/questions"
+        submit_url = f"{GAIA_API_URL}/submit"
+        agent_code = f"https://huggingface.co/spaces/{SPACE_ID}/tree/main"
         try:
+            response = requests.get(questions_url, timeout=15)
+            response.raise_for_status()
+            questions = response.json()
+            logger.info(f"Fetched {len(questions)} questions.")
+        except Exception as e:
+            logger.error(f"Error fetching questions: {e}")
+            self.state.status_output = f"Error fetching questions: {e}"
+            self.state.error = f"Fetch questions failed: {str(e)}"
+            return pd.DataFrame(self.state.results_table), self.state.status_output
+        answers_payload = []
+        for item in questions:
+            task_id = item.get("task_id")
+            question = item.get("question")
+            if not task_id or not question:
+                logger.warning(f"Skipping invalid item: {item}")
+                continue
+            answer = await self.process_question(task_id, question)
+            answers_payload.append({"task_id": task_id, "submitted_answer": answer})
+        if not answers_payload:
+            logger.error("No answers generated.")
+            self.state.status_output = "No answers to submit."
+            self.state.error = "No answers generated"
+            return pd.DataFrame(self.state.results_table), self.state.status_output
+        submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+        try:
+            response = requests.post(submit_url, json=submission_data, timeout=120)
+            response.raise_for_status()
+            result_data = response.json()
+            self.state.status_output = (
+                f"Submission Successful!\n"
+                f"User: {result_data.get('username')}\n"
+                f"Overall Score: {result_data.get('score', 'N/A')}% "
+                f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+                f"Message: {result_data.get('message', 'No message received.')}"
+            )
+            self.state.metadata = self.state.get("metadata", {}) | {"submission_score": result_data.get('score', 'N/A')}
         except Exception as e:
+            logger.error(f"Submission failed: {e}")
+            self.state.status_output = f"Submission Failed: {e}"
+            self.state.error = f"Submission failed: {str(e)}"
+        return pd.DataFrame(self.state.results_table), self.state.status_output
+# Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Evolved JARVIS GAIA Agent")
     gr.Markdown(
         """
         **Instructions:**
         ---
         **Disclaimers:**
+        Uses Hugging Face Inference, Together AI, SERPAPI, and OpenWeatherMap for GAIA benchmark.
         """
     )
+    with gr.Row():
+        gr.LoginButton()
+        gr.LogoutButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers")
     status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+    results_table = gr.DataFrame(label="Questions and Answers", wrap=True, headers=["Task ID", "Question", "Answer"])
+    agent = JARVISAgent()
     run_button.click(
+        fn=agent.process_all_questions,
+        outputs=[results_table, status_output]
     )
 if __name__ == "__main__":
     logger.info("\n" + "-"*30 + " App Starting " + "-"*30)
     logger.info(f"SPACE_ID: {SPACE_ID}")

requirements.txt CHANGED Viewed

@@ -4,6 +4,7 @@ pandas
 PyPDF2
 easyocr
 langchain
 langchain-community
 langgraph
 sentence-transformers
@@ -15,4 +16,8 @@ sympy
 openpyxl
 smolagents
 datasets
-asyncio

 PyPDF2
 easyocr
 langchain
+langchain-core
 langchain-community
 langgraph
 sentence-transformers
 openpyxl
 smolagents
 datasets
+transformers
+asyncio
+serpapi
+duckduckgo-search
+torch

result.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

state.py CHANGED Viewed

@@ -1,7 +1,27 @@
-from typing import TypedDict, List
-from langchain_core.messages import AnyMessage
 class JARVISState(TypedDict):
     task_id: str
     question: str
     tools_needed: List[str]
@@ -10,5 +30,10 @@ class JARVISState(TypedDict):
     image_results: str
     calculation_results: str
     document_results: str
-    messages: List[AnyMessage]
-    answer: str

+from typing import TypedDict, List, Dict, Optional, Any
+from langchain_core.messages import BaseMessage
 class JARVISState(TypedDict):
+    """
+    State dictionary for the JARVIS GAIA Agent, used with LangGraph to manage task processing.
+    Attributes:
+        task_id: Unique identifier for the GAIA task.
+        question: The question text to be answered.
+        tools_needed: List of tool names to be used for the task.
+        web_results: List of web search results (e.g., from SERPAPI, DuckDuckGo).
+        file_results: Parsed content from text, CSV, or Excel files.
+        image_results: OCR or description results from image files.
+        calculation_results: Results from mathematical calculations.
+        document_results: Extracted content from PDF documents.
+        multi_hop_results: Results from iterative multi-hop searches.
+        messages: List of messages for LLM context (e.g., user prompts, system instructions).
+        answer: Final answer for the task, formatted for GAIA submission.
+        results_table: List of task results for Gradio display (Task ID, Question, Answer).
+        status_output: Status message for Gradio output (e.g., submission result).
+        error: Optional error message if task processing fails.
+        metadata: Optional metadata (e.g., timestamps, tool execution status).
+    """
     task_id: str
     question: str
     tools_needed: List[str]
     image_results: str
     calculation_results: str
     document_results: str
+    multi_hop_results: List[str]
+    messages: List[BaseMessage]
+    answer: str
+    results_table: List[Dict[str, str]]
+    status_output: str
+    error: Optional[str]
+    metadata: Optional[Dict[str, Any]]

test.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import os
+import requests
+headers = {"Authorization": f"Bearer {os.getenv('TOGETHER_API_KEY')}"}
+response = requests.get("https://api.together.ai/models", headers=headers)
+print(response.json())

tools/search.py CHANGED Viewed

@@ -1,46 +1,104 @@
-from langchain_core.tools import tool
-import logging
-import requests
 import os
 from typing import List, Dict, Any
-from dotenv import load_dotenv
-logger = logging.getLogger(__name__)
-load_dotenv()
-@tool
-async def search_tool(query: str) -> List[Dict[str, Any]]:
-    """Perform a web search using SERPAPI."""
-    try:
-        serpapi_key = os.getenv("SERPAPI_API_KEY")
-        if not serpapi_key:
-            logger.error("SERPAPI_API_KEY not set")
-            return [{"content": "Search unavailable: API key missing", "url": ""}]
-        params = {"q": query, "api_key": serpapi_key}
-        response = requests.get("https://serpapi.com/search", params=params, timeout=10)
-        response.raise_for_status()
-        results = response.json().get("organic_results", [])
-        logger.info(f"Search results for query '{query}': {len(results)} items")
-        search_results = [{"content": r.get("snippet", ""), "url": r.get("link", "")} for r in results]
-        return search_results or [{"content": "No search results", "url": ""}]
-    except Exception as e:
-        logger.error(f"Error in search_tool: {e}")
-        return [{"content": f"Search failed: {str(e)}", "url": ""}]
-@tool
-async def multi_hop_search_tool(query: str, steps: int = 3) -> List[Dict[str, Any]]:
-    """Perform a multi-hop search."""
-    try:
-        results = []
-        current_query = query
-        for step in range(steps):
-            step_results = await search_tool.invoke({"query": current_query})
-            results.extend(step_results)
-            current_query = f"{current_query} more details"
-            logger.info(f"Multi-hop step {step + 1}: {current_query}")
-            await asyncio.sleep(2)  # Avoid rate limits
-        return results or [{"content": "No multi-hop results", "url": ""}]
-    except Exception as e:
-        logger.error(f"Error in multi_hop_search_tool: {e}")
-        return [{"content": f"Multi-hop search failed: {str(e)}", "url": ""}]

 import os
+from serpapi import GoogleSearch
+from langchain.tools import Tool
+import asyncio
 from typing import List, Dict, Any
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.messages import SystemMessage, HumanMessage
+def search_tool(query: str) -> List[str]:
+    """
+    Perform a web search using SERPAPI with retries.
+    Args:
+        query: Search query string.
+    Returns:
+        List of search result snippets.
+    Raises:
+        Exception: If search fails after retries.
+    """
+    params = {
+        "q": query,
+        "api_key": os.getenv("SERPAPI_API_KEY"),
+        "num": 5,
+    }
+    for attempt in range(3):
+        try:
+            search = GoogleSearch(params, timeout=30)
+            results = search.get_dict()
+            organic_results = results.get("organic_results", [])
+            return [r.get("snippet", "") for r in organic_results]
+        except Exception as e:
+            print(f"INFO - SERPAPI retry {attempt + 1}/3 due to: {e}")
+            asyncio.sleep(2)
+    raise Exception("SERPAPI failed after retries")
+async def multi_hop_search_tool(query: str, steps: int = 3, llm_client: Any = None, llm_type: str = None) -> List[Dict[str, str]]:
+    """
+    Perform iterative web searches for complex queries, refining the query using an LLM.
+    Args:
+        query: Initial search query.
+        steps: Number of search iterations.
+        llm_client: LLM client for query refinement.
+        llm_type: Type of LLM client ("together", "hf_api", or "hf_local").
+    Returns:
+        List of dictionaries containing search result content.
+    """
+    results = []
+    current_query = query
+    for step in range(steps):
+        try:
+            # Perform search
+            search_results = search_tool(current_query)
+            results.extend([{"content": str(r)} for r in search_results])
+            # Refine query using LLM if available
+            if llm_client and step < steps - 1:
+                prompt = ChatPromptTemplate.from_messages([
+                    SystemMessage(content="""Refine the following query to dig deeper into the topic, focusing on missing details or related aspects. Return ONLY the refined query as plain text, no explanations."""),
+                    HumanMessage(content=f"Original query: {current_query}\nPrevious results: {json.dumps(search_results[:2], indent=2)}")
+                ])
+                messages = [
+                    {"role": "system", "content": prompt[0].content},
+                    {"role": "user", "content": prompt[1].content}
+                ]
+                try:
+                    if llm_type == "hf_local":
+                        model, tokenizer = llm_client
+                        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("mps")
+                        outputs = model.generate(inputs, max_new_tokens=100, temperature=0.7)
+                        refined_query = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+                    else:
+                        response = llm_client.chat.completions.create(
+                            model=llm_client.model if llm_type == "together" else "meta-llama/Llama-3.2-1B-Instruct",
+                            messages=messages,
+                            max_tokens=100,
+                            temperature=0.7
+                        )
+                        refined_query = response.choices[0].message.content.strip()
+                    current_query = refined_query if refined_query else f"more details on {current_query}"
+                except Exception as e:
+                    print(f"INFO - Query refinement failed at step {step + 1}: {e}")
+                    current_query = f"more details on {current_query}"
+            await asyncio.sleep(1)  # Rate limit
+        except Exception as e:
+            print(f"INFO - Multi-hop search step {step + 1} failed: {e}")
+            break
+    return results
+multi_hop_search_tool = Tool.from_function(
+    func=multi_hop_search_tool,
+    name="multi_hop_search_tool",
+    description="Performs iterative web searches for complex queries, refining the query with an LLM."
+)