lyndalynda committed on
Commit
a011a64
·
verified ·
1 Parent(s): 81917a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +369 -10
app.py CHANGED
@@ -1,23 +1,382 @@
1
  import os
2
  import gradio as gr
3
  import requests
4
- import inspect
5
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
6
 
7
- # (Keep Constants as is)
8
- # --- Constants ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
 
11
- # --- Basic Agent Definition ---
12
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
13
  class BasicAgent:
14
  def __init__(self):
15
- print("BasicAgent initialized.")
 
 
16
  def __call__(self, question: str) -> str:
17
- print(f"Agent received question (first 50 chars): {question[:50]}...")
18
- fixed_answer = "This is a default answer."
19
- print(f"Agent returning fixed answer: {fixed_answer}")
20
- return fixed_answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def run_and_submit_all( profile: gr.OAuthProfile | None):
23
  """
 
1
  import os
2
  import gradio as gr
3
  import requests
 
4
  import pandas as pd
5
+ from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, tool
6
+ import re
7
+ import json
8
+ import math
9
+ import tempfile
10
+ from pathlib import Path
11
+ from urllib.parse import urlparse, parse_qs
12
+ import yt_dlp
13
+ from PIL import Image
14
+ import pytesseract
15
 
16
+ hf_token = os.getenv("HF_TOKEN")
17
+ SPACE_ID = os.getenv("SPACE_ID")
18
+ SPACE_HOST = os.getenv("SPACE_HOST")
19
+ # --- OUTILS CRITIQUES POUR GAIA ---
20
+ @tool
21
+ def web_browser(url: str) -> str:
22
+ """
23
+ Fetches content from a web URL.
24
+
25
+ Args:
26
+ url: The URL to fetch content from.
27
+
28
+ Returns:
29
+ Text content from the webpage.
30
+ """
31
+ try:
32
+ headers = {
33
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
34
+ }
35
+ response = requests.get(url, headers=headers, timeout=10)
36
+ response.raise_for_status()
37
+
38
+ # Simple text extraction (you might want to use BeautifulSoup for better parsing)
39
+ content = response.text
40
+ # Basic cleaning
41
+ content = re.sub(r'<[^>]+>', ' ', content) # Remove HTML tags
42
+ content = re.sub(r'\s+', ' ', content).strip() # Clean whitespace
43
+
44
+ return content[:2000] + "..." if len(content) > 2000 else content
45
+
46
+ except Exception as e:
47
+ return f"Error accessing URL: {str(e)}"
48
+
49
+ @tool
50
+ def youtube_transcript_extractor(url: str) -> str:
51
+ """
52
+ Extracts transcript or information from YouTube videos.
53
+
54
+ Args:
55
+ url: YouTube URL.
56
+
57
+ Returns:
58
+ Video information and transcript if available.
59
+ """
60
+ try:
61
+ # Extract video ID from URL
62
+ if "youtube.com/watch" in url:
63
+ video_id = parse_qs(urlparse(url).query).get('v', [None])[0]
64
+ elif "youtu.be/" in url:
65
+ video_id = urlparse(url).path[1:]
66
+ else:
67
+ return "Invalid YouTube URL format"
68
+
69
+ if not video_id:
70
+ return "Could not extract video ID from URL"
71
+
72
+ # Use youtube-dl to get video info
73
+ ydl_opts = {
74
+ 'quiet': True,
75
+ 'no_warnings': True,
76
+ 'writesubtitles': True,
77
+ 'writeautomaticsub': True,
78
+ }
79
+
80
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
81
+ info = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)
82
+
83
+ result = f"Title: {info.get('title', 'N/A')}\n"
84
+ result += f"Description: {info.get('description', 'N/A')[:500]}...\n"
85
+ result += f"Duration: {info.get('duration', 'N/A')} seconds\n"
86
+ result += f"View count: {info.get('view_count', 'N/A')}\n"
87
+
88
+ # Try to get subtitles/transcript
89
+ if 'subtitles' in info and info['subtitles']:
90
+ result += "\n--- Transcript Available ---\n"
91
+ # This is a simplified approach - you'd need more complex logic for full transcript
92
+
93
+ return result
94
+
95
+ except Exception as e:
96
+ return f"Error extracting YouTube content: {str(e)}"
97
+
98
+ @tool
99
+ def image_ocr_analyzer(image_path: str) -> str:
100
+ """
101
+ Performs OCR on images to extract text.
102
+
103
+ Args:
104
+ image_path: Path to the image file.
105
+
106
+ Returns:
107
+ Extracted text from the image.
108
+ """
109
+ try:
110
+ # Open image with PIL
111
+ image = Image.open(image_path)
112
+
113
+ # Perform OCR
114
+ extracted_text = pytesseract.image_to_string(image)
115
+
116
+ if not extracted_text.strip():
117
+ return "No text found in the image"
118
+
119
+ return f"Extracted text:\n{extracted_text.strip()}"
120
+
121
+ except Exception as e:
122
+ return f"Error performing OCR: {str(e)}"
123
+
124
+ @tool
125
+ def pdf_text_extractor(file_path: str) -> str:
126
+ """
127
+ Extracts text from PDF files.
128
+
129
+ Args:
130
+ file_path: Path to the PDF file.
131
+
132
+ Returns:
133
+ Extracted text from PDF.
134
+ """
135
+ try:
136
+ import PyPDF2
137
+
138
+ with open(file_path, 'rb') as file:
139
+ pdf_reader = PyPDF2.PdfReader(file)
140
+ text = ""
141
+
142
+ for page_num in range(len(pdf_reader.pages)):
143
+ page = pdf_reader.pages[page_num]
144
+ text += page.extract_text() + "\n"
145
+
146
+ return text[:3000] + "..." if len(text) > 3000 else text
147
+
148
+ except Exception as e:
149
+ return f"Error extracting PDF text: {str(e)}"
150
+
151
+ @tool
152
+ def veterinary_document_analyzer(text: str) -> str:
153
+ """
154
+ Analyzes veterinary documents to extract specific information like names.
155
+
156
+ Args:
157
+ text: Document text to analyze.
158
+
159
+ Returns:
160
+ Extracted veterinary information.
161
+ """
162
+ try:
163
+ # Look for veterinarian names and surnames
164
+ vet_patterns = [
165
+ r"Dr\.?\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)", # Dr. First Last
166
+ r"Doctor\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)", # Doctor First Last
167
+ r"veterinarian\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)", # veterinarian First Last
168
+ r"DVM\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)", # DVM First Last
169
+ ]
170
+
171
+ found_vets = []
172
+ for pattern in vet_patterns:
173
+ matches = re.findall(pattern, text, re.IGNORECASE)
174
+ for match in matches:
175
+ full_name = f"{match[0]} {match[1]}"
176
+ if full_name not in found_vets:
177
+ found_vets.append(full_name)
178
+
179
+ if found_vets:
180
+ return f"Found veterinarian(s): {', '.join(found_vets)}"
181
+ else:
182
+ return "No veterinarian names found in the document"
183
+
184
+ except Exception as e:
185
+ return f"Error analyzing veterinary document: {str(e)}"
186
+
187
+ # --- Outils existants améliorés ---
188
+ @tool
189
+ def analyze_excel_file(file_path: str, analysis_type: str = "general") -> str:
190
+ """
191
+ Analyzes Excel files with multiple analysis types.
192
+ """
193
+ try:
194
+ df = pd.read_excel(file_path)
195
+
196
+ if analysis_type == "general":
197
+ return f"Excel file contains {len(df)} rows and {len(df.columns)} columns. Columns: {list(df.columns)}"
198
+
199
+ elif analysis_type == "food_sales":
200
+ if 'category' in df.columns and 'price' in df.columns and 'quantity' in df.columns:
201
+ food_df = df[df['category'].str.lower() == 'food']
202
+ total_sales = (food_df['price'] * food_df['quantity']).sum()
203
+ return f"Total food sales: ${total_sales:.2f}"
204
+ else:
205
+ return "Required columns (category, price, quantity) not found"
206
+
207
+ elif analysis_type == "summary":
208
+ summary = df.describe(include='all').to_string()
209
+ return f"Data summary:\n{summary}"
210
+
211
+ elif analysis_type == "categories":
212
+ if 'category' in df.columns:
213
+ categories = df['category'].value_counts()
214
+ return f"Categories breakdown:\n{categories.to_string()}"
215
+ else:
216
+ return "No category column found"
217
+
218
+ return "Unknown analysis type"
219
+
220
+ except Exception as e:
221
+ return f"Error analyzing Excel file: {str(e)}"
222
+
223
+ @tool
224
+ def advanced_calculator(expression: str) -> str:
225
+ """
226
+ Evaluates mathematical expressions safely, including advanced functions.
227
+ """
228
+ try:
229
+ expression = expression.replace('^', '**')
230
+ allowed_functions = {
231
+ 'abs': abs, 'round': round, 'min': min, 'max': max,
232
+ 'sum': sum, 'len': len,
233
+ 'sqrt': math.sqrt, 'pow': math.pow, 'log': math.log,
234
+ 'sin': math.sin, 'cos': math.cos, 'tan': math.tan,
235
+ 'pi': math.pi, 'e': math.e,
236
+ 'floor': math.floor, 'ceil': math.ceil
237
+ }
238
+ result = eval(expression, {"__builtins__": {}}, allowed_functions)
239
+ return str(result)
240
+
241
+ except Exception as e:
242
+ return f"Error in calculation: {str(e)}"
243
+
244
+ @tool
245
+ def smart_text_analyzer(text: str, task_type: str = "general") -> str:
246
+ """
247
+ Analyzes text with focus on GAIA-specific tasks.
248
+
249
+ Args:
250
+ text: Text to analyze.
251
+ task_type: 'general', 'names', 'dates', 'numbers', 'veterinary'.
252
+
253
+ Returns:
254
+ Analysis results.
255
+ """
256
+ try:
257
+ if task_type == "names":
258
+ # Extract proper names
259
+ name_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
260
+ names = re.findall(name_pattern, text)
261
+ return f"Found names: {list(set(names))}"
262
+
263
+ elif task_type == "veterinary":
264
+ return veterinary_document_analyzer(text)
265
+
266
+ elif task_type == "dates":
267
+ date_patterns = [
268
+ r'\d{1,2}/\d{1,2}/\d{4}', # MM/DD/YYYY
269
+ r'\d{4}-\d{2}-\d{2}', # YYYY-MM-DD
270
+ r'\b\w+\s+\d{1,2},\s+\d{4}\b' # Month DD, YYYY
271
+ ]
272
+ dates = []
273
+ for pattern in date_patterns:
274
+ dates.extend(re.findall(pattern, text))
275
+ return f"Found dates: {dates}"
276
+
277
+ elif task_type == "numbers":
278
+ numbers = re.findall(r'-?\d+\.?\d*', text)
279
+ return f"Found numbers: {[float(n) for n in numbers if n]}"
280
+
281
+ else:
282
+ return f"Characters: {len(text)}, Words: {len(text.split())}, Lines: {len(text.splitlines())}"
283
+
284
+ except Exception as e:
285
+ return f"Error in text analysis: {str(e)}"
286
+
287
+ # --- Configuration du modèle OPTIMISÉE ---
288
+ # Changer pour un modèle plus léger qui ne dépasse pas ton quota
289
+ model = HfApiModel(
290
+ max_tokens=2048, # Réduit pour économiser le quota
291
+ temperature=0.1,
292
+ model_id='microsoft/DialoGPT-medium', # Modèle plus léger
293
+ # Ou essaye: 'HuggingFaceH4/zephyr-7b-beta' si disponible
294
+ )
295
+
296
+ # --- Initialisation des outils ---
297
+ search_tool = DuckDuckGoSearchTool()
298
+
299
+ # IMPORTANT: Ajouter TOUS les outils à la liste
300
+ tools = [
301
+ search_tool, # ⚠️ TU AVAIS OUBLIÉ ÇA !
302
+ web_browser,
303
+ youtube_transcript_extractor,
304
+ image_ocr_analyzer,
305
+ pdf_text_extractor,
306
+ veterinary_document_analyzer,
307
+ smart_text_analyzer,
308
+ advanced_calculator,
309
+ analyze_excel_file,
310
+ ]
311
+
312
+ # Agent avec plus d'étapes pour les tâches complexes
313
+ agent_code = CodeAgent(
314
+ tools=tools,
315
+ model=model,
316
+ max_steps=15, # Augmenté pour les tâches complexes GAIA
317
+ additional_authorized_imports=[
318
+ "os", "tempfile", "pathlib", "re", "json", "math", "pandas",
319
+ "requests", "PIL", "pytesseract", "PyPDF2", "yt_dlp"
320
+ ]
321
+ )
322
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
323
 
 
 
324
  class BasicAgent:
325
  def __init__(self):
326
+ print("Enhanced GAIA Agent initialized with web browsing capabilities.")
327
+ self.agent = agent_code
328
+
329
  def __call__(self, question: str) -> str:
330
+ try:
331
+ # Prompt amélioré spécifiquement pour GAIA
332
+ enhanced_question = self._create_gaia_prompt(question)
333
+
334
+ result = self.agent.run(enhanced_question)
335
+
336
+ # Post-processing pour GAIA
337
+ cleaned_result = self._clean_gaia_result(result)
338
+
339
+ return cleaned_result if cleaned_result else "No response generated."
340
+
341
+ except Exception as e:
342
+ print(f"Agent error: {e}")
343
+ # Fallback strategy
344
+ try:
345
+ fallback_prompt = f"""
346
+ CRITICAL GAIA TASK: {question}
347
+
348
+ Use available tools to find the answer. If it's a YouTube video, use youtube_transcript_extractor.
349
+ If it's about documents, use appropriate analyzers.
350
+ Be precise and direct in your final answer.
351
+ """
352
+ simple_result = self.agent.run(fallback_prompt)
353
+ return simple_result if simple_result else f"Error: {e}"
354
+ except:
355
+ return f"Error: {e}"
356
+
357
+ def _create_gaia_prompt(self, question: str) -> str:
358
+ """Crée un prompt optimisé pour GAIA."""
359
+ return f"""
360
+ GAIA EVALUATION TASK - ANSWER PRECISELY
361
+
362
+ Question: {question}
363
+
364
+ INSTRUCTIONS:
365
+ 1. If this involves a YouTube video, use youtube_transcript_extractor tool
366
+ 2. If this involves web content, use web_browser tool
367
+ 3. If this involves documents/PDFs, use appropriate analyzers
368
+ 4. If this involves images, use image_ocr_analyzer
369
+ 5. If this needs search, use the search tool
370
+ 6. For calculations, use advanced_calculator
371
+ 7. Be EXACT and SPECIFIC in your final answer
372
+ 8. Don't provide explanations unless asked - just the answer
373
+
374
+ Work step by step and use the right tools for this task.
375
+ """
376
+
377
+
378
+
379
+
380
 
381
  def run_and_submit_all( profile: gr.OAuthProfile | None):
382
  """