import json
import os

from datasets import load_dataset
from fuzzywuzzy import fuzz
from google.genai import Client, types

# Hub repository that send_to_dataset() reads from and pushes back to.
_DATASET_REPO = "OrganizedProgrammers/Technologies"

# Gemini model used by generate_tech().
_GENERATION_MODEL = "gemini-2.5-flash"


def search_and_retrieve(user_input, config):
    """Look up the stored technologies closest to ``user_input``.

    Args:
        user_input: Query passed to ``config["model"].encode``.
        config: Mapping with two entries:
            ``"dataset"`` - object exposing ``get_nearest_examples``; it is
            queried on its ``"embeddings"`` index with ``k=5``.
            ``"model"`` - object exposing ``encode``.

    Returns:
        A dict with ``"title"``, ``"purpose"`` and ``"score"`` taken from the
        first match, plus ``"top5"``: the list of all returned matches. Each
        match holds every returned column except ``"embeddings"`` and a
        float ``"score"``.

    Raises:
        IndexError: If the index returns no examples.
    """
    dataset = config["dataset"]
    model = config["model"]

    user_embedding = model.encode(user_input)
    results = dataset.get_nearest_examples("embeddings", user_embedding, k=5)
    scores = results.scores
    examples = results.examples

    # The examples are column-oriented (one sequence per field); pivot them
    # into one dict per match and leave the embedding vectors out.
    columns = {
        key: values for key, values in examples.items() if key != "embeddings"
    }
    matches = [
        {key: values[i] for key, values in columns.items()}
        for i in range(len(examples["name"]))
    ]
    for match, score in zip(matches, scores):
        match["score"] = float(score)

    if not matches:
        raise IndexError("nearest-example search returned no results")

    best = matches[0]
    final_output = {
        "title": best["name"],
        "purpose": best["purpose"],
        "score": best["score"],
        "top5": matches,
    }
    print(final_output)
    return final_output


def _extract_json_object(text):
    """Decode the first JSON object embedded in ``text``.

    Decoding starts at the first ``{`` and stops where that object ends, so
    nested objects, braces inside string values and trailing text (such as a
    closing code fence) do not break parsing.

    Raises:
        json.JSONDecodeError: If ``text`` is empty, contains no ``{``, or the
            text starting at the first ``{`` is not valid JSON.
    """
    if not text:
        raise json.JSONDecodeError("Model response is empty", "", 0)

    # Line breaks are dropped before decoding, as the original code did; a
    # raw line break inside a string value would otherwise be rejected.
    flattened = text.replace("\n", "")
    start = flattened.find("{")
    if start == -1:
        raise json.JSONDecodeError(
            "No JSON object found in model response", flattened, 0
        )

    parsed, _end = json.JSONDecoder().raw_decode(flattened, start)
    return parsed


def generate_tech(user_input, user_instructions):
    """Ask Gemini to describe a technology and return it as parsed JSON.

    The prompt embeds ``user_instructions`` and ``user_input``; the request is
    sent with the Google Search grounding tool enabled. The API key is read
    from the ``GEMINI_API_KEY`` environment variable.

    Args:
        user_input: Text appended at the end of the prompt as the task input.
        user_instructions: Additional instructions interpolated into the
            prompt.

    Returns:
        The first JSON object found in the model's text response, decoded
        with the ``json`` module.

    Raises:
        json.JSONDecodeError: If the response has no text, contains no JSON
            object, or the object is malformed.
    """
    # NOTE(review): the prompt text is kept word for word. It mixes "JSON"
    # and "YAML" wording, its example object has no commas between members,
    # and the placeholder after "Analyze the provided" is empty - worth a
    # separate prompt revision.
    prompt = f"""
# ROLE
You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

# OBJECTIVE
Analyze the provided ``. Identify what is technology discussed, focus on the highest level of the technology.
Create a complete JSON object according to the schema below.
Your final output must be a single, valid JSON document containing a technology you created.
The technology should be described with sentences.

# INSTRUCTIONS & RULES
1. **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list. Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves. Do not use single keywords.
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.

# YAML SCHEMA & EXAMPLE
Your output must be a list of YAML objects matching this structure.
Note how `functional_capabilities` and `problem_types_solved` contain full sentences.

{{"name": "Generative Watermarking"
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
"advantages": "Way faster to generate by an AI"
"limitations": "Takes a lot of computational time to generate"
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
}}

Take into account those additionnal informations if there is any:
{user_instructions}

---
***NOW, BEGIN THE TASK.***

{user_input}
"""

    client = Client(api_key=os.getenv("GEMINI_API_KEY"))

    # Define the grounding tool
    grounding_tool = types.Tool(google_search=types.GoogleSearch())

    # Configure generation settings
    config = types.GenerateContentConfig(tools=[grounding_tool])

    response = client.models.generate_content(
        model=_GENERATION_MODEL,
        contents=prompt,
        config=config,
    )

    return _extract_json_object(response.text)


def send_to_dataset(data, model):
    """Append ``data`` with its embedding to the Hub dataset and push it.

    The embedding is computed from ``str(data)`` using ``model.encode`` and
    stored under the ``"embeddings"`` key of the uploaded record. The
    ``train`` split of the dataset is loaded, the record is added, and the
    result is pushed back to the same repository.

    Args:
        data: Mapping describing the technology. It is not modified; the
            embedding is attached to a shallow copy.
        model: Object exposing ``encode``.
    """
    embedding = model.encode(str(data))

    # Work on a copy so the caller's dict does not gain an "embeddings" key.
    record = dict(data)
    record["embeddings"] = embedding

    dataset = load_dataset(_DATASET_REPO, split="train")
    updated_dataset = dataset.add_item(record)
    updated_dataset.push_to_hub(_DATASET_REPO)