Spaces:

OrganizedProgrammers
/

Search-Technologies-API

Sleeping

File size: 4,164 Bytes

ceaeaf3
 
 
2783254
 
ceaeaf3
 
 
 
 
 
 
7d1249d
ceaeaf3
7d1249d
 
 
ceaeaf3
7d1249d
62151ed
7d1249d
 
ceaeaf3
7d1249d
 
 
ceaeaf3
7d1249d
ceaeaf3
7d1249d
 
 
 
 
 
ceaeaf3
7d1249d
ceaeaf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62151ed
ceaeaf3
 
62151ed
 
2783254
ceaeaf3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a7787a
ceaeaf3
7a7787a

from fuzzywuzzy import fuzz
from google.genai import Client, types
from datasets import load_dataset
import json
import os


def search_and_retrieve(user_input, config):
    """Look up the stored technologies closest to a free-text query.

    The query is encoded with ``config["model"]`` and searched against the
    ``'embeddings'`` index of ``config["dataset"]``, asking for 5 examples.

    Args:
        user_input: Text handed to ``model.encode``.
        config: Mapping holding a ``"dataset"`` object that provides
            ``get_nearest_examples`` and a ``"model"`` object that provides
            ``encode``.

    Returns:
        A dict with the ``"title"`` (the hit's ``"name"``), ``"purpose"`` and
        ``"score"`` of the first hit, plus ``"top5"``: one dict per hit with
        every returned column except ``"embeddings"`` and a float ``"score"``.

    Raises:
        IndexError: If the search yields no examples.
    """
    index, encoder = config["dataset"], config["model"]

    query_vector = encoder.encode(user_input)
    hits = index.get_nearest_examples('embeddings', query_vector, k=5)

    scores = hits.scores
    columns = hits.examples
    hit_count = len(columns['name'])

    # Turn the column-oriented examples into one dict per hit, leaving the
    # raw embedding vectors out of the payload.
    rows = [
        {
            field: values[position]
            for field, values in columns.items()
            if field != "embeddings"
        }
        for position in range(hit_count)
    ]

    # Attach the score reported for each hit, coerced to a plain float.
    for position, row in enumerate(rows):
        row["score"] = float(scores[position])

    best = rows[0]
    final_output = {
        "title": best["name"],
        "purpose": best["purpose"],
        "score": best["score"],
        "top5": rows,
    }
    print(final_output)

    return final_output


def _extract_json_object(text):
    """Parse the first JSON object embedded in *text*.

    Newlines are removed first (as the earlier implementation did), then
    decoding starts at the first ``{`` and stops where that object ends.
    Nested objects, ``}`` characters inside string values and any text after
    the object therefore no longer truncate or break the payload.

    Raises:
        json.JSONDecodeError: If *text* contains no ``{`` or the object that
            starts there is not valid JSON.
    """
    flattened = text.replace('\n', '')
    start = flattened.find("{")
    if start == -1:
        raise json.JSONDecodeError(
            "No JSON object found in model response", flattened, 0
        )
    parsed, _end = json.JSONDecoder().raw_decode(flattened, start)
    return parsed


def generate_tech(user_input, user_instructions):
    """Ask Gemini to describe a technology and return it as parsed JSON.

    Builds a prompt from the user's text and extra instructions, sends it to
    the ``gemini-2.5-flash`` model with the Google Search grounding tool
    enabled, and parses the first JSON object found in the reply.

    Args:
        user_input: Text inserted in the ``<USER_INPUT>`` section of the prompt.
        user_instructions: Additional guidance inserted into the prompt.

    Returns:
        The object decoded from the model's reply (a ``dict``).

    Raises:
        json.JSONDecodeError: If the reply contains no valid JSON object.
    """
    # NOTE(review): the prompt mixes JSON and YAML wording and its example
    # object has no commas between fields; it is runtime text, so it is left
    # byte-for-byte unchanged here. Confirm whether it should be reworded.
    prompt = f"""
    # ROLE

    You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

    # OBJECTIVE

    Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology. 
    Create a complete JSON object according to the schema below. 
    Your final output must be a single, valid JSON document containing a technology you created. 
    The technology should be described with sentences.

    # INSTRUCTIONS & RULES

    1.  **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list. 
    Do not include any explanatory text before or after the JSON.
    2.  **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
    3.  **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves. 
    Do not use single keywords.
    4.  **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.

    # YAML SCHEMA & EXAMPLE

    Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.

    {{"name": "Generative Watermarking"
      "purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
      "problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
      "advantages": "Way faster to generate by an AI"
      "limitations": "Takes a lot of computational time to generate"
      "domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
    }}

    Take into account those additionnal informations if there is any:
    {user_instructions}
    ---
    ***NOW, BEGIN THE TASK.***

    <USER_INPUT>
    {user_input}
    </USER_INPUT>
    """

    # A single client is enough; the API key is read from the environment.
    client = Client(api_key=os.getenv("GEMINI_API_KEY"))

    # Define the grounding tool
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    # Configure generation settings
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=config,
    )

    # Treat a missing text part as an empty reply so the failure surfaces as
    # a JSON decoding error instead of an AttributeError on None.
    return _extract_json_object(response.text or "")


def send_to_dataset(data, model):
    """Embed a technology record and append it to the hub dataset.

    The string form of ``data`` is encoded with ``model``, the resulting
    vector is stored under ``"embeddings"`` in a copy of the record, and the
    copy is added to the ``train`` split of
    ``OrganizedProgrammers/Technologies``, which is then pushed back to the
    hub.

    Args:
        data: Mapping describing the technology. It is not modified.
        model: Object providing ``encode``; called with ``str(data)``.
    """
    repo_id = "OrganizedProgrammers/Technologies"

    # Build a new record instead of inserting the key into ``data`` so the
    # caller's dict is left unchanged.
    record = {**data, "embeddings": model.encode(str(data))}

    dataset = load_dataset(repo_id, split="train")
    updated_dataset = dataset.add_item(record)
    updated_dataset.push_to_hub(repo_id)