import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
nltk.download("punkt_tab")  # tokenizer data needed by nltk.word_tokenize
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
import os
import google.generativeai as genai
import json
from google.genai import Client, types
from datasets import load_dataset
def set_prompt(problem):
    prompt = """
# ROLE
You are a meticulous senior technical analyst and constraints scout. Your task is to read a short description of a technical problem and identify the distinct constraints it contains, each grounded in the problem itself and together covering the whole problem, returned as a JSON object.
# OBJECTIVE
Find all the constraints in this technical problem, making sure each one is premised on the problem only.
Take different technical domains into account so that the whole problem is covered.
Output each constraint in a JSON object such as: {"title of constraint 1": "description 1", "title of constraint N": "description N"}
# INSTRUCTIONS & RULES
1. **JSON Output**: Your entire response MUST be a single JSON object. Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to scan the technical problem, find each constraint, and create a separate entry for it in the output JSON.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe each constraint's issues. Do not use single keywords. These descriptions should be based on the information in the technical problem.
4. **Infer Where Necessary**: The technical problem may not contain all details. Infer plausible information based on the context.
# JSON SCHEMA & EXAMPLE
{
  "Exposing Compute Resources": "The 6G network shall provide suitable APIs to allow authorized third parties and/or UEs to retrieve availability information about computational resources inside the Service Hosting Environment (SHE) and to utilize these computational resources for running workloads on demand.",
  "Providing AI Compute": "The 6G network shall be able to provide computing resources in the Service Hosting Environment for AI services and provide AI services to UEs.",
  ...
}
---
***NOW, BEGIN THE TASK.***
# TECHNICAL PROBLEM
""" + problem
    return prompt
def load_data():
    return load_dataset("heymenn/Technologies", split="train")
def stem(data, data_type):
    stemmer = SnowballStemmer("english")

    def stem_text(text):
        # SnowballStemmer.stem expects single words, so tokenize first
        # (uses the punkt_tab data downloaded above), then stem token by token.
        return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))

    processed_data = []
    if data_type == "technologies":
        for index, t_item in enumerate(data):
            processed_data.append({
                "name": stem_text(t_item["name"]),
                "purpose": stem_text(t_item["purpose"]),
                "problem_types_solved": stem_text(t_item["problem_types_solved"]),
                "advantages": stem_text(t_item["advantages"]),
                "limitations": stem_text(t_item["limitations"]),
                "domain_tags": stem_text(t_item["domain_tags"]),
                "id": index
            })
    else:
        # Here `data` is expected to be a dict mapping constraint titles to descriptions.
        for title in data:
            processed_data.append({
                "title": stem_text(title),
                "description": stem_text(data[title])
            })
    return processed_data
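# Usage sketch with hypothetical data: the `else` branch above expects a dict
# of {constraint title: description}, e.g.
#   stem({"Low Latency": "The system requires sub-millisecond responses."}, "constraints")
# which returns a list of dicts whose titles and descriptions are stemmed word by word.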
def get_technologies_by_id(technologies, dataset):
    result = []
    for tech_id in technologies:  # avoid shadowing the built-in `id`
        data = dataset[tech_id]
        # Drop the (large) embedding vector; callers only need the metadata.
        data.pop("embeddings", None)
        result.append(data)
    return result
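# Illustrative call (indices assumed): get_technologies_by_id([0, 5], load_data())
# returns those two dataset records with their "embeddings" field removed.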
def save_to_pickle(result_similarities):
    # Unique constraint titles become the row labels; technology ids the columns.
    constraint_titles = sorted({item['constraint']['title'] for item in result_similarities})
    max_id2 = max(item['id2'] for item in result_similarities)
    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))
    num_rows = len(constraint_titles)
    num_cols = max_id2
    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
    for item in result_similarities:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1  # ids are 1-based, matrix columns 0-based
        matrix[row_idx, col_idx] = item['similarity'].item()
    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])
    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }
    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)
    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename
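# Sketch of reading the pickle back (file name as produced above):
#   with open("cosine_similarity_matrix_with_labels.pkl", "rb") as f:
#       saved = pickle.load(f)
#   row = saved["row_labels"].index("Some Constraint Title")  # hypothetical title
#   saved["matrix"][row, 0]  # similarity between that constraint and technology id 1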
def set_gemini():
    gemini_api = os.getenv("GEMINI_API")
    client = Client(api_key=gemini_api)
    # Define the grounding tool (Google Search grounding)
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )
    # Configure generation settings to use the grounding tool
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )
    return client, config
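# Minimal end-to-end sketch (the model id, sample problem, and response
# handling below are assumptions, not confirmed by this file): build the
# prompt, call Gemini with Google Search grounding, and parse the JSON object
# of constraints that the prompt requests.
if __name__ == "__main__":
    client, config = set_gemini()
    prompt = set_prompt("Design a 6G edge service that exposes compute to UEs.")
    response = client.models.generate_content(
        model="gemini-2.0-flash",  # assumed model id; swap in whichever is used
        contents=prompt,
        config=config,
    )
    # The prompt asks for a bare JSON object, so the text should parse directly;
    # strip Markdown fences defensively in case the model adds them anyway.
    text = response.text.strip().removeprefix("```json").removesuffix("```").strip()
    constraints = json.loads(text)
    print(constraints)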