Spaces:
Sleeping
Sleeping
Commit
·
5eaaba5
0
Parent(s):
first commit
Browse files- .gitignore +3 -0
- README.md +32 -0
- app.py +127 -0
- case_study/TableQuestionAnswering.ipynb +174 -0
- case_study/TextGeneration.ipynb +199 -0
- case_study/ZeroShotClassification.ipynb +173 -0
- requirements.txt +7 -0
- utils/classify.py +80 -0
- utils/extract.py +89 -0
- utils/fake.py +95 -0
- utils/serialize_json.py +65 -0
- utils/validate.py +34 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
.ipynb_checkpoints
|
3 |
+
.env
|
README.md
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Mock JSON Generator
|
3 |
+
emoji: 🃏
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: yellow
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.20.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
# Mock JSON Generator
|
13 |
+
|
14 |
+
Generate test data for a JSON schema.
|
15 |
+
|
16 |
+
1. User inputs JSON schema
|
17 |
+
2. Use Zero-Shot Classification model to map schema properties to faker functions
|
18 |
+
3. Call faker functions and return mock JSON
|
19 |
+
|
20 |
+
## Why
|
21 |
+
|
22 |
+
Although mock data can be generated by chatting with any popular large language model (LLM), generating massive amounts of mock data for performance testing or load testing would be an inefficient use of an LLM. With this approach, once the classification model has mapped the schema properties, mock data can be generated instantly, for free, with no significant CPU load.
|
23 |
+
|
24 |
+
Check the `case_study` directory to see why I decided to use Zero-Shot Classification for this solution.
|
25 |
+
|
26 |
+
## Tradeoffs
|
27 |
+
|
28 |
+
Using a Zero-Shot Classification model rather than a full LLM introduces a flexibility vs. scalability tradeoff. A full LLM would provide greater flexibility in understanding complex schemas and generating more nuanced mock data. However, this approach offers superior scalability - once the classification model has mapped schema properties to faker functions, you can generate virtually unlimited mock data instantly without the computational overhead or costs associated with repeated LLM calls.
|
29 |
+
|
30 |
+
## Optimizations
|
31 |
+
|
32 |
+
A custom-trained model would be ideal for a specific task like this.
|
app.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import gradio as gr
|
3 |
+
from utils.classify import get_functions_for_descriptions
|
4 |
+
from utils.extract import extract_descriptions
|
5 |
+
from utils.fake import generate_mock_data
|
6 |
+
from utils.validate import validate_schema_structure
|
7 |
+
from utils.serialize_json import serialize_to_json
|
8 |
+
|
9 |
+
# Store globally so we don't have to recalculate if
|
10 |
+
# the schema has not changed
|
11 |
+
function_mappings = {}
|
12 |
+
|
13 |
+
|
14 |
+
def process_schema(schema):
    """Validate *schema* and produce mock JSON for it.

    Returns a ``(mock_json, error)`` pair: on success ``error`` is None;
    on a validation failure ``mock_json`` is None and ``error`` carries
    the validator's message.
    """
    global function_mappings

    is_valid, parsed = validate_schema_structure(schema)
    if not is_valid:
        # On failure the validator returns the error message in place of
        # the parsed schema.
        return None, parsed

    # Classification is expensive, so reuse the cached mappings until the
    # schema text changes (the UI clears them via clear_function_mappings).
    if not function_mappings:
        descriptions = extract_descriptions(parsed)
        function_mappings = get_functions_for_descriptions(descriptions)

    mock_data = generate_mock_data(parsed, function_mappings)
    return serialize_to_json(mock_data, pretty=True), None
|
35 |
+
|
36 |
+
|
37 |
+
def clear_function_mappings():
    """Drop the cached description-to-faker mappings.

    Invoked whenever the schema input changes so the next call to
    process_schema re-classifies the new schema's properties.
    """
    global function_mappings
    function_mappings = {}
|
40 |
+
|
41 |
+
|
42 |
+
# Default schema shown in the input box so users can try the app
# without writing a schema of their own first.
default_schema = json.dumps(
    {
        "type": "object",
        "description": "A person object",
        "properties": {
            "first_name": {"type": "string", "description": "The person's first name"},
            "last_name": {"type": "string", "description": "The person's last name"},
            "age": {"type": "integer", "minimum": 18, "maximum": 100},
            "email": {"type": "string"},
            "is_active": {"type": "boolean"},
            "address": {
                "type": "object",
                "properties": {
                    "street": {"type": "string"},
                    "city": {"type": "string"},
                    "zip": {"type": "string"},
                },
            },
        },
        "required": ["first_name", "last_name", "age"],
    },
    indent=2,
)

# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Base()) as app:
    gr.Markdown("# JSON Schema Mock Data Generator")
    gr.Markdown(
        "Enter a valid JSON schema and generate mock data that conforms to the schema."
    )

    with gr.Row():
        with gr.Column():
            schema_input = gr.Textbox(
                label="JSON Schema",
                value=default_schema,
                lines=15,
                placeholder="Enter your JSON schema here...",
            )

            generate_btn = gr.Button("Generate Mock Data", variant="primary")

        with gr.Column():
            mock_output = gr.Textbox(
                label="Generated Mock JSON", lines=15, interactive=False
            )

            error_output = gr.Textbox(label="Errors", visible=False, interactive=False)

    def update_output(schema_str):
        """Run the schema through the pipeline and route the result to the UI.

        Returns component updates for (mock_output, error_output); the error
        box is only made visible when there is an error to show.
        """
        mock_data, error = process_schema(schema_str)

        if error:
            # BUG FIX: the original returned a dict literal that listed
            # `error_output` twice — the visibility update silently replaced
            # the error text, so the message was never displayed. One
            # gr.update carrying both value and visibility fixes that.
            return {
                mock_output: None,
                error_output: gr.update(value=error, visible=True),
            }
        return {
            mock_output: mock_data,
            error_output: gr.update(value=None, visible=False),
        }

    # Invalidate the cached classification results whenever the schema changes.
    schema_input.change(fn=clear_function_mappings, inputs=[], outputs=[])

    # BUG FIX: the outputs list previously named error_output twice.
    generate_btn.click(
        fn=update_output,
        inputs=[schema_input],
        outputs=[mock_output, error_output],
    )

    gr.Markdown(
        """
## Notes
- Zero Shot Classification will run the first time a mock is generated for a schema. Subsequent generations will be instant.
- The schema must be valid JSON and comply with JSON Schema Draft 7
- Required keywords: `type` and `properties` (for object types)
- Currently Supported types: string, integer, boolean, array, and object
"""
    )

if __name__ == "__main__":
    app.launch()
|
case_study/TableQuestionAnswering.ipynb
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "50fa1916-c45f-4e79-aacd-8fd2956292b9",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Table Question Answering Pipeline\n",
|
9 |
+
"\n",
|
10 |
+
"Experimenting with a TableQuestionAnsweringPipeline.\n",
|
11 |
+
"\n",
|
12 |
+
"This is not really a good use of the model but it kind of works."
|
13 |
+
]
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"cell_type": "code",
|
17 |
+
"execution_count": 1,
|
18 |
+
"id": "c4ca4235-042e-4b30-9a95-73f85c5f64ea",
|
19 |
+
"metadata": {},
|
20 |
+
"outputs": [],
|
21 |
+
"source": [
|
22 |
+
"from transformers import pipeline"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "code",
|
27 |
+
"execution_count": 2,
|
28 |
+
"id": "87419b8e-c1bb-42e4-b15a-e7d8c2b7367a",
|
29 |
+
"metadata": {},
|
30 |
+
"outputs": [],
|
31 |
+
"source": [
|
32 |
+
"faker_mappings = {\n",
|
33 |
+
" # Personal information\n",
|
34 |
+
" \"name\": [\n",
|
35 |
+
" \"full name\", \"name\", \"name of user\", \"person name\", \"name of person\", \n",
|
36 |
+
" \"complete name\", \"user name\", \"customer name\", \"client name\"\n",
|
37 |
+
" ],\n",
|
38 |
+
" \"first_name\": [\n",
|
39 |
+
" \"first name\", \"user's first name\", \"first name of a person\", \"person first name\", \n",
|
40 |
+
" \"given name\", \"forename\", \"christian name\", '', ''\n",
|
41 |
+
" ],\n",
|
42 |
+
" \"last_name\": [\n",
|
43 |
+
" \"last name\", \"surname\", \"family name\", \"user's last name\", \n",
|
44 |
+
" \"last name of user\", \"person's surname\", '', '', ''\n",
|
45 |
+
" ],\n",
|
46 |
+
" \"password\": [\n",
|
47 |
+
" \"user password\", \"person password\", \"member password\", \"secret password\", \n",
|
48 |
+
" \"confidential password\", \"example password\", '', '', '' # all arrays have to be equal. \n",
|
49 |
+
" ]\n",
|
50 |
+
"}"
|
51 |
+
]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"cell_type": "code",
|
55 |
+
"execution_count": 3,
|
56 |
+
"id": "1d2919a4-2312-4e8c-a1d3-66c7042b2400",
|
57 |
+
"metadata": {},
|
58 |
+
"outputs": [
|
59 |
+
{
|
60 |
+
"name": "stderr",
|
61 |
+
"output_type": "stream",
|
62 |
+
"text": [
|
63 |
+
"Device set to use mps:0\n"
|
64 |
+
]
|
65 |
+
}
|
66 |
+
],
|
67 |
+
"source": [
|
68 |
+
"pipe = pipeline(model=\"google/tapas-base-finetuned-wtq\")"
|
69 |
+
]
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"cell_type": "code",
|
73 |
+
"execution_count": 4,
|
74 |
+
"id": "0a7ca742-273c-4530-a591-3012fe31cc09",
|
75 |
+
"metadata": {},
|
76 |
+
"outputs": [
|
77 |
+
{
|
78 |
+
"name": "stderr",
|
79 |
+
"output_type": "stream",
|
80 |
+
"text": [
|
81 |
+
"/opt/anaconda3/envs/llms/lib/python3.11/site-packages/transformers/models/tapas/tokenization_tapas.py:2699: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
82 |
+
" text = normalize_for_match(row[col_index].text)\n",
|
83 |
+
"/opt/anaconda3/envs/llms/lib/python3.11/site-packages/transformers/models/tapas/tokenization_tapas.py:1493: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
84 |
+
" cell = row[col_index]\n"
|
85 |
+
]
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"name": "stdout",
|
89 |
+
"output_type": "stream",
|
90 |
+
"text": [
|
91 |
+
"[{'answer': 'first name', 'coordinates': [(0, 1)], 'cells': ['first name'], 'aggregator': 'NONE'}, {'answer': 'full name', 'coordinates': [(0, 0)], 'cells': ['full name'], 'aggregator': 'NONE'}, {'answer': 'secret password, confidential password', 'coordinates': [(3, 3), (4, 3)], 'cells': ['secret password', 'confidential password'], 'aggregator': 'NONE'}, {'answer': 'full name', 'coordinates': [(0, 0)], 'cells': ['full name'], 'aggregator': 'NONE'}]\n"
|
92 |
+
]
|
93 |
+
}
|
94 |
+
],
|
95 |
+
"source": [
|
96 |
+
"result = pipe(query=['user first name', 'full name', \"secret password\", \"unique id\"], table=faker_mappings)\n",
|
97 |
+
"print(result)"
|
98 |
+
]
|
99 |
+
},
|
100 |
+
{
|
101 |
+
"cell_type": "code",
|
102 |
+
"execution_count": 5,
|
103 |
+
"id": "803584e5-e3e1-48df-b974-712f7c5e81f5",
|
104 |
+
"metadata": {},
|
105 |
+
"outputs": [],
|
106 |
+
"source": [
|
107 |
+
"# A function to lookup the faker function names based on coordinates\n",
|
108 |
+
"def lookup_keys_from_results(result, faker_mappings):\n",
|
109 |
+
" reverse_mapping = {}\n",
|
110 |
+
" for key, values in faker_mappings.items():\n",
|
111 |
+
" for value in values:\n",
|
112 |
+
" if value:\n",
|
113 |
+
" reverse_mapping[value.lower()] = key\n",
|
114 |
+
" \n",
|
115 |
+
" coordinate_to_key = {}\n",
|
116 |
+
" for item in result:\n",
|
117 |
+
" answer = item['answer'].lower()\n",
|
118 |
+
" \n",
|
119 |
+
" if ',' in answer:\n",
|
120 |
+
" answers = [a.strip() for a in answer.split(',')]\n",
|
121 |
+
" for a in answers:\n",
|
122 |
+
" if a in reverse_mapping:\n",
|
123 |
+
" for coord in item['coordinates']:\n",
|
124 |
+
" coordinate_to_key[coord] = reverse_mapping[a]\n",
|
125 |
+
" break\n",
|
126 |
+
" else:\n",
|
127 |
+
" if answer in reverse_mapping:\n",
|
128 |
+
" for coord in item['coordinates']:\n",
|
129 |
+
" coordinate_to_key[coord] = reverse_mapping[answer]\n",
|
130 |
+
" \n",
|
131 |
+
" return coordinate_to_key"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"cell_type": "code",
|
136 |
+
"execution_count": 6,
|
137 |
+
"id": "2025a892-e3e5-4efe-8822-2cbbd13f903c",
|
138 |
+
"metadata": {},
|
139 |
+
"outputs": [
|
140 |
+
{
|
141 |
+
"name": "stdout",
|
142 |
+
"output_type": "stream",
|
143 |
+
"text": [
|
144 |
+
"{(0, 1): 'first_name', (0, 0): 'name', (3, 3): 'password', (4, 3): 'password'}\n"
|
145 |
+
]
|
146 |
+
}
|
147 |
+
],
|
148 |
+
"source": [
|
149 |
+
"print(lookup_keys_from_results(result, faker_mappings))"
|
150 |
+
]
|
151 |
+
}
|
152 |
+
],
|
153 |
+
"metadata": {
|
154 |
+
"kernelspec": {
|
155 |
+
"display_name": "Python 3 (ipykernel)",
|
156 |
+
"language": "python",
|
157 |
+
"name": "python3"
|
158 |
+
},
|
159 |
+
"language_info": {
|
160 |
+
"codemirror_mode": {
|
161 |
+
"name": "ipython",
|
162 |
+
"version": 3
|
163 |
+
},
|
164 |
+
"file_extension": ".py",
|
165 |
+
"mimetype": "text/x-python",
|
166 |
+
"name": "python",
|
167 |
+
"nbconvert_exporter": "python",
|
168 |
+
"pygments_lexer": "ipython3",
|
169 |
+
"version": "3.11.11"
|
170 |
+
}
|
171 |
+
},
|
172 |
+
"nbformat": 4,
|
173 |
+
"nbformat_minor": 5
|
174 |
+
}
|
case_study/TextGeneration.ipynb
ADDED
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "b4614723-e45f-426e-a545-d0ed9685557d",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# Text Generation\n",
|
9 |
+
"\n",
|
10 |
+
"Experimenting with a Text Generation Pipeline"
|
11 |
+
]
|
12 |
+
},
|
13 |
+
{
|
14 |
+
"cell_type": "code",
|
15 |
+
"execution_count": 1,
|
16 |
+
"id": "b249c274-f0ab-43cb-9c44-b4de64045cac",
|
17 |
+
"metadata": {},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"from transformers import pipeline"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"execution_count": 2,
|
26 |
+
"id": "09955617-16bb-4d6e-8546-91549c3fe296",
|
27 |
+
"metadata": {},
|
28 |
+
"outputs": [
|
29 |
+
{
|
30 |
+
"data": {
|
31 |
+
"application/vnd.jupyter.widget-view+json": {
|
32 |
+
"model_id": "2bbb30f80b274ff0b8c50a075f0cdd73",
|
33 |
+
"version_major": 2,
|
34 |
+
"version_minor": 0
|
35 |
+
},
|
36 |
+
"text/plain": [
|
37 |
+
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
"metadata": {},
|
41 |
+
"output_type": "display_data"
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"name": "stderr",
|
45 |
+
"output_type": "stream",
|
46 |
+
"text": [
|
47 |
+
"Some parameters are on the meta device because they were offloaded to the disk.\n",
|
48 |
+
"Device set to use mps\n"
|
49 |
+
]
|
50 |
+
}
|
51 |
+
],
|
52 |
+
"source": [
|
53 |
+
"# Use a pre-quantized model from the Hub\n",
|
54 |
+
"pipe = pipeline(\n",
|
55 |
+
" \"text-generation\",\n",
|
56 |
+
" model=\"meta-llama/Llama-3.2-3B\",\n",
|
57 |
+
" device_map=\"auto\",\n",
|
58 |
+
" do_sample=True\n",
|
59 |
+
")"
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"cell_type": "code",
|
64 |
+
"execution_count": 4,
|
65 |
+
"id": "4e7f1608-b823-4cee-ad31-c9a9c5feae4d",
|
66 |
+
"metadata": {},
|
67 |
+
"outputs": [
|
68 |
+
{
|
69 |
+
"name": "stderr",
|
70 |
+
"output_type": "stream",
|
71 |
+
"text": [
|
72 |
+
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
|
73 |
+
]
|
74 |
+
},
|
75 |
+
{
|
76 |
+
"name": "stdout",
|
77 |
+
"output_type": "stream",
|
78 |
+
"text": [
|
79 |
+
"first_name\n",
|
80 |
+
"email\n",
|
81 |
+
"phone\n"
|
82 |
+
]
|
83 |
+
}
|
84 |
+
],
|
85 |
+
"source": [
|
86 |
+
"def get_faker_mapping(properties):\n",
|
87 |
+
" # Create a structured prompt\n",
|
88 |
+
" prompt = \"\"\"\n",
|
89 |
+
"You are a data helper.\n",
|
90 |
+
"\n",
|
91 |
+
"Function Names:\n",
|
92 |
+
"name\n",
|
93 |
+
"first_name\n",
|
94 |
+
"last_name\n",
|
95 |
+
"email\n",
|
96 |
+
"phone\n",
|
97 |
+
"address\n",
|
98 |
+
"city\n",
|
99 |
+
"state\n",
|
100 |
+
"zip_code\n",
|
101 |
+
"country\n",
|
102 |
+
"company\n",
|
103 |
+
"job_title\n",
|
104 |
+
"credit_card_number\n",
|
105 |
+
"date_of_birth\n",
|
106 |
+
"text\n",
|
107 |
+
"random_number\n",
|
108 |
+
"\n",
|
109 |
+
"EXAMPLE 1:\n",
|
110 |
+
"Properties:\n",
|
111 |
+
"fullName: Full name of person\n",
|
112 |
+
"businessEmail: Business email address\n",
|
113 |
+
"phoneNum: US phone number\n",
|
114 |
+
"\n",
|
115 |
+
"Response:\n",
|
116 |
+
"fullName: name\n",
|
117 |
+
"businessEmail: email\n",
|
118 |
+
"phoneNum: phone\n",
|
119 |
+
"\n",
|
120 |
+
"EXAMPLE 2:\n",
|
121 |
+
"Properties:\n",
|
122 |
+
"name: name of person\n",
|
123 |
+
"ccn: Credit card number\n",
|
124 |
+
"personalPhone: US phone number\n",
|
125 |
+
"\n",
|
126 |
+
"Response:\n",
|
127 |
+
"name: name\n",
|
128 |
+
"ccn: credit_card_number\n",
|
129 |
+
"personalPhone: phone\n",
|
130 |
+
"\n",
|
131 |
+
"EXAMPLE 3:\n",
|
132 |
+
"Properties:\n",
|
133 |
+
"firstName: first name of person\n",
|
134 |
+
"city: City of residence\n",
|
135 |
+
"personalPhone: US phone number\n",
|
136 |
+
"\n",
|
137 |
+
"Response:\n",
|
138 |
+
"firstName: first_name\n",
|
139 |
+
"city: city\n",
|
140 |
+
"personalPhone: phone\n",
|
141 |
+
"\n",
|
142 |
+
"Complete EXAMPLE 4:\n",
|
143 |
+
"Properties:\n",
|
144 |
+
"\"\"\"\n",
|
145 |
+
" # Add properties to prompt\n",
|
146 |
+
" for prop, desc in properties:\n",
|
147 |
+
" prompt += f\"{prop}: {desc}\\n\"\n",
|
148 |
+
" \n",
|
149 |
+
" prompt += \"\\nResponse:\"\n",
|
150 |
+
" \n",
|
151 |
+
" # Generate response\n",
|
152 |
+
" response = pipe(\n",
|
153 |
+
" prompt,\n",
|
154 |
+
" max_new_tokens=50\n",
|
155 |
+
" )\n",
|
156 |
+
" \n",
|
157 |
+
" result = response[0]['generated_text']\n",
|
158 |
+
" \n",
|
159 |
+
" # Logic to convert to a dict\n",
|
160 |
+
" response_text = result.split(\"Complete EXAMPLE 4:\")[1].split(\"Complete EXAMPLE 5:\")[0].split(\"Response:\")[1].strip()\n",
|
161 |
+
" \n",
|
162 |
+
" return {line.split(\":\", 1)[0].strip(): line.split(\":\", 1)[1].strip() for line in response_text.splitlines() if \":\" in line}\n",
|
163 |
+
" \n",
|
164 |
+
"# Example use\n",
|
165 |
+
"properties = [\n",
|
166 |
+
" (\"name\", \"users first name\"),\n",
|
167 |
+
" (\"email\", \"Business email address\"),\n",
|
168 |
+
" (\"phone\", \"US phone number\")\n",
|
169 |
+
"]\n",
|
170 |
+
"\n",
|
171 |
+
"result = get_faker_mapping(properties)\n",
|
172 |
+
"\n",
|
173 |
+
"for key, desc in properties:\n",
|
174 |
+
" print(result.get(key))\n"
|
175 |
+
]
|
176 |
+
}
|
177 |
+
],
|
178 |
+
"metadata": {
|
179 |
+
"kernelspec": {
|
180 |
+
"display_name": "Python 3 (ipykernel)",
|
181 |
+
"language": "python",
|
182 |
+
"name": "python3"
|
183 |
+
},
|
184 |
+
"language_info": {
|
185 |
+
"codemirror_mode": {
|
186 |
+
"name": "ipython",
|
187 |
+
"version": 3
|
188 |
+
},
|
189 |
+
"file_extension": ".py",
|
190 |
+
"mimetype": "text/x-python",
|
191 |
+
"name": "python",
|
192 |
+
"nbconvert_exporter": "python",
|
193 |
+
"pygments_lexer": "ipython3",
|
194 |
+
"version": "3.11.11"
|
195 |
+
}
|
196 |
+
},
|
197 |
+
"nbformat": 4,
|
198 |
+
"nbformat_minor": 5
|
199 |
+
}
|
case_study/ZeroShotClassification.ipynb
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "42aaa537-ab3b-46d2-a12b-d0355669bf26",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# ZeroShotClassification\n",
|
9 |
+
"\n",
|
10 |
+
"Experimenting with a ZeroShotClassification Pipeline.\n",
|
11 |
+
"\n",
|
12 |
+
"This works very good. Even \"login id\" was tagged as \"username\" at 20% confidence. Based on testing we will not consider below 18%."
|
13 |
+
]
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"cell_type": "code",
|
17 |
+
"execution_count": 1,
|
18 |
+
"id": "c8e72fc9-198e-43c6-8045-aaf811145e1c",
|
19 |
+
"metadata": {},
|
20 |
+
"outputs": [],
|
21 |
+
"source": [
|
22 |
+
"from transformers import pipeline\n",
|
23 |
+
"from faker import Faker"
|
24 |
+
]
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"cell_type": "code",
|
28 |
+
"execution_count": 2,
|
29 |
+
"id": "620682d6-fd51-47b7-b99b-a32434c28044",
|
30 |
+
"metadata": {},
|
31 |
+
"outputs": [],
|
32 |
+
"source": [
|
33 |
+
"fake = Faker()"
|
34 |
+
]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"cell_type": "code",
|
38 |
+
"execution_count": 3,
|
39 |
+
"id": "0c5d29b2-2d9f-4009-9f55-690af4be7ea7",
|
40 |
+
"metadata": {},
|
41 |
+
"outputs": [],
|
42 |
+
"source": [
|
43 |
+
"# Create a dictionary of Faker functions with descriptive labels\n",
|
44 |
+
"faker_functions = {\n",
|
45 |
+
" \"person name\": fake.name,\n",
|
46 |
+
" \"first name\": fake.first_name,\n",
|
47 |
+
" \"last name\": fake.last_name,\n",
|
48 |
+
" \"email address\": fake.email,\n",
|
49 |
+
" \"phone number\": fake.phone_number,\n",
|
50 |
+
" \"street address\": fake.street_address,\n",
|
51 |
+
" \"city name\": fake.city,\n",
|
52 |
+
" \"state name\": fake.state,\n",
|
53 |
+
" \"country name\": fake.country,\n",
|
54 |
+
" \"zip code\": fake.zipcode,\n",
|
55 |
+
" \"job title\": fake.job,\n",
|
56 |
+
" \"company name\": fake.company,\n",
|
57 |
+
" \"credit card number\": fake.credit_card_number,\n",
|
58 |
+
" \"date of birth\": fake.date_of_birth,\n",
|
59 |
+
" \"username\": fake.user_name,\n",
|
60 |
+
" \"website url\": fake.url,\n",
|
61 |
+
" \"paragraph text\": fake.paragraph,\n",
|
62 |
+
" \"sentence text\": fake.sentence\n",
|
63 |
+
"}"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"cell_type": "code",
|
68 |
+
"execution_count": 4,
|
69 |
+
"id": "998e0fa8-0ed5-4b39-8f5e-732628ace900",
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [
|
72 |
+
{
|
73 |
+
"name": "stderr",
|
74 |
+
"output_type": "stream",
|
75 |
+
"text": [
|
76 |
+
"Device set to use mps:0\n"
|
77 |
+
]
|
78 |
+
}
|
79 |
+
],
|
80 |
+
"source": [
|
81 |
+
"pipe = pipeline(model=\"facebook/bart-large-mnli\")"
|
82 |
+
]
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"cell_type": "code",
|
86 |
+
"execution_count": 5,
|
87 |
+
"id": "1a1d7010-1291-4e31-b525-83327b5e7c01",
|
88 |
+
"metadata": {},
|
89 |
+
"outputs": [],
|
90 |
+
"source": [
|
91 |
+
"result = pipe(\n",
|
92 |
+
" [\"The first name of a user\", \"login id\", \"full name of member\"],\n",
|
93 |
+
" candidate_labels=list(faker_functions.keys())\n",
|
94 |
+
")"
|
95 |
+
]
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"cell_type": "code",
|
99 |
+
"execution_count": 6,
|
100 |
+
"id": "d8cd7a50-aa79-48e4-aba9-7e78a4e64074",
|
101 |
+
"metadata": {},
|
102 |
+
"outputs": [],
|
103 |
+
"source": [
|
104 |
+
"def get_highest_score_functions(result, faker_functions, threshold=0.18):\n",
|
105 |
+
" sequence_to_function = {}\n",
|
106 |
+
" \n",
|
107 |
+
" for item in result:\n",
|
108 |
+
" sequence = item['sequence']\n",
|
109 |
+
" label = item['labels'][0]\n",
|
110 |
+
" score = item['scores'][0]\n",
|
111 |
+
" \n",
|
112 |
+
" if (score >= threshold):\n",
|
113 |
+
" sequence_to_function[sequence] = faker_functions.get(label)\n",
|
114 |
+
" else:\n",
|
115 |
+
" sequence_to_function[sequence] = None\n",
|
116 |
+
" \n",
|
117 |
+
" return sequence_to_function"
|
118 |
+
]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"cell_type": "code",
|
122 |
+
"execution_count": 7,
|
123 |
+
"id": "5c1989e2-99b8-4de3-b7f9-45c625405eb9",
|
124 |
+
"metadata": {},
|
125 |
+
"outputs": [
|
126 |
+
{
|
127 |
+
"data": {
|
128 |
+
"text/plain": [
|
129 |
+
"{'The first name of a user': <bound method Provider.first_name of <faker.providers.person.en_US.Provider object at 0x34cecc050>>,\n",
|
130 |
+
" 'login id': <bound method Provider.user_name of <faker.providers.internet.en_US.Provider object at 0x34ceb2cd0>>,\n",
|
131 |
+
" 'full name of member': <bound method Provider.name of <faker.providers.person.en_US.Provider object at 0x34cecc050>>}"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
"execution_count": 7,
|
135 |
+
"metadata": {},
|
136 |
+
"output_type": "execute_result"
|
137 |
+
}
|
138 |
+
],
|
139 |
+
"source": [
|
140 |
+
"get_highest_score_functions(result, faker_functions, threshold=0.18)"
|
141 |
+
]
|
142 |
+
},
|
143 |
+
{
|
144 |
+
"cell_type": "code",
|
145 |
+
"execution_count": null,
|
146 |
+
"id": "799de95c-3d08-4da9-8a84-c3bc81ab5928",
|
147 |
+
"metadata": {},
|
148 |
+
"outputs": [],
|
149 |
+
"source": []
|
150 |
+
}
|
151 |
+
],
|
152 |
+
"metadata": {
|
153 |
+
"kernelspec": {
|
154 |
+
"display_name": "Python 3 (ipykernel)",
|
155 |
+
"language": "python",
|
156 |
+
"name": "python3"
|
157 |
+
},
|
158 |
+
"language_info": {
|
159 |
+
"codemirror_mode": {
|
160 |
+
"name": "ipython",
|
161 |
+
"version": 3
|
162 |
+
},
|
163 |
+
"file_extension": ".py",
|
164 |
+
"mimetype": "text/x-python",
|
165 |
+
"name": "python",
|
166 |
+
"nbconvert_exporter": "python",
|
167 |
+
"pygments_lexer": "ipython3",
|
168 |
+
"version": "3.11.11"
|
169 |
+
}
|
170 |
+
},
|
171 |
+
"nbformat": 4,
|
172 |
+
"nbformat_minor": 5
|
173 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python-dotenv
|
2 |
+
huggingface-hub
|
3 |
+
transformers
|
4 |
+
faker
|
5 |
+
torch
|
6 |
+
jsonschema
|
7 |
+
gradio
|
utils/classify.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Utility for generating mock data functions based on text descriptions.
|
3 |
+
|
4 |
+
This utility uses a zero-shot classification model to match text descriptions
|
5 |
+
to appropriate Faker functions for generating synthetic data.
|
6 |
+
"""
|
7 |
+
|
8 |
+
from transformers import pipeline
|
9 |
+
from faker import Faker
|
10 |
+
|
11 |
+
fake = Faker()
|
12 |
+
|
13 |
+
# Create a dictionary of Faker functions with descriptive labels
|
14 |
+
faker_functions = {
|
15 |
+
"person name": fake.name,
|
16 |
+
"first name": fake.first_name,
|
17 |
+
"last name": fake.last_name,
|
18 |
+
"email address": fake.email,
|
19 |
+
"phone number": fake.phone_number,
|
20 |
+
"street address": fake.street_address,
|
21 |
+
"city name": fake.city,
|
22 |
+
"state name": fake.state,
|
23 |
+
"country name": fake.country,
|
24 |
+
"zip code": fake.zipcode,
|
25 |
+
"job title": fake.job,
|
26 |
+
"company name": fake.company,
|
27 |
+
"credit card number": fake.credit_card_number,
|
28 |
+
"date of birth": fake.date_of_birth,
|
29 |
+
"username": fake.user_name,
|
30 |
+
"website url": fake.url,
|
31 |
+
"paragraph text": fake.paragraph,
|
32 |
+
"sentence text": fake.sentence,
|
33 |
+
}
|
34 |
+
|
35 |
+
|
36 |
+
def get_highest_score_functions(result, faker_functions, threshold=0.18):
    """
    Map each classified sequence to the Faker function of its top label.

    Args:
        result: Zero-shot classification output; each entry carries the
            original "sequence" plus "labels"/"scores" sorted best-first.
        faker_functions: Dictionary mapping candidate labels to Faker callables.
        threshold: Minimum confidence for the top label to count (default: 0.18)

    Returns:
        dict: Mapping of input sequences to Faker functions; None when the
        best score falls below the threshold (or the label is unknown).
    """
    return {
        entry["sequence"]: (
            faker_functions.get(entry["labels"][0])
            if entry["scores"][0] >= threshold
            else None
        )
        for entry in result
    }
|
58 |
+
|
59 |
+
|
60 |
+
def get_functions_for_descriptions(descriptions):
    """
    Get mock data functions based on descriptions or property names.

    Uses zero-shot classification to match text descriptions to appropriate
    Faker functions for generating synthetic data.

    Args:
        descriptions: Array of descriptions or property names to classify

    Returns:
        dict: Mapping of descriptions to corresponding mock data functions
        (None for descriptions classified below the confidence threshold)
    """
    # PERF FIX: the original constructed the classification pipeline — a
    # full BART model load from disk — on every call. Build it lazily once
    # and cache it on the function object for all subsequent calls.
    classifier = getattr(get_functions_for_descriptions, "_pipe", None)
    if classifier is None:
        # Facebook's BART model fine-tuned on MNLI for zero-shot classification
        classifier = pipeline(model="facebook/bart-large-mnli")
        get_functions_for_descriptions._pipe = classifier

    # Classify every description against the available Faker function labels
    result = classifier(descriptions, candidate_labels=list(faker_functions.keys()))

    # Keep only confident matches (threshold chosen in the case-study notebooks)
    return get_highest_score_functions(result, faker_functions, threshold=0.18)
|
utils/extract.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def extract_descriptions(schema, path=""):
    """
    Recursively collect every description in a JSON schema.

    Non-object schemas contribute their "description" value; when a
    non-object schema has none, the final segment of its path (usually the
    property name) is used instead. Object schemas only contribute through
    their children.

    Args:
        schema (dict): The JSON schema to walk
        path (str): Dotted path to the current sub-schema ("" at the root)

    Returns:
        list: Descriptions (or property-name fallbacks) in traversal order
    """
    found = []

    # Leaf-level description, or the last path segment as a fallback.
    # The root element ("" path) with no description contributes nothing.
    if schema.get("type") != "object":
        if "description" in schema:
            found.append(schema["description"])
        elif path:
            found.append(path.rsplit(".", 1)[-1])

    # Recurse into each named property.
    for prop_name, prop_schema in schema.get("properties", {}).items():
        child_path = f"{path}.{prop_name}" if path else prop_name
        found += extract_descriptions(prop_schema, child_path)

    # Recurse into array item schemas.
    items_schema = schema.get("items")
    if isinstance(items_schema, dict):
        found += extract_descriptions(items_schema, f"{path}[]" if path else "items")

    # Recurse into each combinator branch.
    for combinator in ("oneOf", "anyOf", "allOf"):
        branches = schema.get(combinator)
        if isinstance(branches, list):
            prefix = f"{path}." if path else ""
            for index, branch in enumerate(branches):
                found += extract_descriptions(branch, f"{prefix}{combinator}[{index}]")

    # Recurse into additionalProperties when it is itself a schema.
    extra = schema.get("additionalProperties")
    if isinstance(extra, dict):
        extra_path = f"{path}.additionalProperties" if path else "additionalProperties"
        found += extract_descriptions(extra, extra_path)

    return found
|
51 |
+
|
52 |
+
|
53 |
+
# Example usage:
if __name__ == "__main__":
    # Demo schema mixing described and undescribed properties.
    sample_schema = {
        "type": "object",
        "description": "A person object",
        "properties": {
            "name": {"type": "string", "description": "The person's full name"},
            # No description for age, will use property name
            "age": {"type": "integer"},
            "address": {
                "type": "object",
                "description": "The person's address",
                "properties": {
                    "street": {
                        "type": "string",
                        "description": "Street name and number",
                    },
                    # No description for city, will use property name
                    "city": {"type": "string"},
                },
            },
            "hobbies": {
                "type": "array",
                "description": "List of hobbies",
                "items": {"type": "string", "description": "A hobby name"},
            },
        },
    }

    print("Extracted descriptions:")
    for extracted in extract_descriptions(sample_schema):
        print(f"- {extracted}")
|
utils/fake.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
|
3 |
+
def generate_mock_data(schema, faker_mapping):
    """
    Generate mock data based on a JSON schema.

    Objects recurse, integers and booleans use local random logic, arrays
    produce a short list built from their "items" schema, and every other
    type is produced by the matching faker function (or None if unmapped).

    Args:
        schema (dict): The JSON schema
        faker_mapping (dict): Mapping of descriptions/property names to faker functions

    Returns:
        dict: Generated mock data
    """
    result = {}

    if "properties" in schema:
        for prop_name, prop_schema in schema["properties"].items():
            result[prop_name] = _generate_value(prop_name, prop_schema, faker_mapping)

    return result


def _generate_value(prop_name, prop_schema, faker_mapping):
    """Generate one mock value for a single property schema."""
    prop_type = prop_schema.get("type")

    if prop_type == "object":
        # Recursively process nested objects.
        return generate_mock_data(prop_schema, faker_mapping)

    if prop_type == "integer":
        return _generate_integer(prop_schema)

    if prop_type == "boolean":
        return random.choice([True, False])  # Randomly pick True or False

    if prop_type == "array":
        # Fix: arrays previously fell through to the faker lookup and always
        # produced None. Build a short list from the "items" schema instead.
        items_schema = prop_schema.get("items")
        if isinstance(items_schema, dict):
            return [
                _generate_value(prop_name, items_schema, faker_mapping)
                for _ in range(random.randint(1, 3))
            ]
        return []

    # For remaining types, look for a faker function by description first,
    # then by property name (matching how descriptions were extracted).
    description = prop_schema.get("description", "")
    if description in faker_mapping:
        faker_func = faker_mapping[description]
    elif prop_name in faker_mapping:
        faker_func = faker_mapping[prop_name]
    else:
        faker_func = None

    # Call the faker function if found, otherwise fall back to None.
    return faker_func() if faker_func else None


def _generate_integer(prop_schema):
    """Generate a random integer honoring optional minimum/maximum bounds."""
    minimum = prop_schema.get("minimum")
    maximum = prop_schema.get("maximum")

    if minimum is not None and maximum is not None:
        return random.randint(minimum, maximum)
    if minimum is not None:
        return random.randint(minimum, minimum + 100)  # Arbitrary upper bound
    if maximum is not None:
        return random.randint(0, maximum)  # Assume 0 as lower bound
    return random.randint(0, 100)  # Default range
|
59 |
+
|
60 |
+
|
61 |
+
# Example usage:
if __name__ == "__main__":
    from faker import Faker

    fake = Faker()

    # Demo schema with nested object and an undescribed integer.
    sample_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string", "description": "The person's full name"},
            "age": {"type": "integer"},
            "address": {
                "type": "object",
                "properties": {
                    "street": {
                        "type": "string",
                        "description": "Street name and number",
                    },
                    "city": {"type": "string"},
                },
            },
        },
    }

    # Map descriptions/property names to Faker generators.
    faker_mapping = {
        "The person's full name": fake.name,
        "age": fake.random_int,
        "Street name and number": fake.street_address,
        "city": fake.city,
    }

    # Generate and show one mock record.
    print(generate_mock_data(sample_schema, faker_mapping))
|
utils/serialize_json.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import datetime
|
3 |
+
from decimal import Decimal
|
4 |
+
from uuid import UUID
|
5 |
+
|
6 |
+
|
7 |
+
class CustomJSONEncoder(json.JSONEncoder):
    """
    Custom JSON encoder that handles various Python types that aren't JSON
    serializable by default:
    - datetime.datetime, datetime.date, datetime.time
    - Decimal
    - UUID
    - set and frozenset
    - Any object with a to_json method
    - Custom objects with a __dict__ attribute
    """

    def default(self, obj):
        # datetime/date/time all serialize to ISO-8601 strings.
        if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
            return obj.isoformat()

        # Decimal becomes a JSON number (may lose exactness).
        if isinstance(obj, Decimal):
            return float(obj)

        # UUIDs serialize to their canonical string form.
        if isinstance(obj, UUID):
            return str(obj)

        # Fix: frozenset was previously rejected even though the class
        # advertises set support; both set flavors become lists.
        if isinstance(obj, (set, frozenset)):
            return list(obj)

        # Objects may define their own serialization hook.
        if hasattr(obj, "to_json"):
            return obj.to_json()

        # Fall back to the instance attribute dict when available.
        try:
            return obj.__dict__
        except AttributeError:
            pass

        # Let the base class raise TypeError for anything else.
        return super().default(obj)
|
51 |
+
|
52 |
+
|
53 |
+
def serialize_to_json(data, pretty=False):
    """
    Serialize Python data to a JSON string, handling various Python types.

    Args:
        data: The Python object to serialize
        pretty (bool): Whether to indent the JSON output for readability

    Returns:
        str: JSON string representation of the data
    """
    # Pretty output uses 2-space indentation; compact output uses none.
    return json.dumps(data, cls=CustomJSONEncoder, indent=2 if pretty else None)
|
utils/validate.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from jsonschema import validate, exceptions
|
3 |
+
|
4 |
+
# Minimal meta-schema used to sanity-check user-supplied schemas: accepts
# only top-level objects that declare a "type" string and a "properties"
# object (the two keys the generator relies on).
meta_schema = {
    "type": "object",
    "required": ["type", "properties"],
    "properties": {"type": {"type": "string"}, "properties": {"type": "object"}},
}
|
9 |
+
|
10 |
+
|
11 |
+
def validate_schema_structure(schema_str):
    """
    Parse and validate a JSON Schema object.

    Args:
        schema_str: A valid JSON schema object serialized to a string

    Returns:
        tuple: (bool, object) where:
            - First element is a boolean indicating success (True) or failure (False)
            - Second element is either the validated schema object (on success)
              or an error message string (on failure)
    """
    # Single try block so any unexpected failure (parse or validation)
    # is converted into an error tuple rather than propagating.
    try:
        parsed = json.loads(schema_str)
        validate(instance=parsed, schema=meta_schema)
        return True, parsed
    except exceptions.ValidationError as err:
        return False, f"Schema structure validation error: {err}"
    except json.JSONDecodeError as err:
        return False, f"Invalid JSON syntax: {err}"
    except Exception as err:
        return False, f"Unexpected error: {err}"
|