Spaces:

DeepJudge
/

Applicant-Task-Submission

Running

App Files Files

Applicant-Task-Submission / app.py

Timothy-Vinzent

Update app.py

d7b6b69 verified about 1 month ago

raw

history blame

24.5 kB

	import os
	import re
	import json
	import gradio as gr
	from openai import OpenAI
	import gspread
	from google.oauth2.service_account import Credentials

	# OAuth scopes requested for the Google service account: Sheets and Drive access.
	SCOPES = [
	"https://www.googleapis.com/auth/spreadsheets",
	"https://www.googleapis.com/auth/drive"
	]

	# Model identifier and sampling temperature shared by every chat-completion call in this file.
	MODEL_NAME = "gpt-4o-mini" # Ensure this matches your deployed model.
	TEMPERATURE = 0.2

	# Initialize the OpenAI client with the API key from environment variables.
	# NOTE: os.environ[...] raises KeyError at import time when OPENAI_API_KEY is not set.
	client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

	# In-memory set to track submitted emails (this resets when the app restarts).
	submitted_emails = set()

	def get_google_sheet():
	    """
	    Open the "Submissions" worksheet of the configured spreadsheet.

	    The service-account key is read as JSON from the "GOOGLE_CREDS_JSON"
	    environment variable, and the spreadsheet is looked up by the key held
	    in the "SPREADSHEET_ID" environment variable.
	    """
	    service_account_info = json.loads(os.environ["GOOGLE_CREDS_JSON"])
	    credentials = Credentials.from_service_account_info(service_account_info, scopes=SCOPES)
	    spreadsheet = gspread.authorize(credentials).open_by_key(os.environ["SPREADSHEET_ID"])
	    return spreadsheet.worksheet("Submissions")

	def get_evaluation_questions():
	    """
	    Loads evaluation questions, documents and expected answers from environment variables.

	    Expected environment variables:
	    - TEST_QUESTION_1: a JSON array of user queries.
	    - TEST_DOCUMENTS_1: a JSON array with one documents entry per question.
	    - TEST_EXPECTED_1: a JSON array with one expected output per question.

	    All three arrays must be of equal length.

	    Returns a list of dicts with the keys "question", "expected" and "docs".
	    An empty list is returned when a variable is missing or empty, is not
	    valid JSON, does not decode to a JSON array, or when the lengths differ.
	    """
	    # (label used in log messages, environment variable name), in parse order.
	    sources = (
	        ("questions", "TEST_QUESTION_1"),
	        ("expected answers", "TEST_EXPECTED_1"),
	        ("documents", "TEST_DOCUMENTS_1"),
	    )

	    raw_values = [os.environ.get(variable) for _, variable in sources]
	    if not all(raw_values):
	        return []

	    parsed = []
	    for (label, _), raw in zip(sources, raw_values):
	        try:
	            value = json.loads(raw)
	        except ValueError as e:  # json.JSONDecodeError is a ValueError subclass.
	            print(f"Error parsing {label}: {str(e)}")
	            return []
	        # A dict or string would be zipped key-by-key / character-by-character
	        # below and silently produce nonsense items, so reject it here.
	        if not isinstance(value, list):
	            print(f"Error parsing {label}: expected a JSON array.")
	            return []
	        parsed.append(value)

	    questions_list, expected_list, docs_list = parsed

	    # Ensure all lists are of the same length.
	    if not (len(questions_list) == len(expected_list) == len(docs_list)):
	        print("Mismatch in length: questions, expected answers and documents lists must have the same length.")
	        return []

	    return [
	        {"question": q, "expected": e, "docs": d}
	        for q, e, d in zip(questions_list, expected_list, docs_list)
	    ]

	# Load evaluation questions once at startup (module import time).
	# The result is an empty list when get_evaluation_questions() cannot load them.
	EVALUATION_QUESTIONS = get_evaluation_questions()

	def sanitize_input(text):
	    """
	    Drops every character outside the whitelist (letters a-z/A-Z, digits,
	    whitespace and the punctuation . , ! ? @ : -), trims surrounding
	    whitespace, and caps the result at 500 characters.
	    """
	    disallowed = re.compile(r"[^a-zA-Z0-9\s.,!?@:\-]")
	    trimmed = disallowed.sub("", text).strip()
	    return trimmed[:500]

	def sanitize_prompt(text):
	    """
	    Trims surrounding whitespace from a system prompt and keeps at most
	    the first 8000 characters.
	    """
	    trimmed = text.strip()
	    return trimmed[:8000]

	def validate_email(email):
	    """
	    Validates that the provided email is in a valid format.

	    The entire string must match the pattern. re.fullmatch is used because
	    "$" under re.match also matches just before a trailing newline, which
	    would accept an address followed by a line break.

	    Returns True if valid, False otherwise (including for non-string input).
	    """
	    if not isinstance(email, str):
	        return False
	    email_regex = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
	    return re.fullmatch(email_regex, email) is not None


	def _chat(system_prompt, user_content):
	    """Sends one system + user exchange to the model and returns the stripped reply text."""
	    response = client.chat.completions.create(
	        model=MODEL_NAME,  # Ensure this model identifier matches your deployed model.
	        messages=[
	            {"role": "system", "content": system_prompt},
	            {"role": "user", "content": user_content},
	        ],
	        temperature=TEMPERATURE,
	    )
	    return response.choices[0].message.content.strip()


	def _run_pipeline(question, docs, system_prompt_1, system_prompt_2, system_prompt_3):
	    """
	    Runs the LLM1 -> LLM2 -> LLM3 chain for a single question.

	    Returns a tuple (output1, answer, not_relevant):
	    - output1: the LLM1 reply (or an error message if that call failed).
	    - answer: the text between the <user_message> tags when LLM1 flagged the
	      query as irrelevant, otherwise the LLM3 reply (or an error message).
	    - not_relevant: True when LLM1's reply contained both user-message tags.
	    """
	    start_tag = "<user_message>"
	    end_tag = "</user_message>"

	    try:
	        output1 = _chat(system_prompt_1, question)
	    except Exception as e:
	        output1 = f"Error during OpenAI API call: {str(e)}"

	    # Both tags present: the query is deemed irrelevant and the chain stops here.
	    if start_tag in output1 and end_tag in output1:
	        start_index = output1.index(start_tag) + len(start_tag)
	        end_index = output1.index(end_tag)
	        return output1, output1[start_index:end_index].strip(), True

	    # No tags: treat the entire LLM1 reply as context for LLM2.
	    output1 = output1.strip()
	    extracted = []
	    for doc in docs:
	        try:
	            extracted.append(
	                _chat(system_prompt_2, f"Target company context: \n{output1} \n\n Paragraph:\n {doc}")
	            )
	        except Exception as e:
	            extracted.append(f"Error processing document: {str(e)}")

	    # Per-paragraph results are joined with newlines as input for LLM3.
	    output2 = "\n".join(extracted).strip()
	    try:
	        answer = _chat(system_prompt_3, f"Extracted information: \n{output2}")
	    except Exception as e:
	        answer = f"Error during final OpenAI API call: {str(e)}"
	    return output1, answer, False


	def _grade_answer(expected, answer, not_relevant):
	    """
	    Compares one pipeline answer with the expected output.

	    Returns a tuple (verdict, is_correct, answer_text) where answer_text is the
	    answer as it should be stored (normalized JSON once the answer parsed as an object).
	    """
	    required_keys = ["buyer_firm", "seller_firm", "third_party", "contains_target_firm"]

	    # When the expected output is a string, it indicates that the query was irrelevant.
	    if isinstance(expected, str):
	        if not_relevant:
	            return "Correct", True, answer
	        return "Incorrect (Query was irrelevant, but no user message found)", False, answer

	    # The expected output is a JSON object but LLM1 produced a user message.
	    if not_relevant:
	        return "Incorrect (Query was relevant, but user message found)", False, answer

	    try:
	        parsed_answer = json.loads(answer)
	    except json.JSONDecodeError as e:
	        return f"Incorrect (Invalid JSON: {str(e)})", False, answer

	    # Valid JSON that is not an object (e.g. 42 or a quoted string) cannot hold
	    # the required keys; without this guard the key checks below raise TypeError.
	    if not isinstance(parsed_answer, dict):
	        return "Incorrect (Answer is not a JSON object)", False, answer

	    normalized = json.dumps(parsed_answer)

	    # Verify that all required keys are present.
	    missing_keys = [key for key in required_keys if key not in parsed_answer]
	    if missing_keys:
	        return f"Incorrect (Missing Keys: {', '.join(missing_keys)})", False, normalized

	    # Compare values for each required key.
	    incorrect_values = [key for key in required_keys if parsed_answer[key] != expected[key]]
	    if len(incorrect_values) > 1:
	        verdict = f"Incorrect (Values for keys {', '.join([repr(k) for k in incorrect_values])} are incorrect)"
	        return verdict, False, normalized
	    if len(incorrect_values) == 1:
	        return f"Incorrect (Value for key '{incorrect_values[0]}' is incorrect)", False, normalized
	    return "Correct", True, normalized


	def submit_prompt(email, name, system_prompt_1, system_prompt_2, system_prompt_3):
	    """
	    Handles the full submission process:
	    - Trims and validates the email address.
	    - Checks if the email has already been used (by in-memory set and Google Sheet).
	    - Sanitizes the name and the three system prompts.
	    - Reserves the email, then runs the three prompts against each evaluation
	      question using the OpenAI API and grades the result.
	    - Appends the submission as a new row in the Google Sheet with columns:
	      Name, Email, System Prompt, Score, and for each test question: verdict and answer.

	    Returns a short message: a validation or error message on failure, otherwise
	    an acknowledgement. Per-question details are only written to the sheet.
	    """
	    # Longest address that fits the SMTP path limit (RFC 5321).
	    max_email_length = 254

	    # Normalize first so that the value validated, compared and stored is the same.
	    email = (email or "").strip()
	    if len(email) > max_email_length or not validate_email(email):
	        return "Invalid email address. Please enter a valid email."

	    # Check if this email has already been submitted (in-memory).
	    if email in submitted_emails:
	        return f"Submission already received for {email}. You can only submit once."

	    # Connect to Google Sheet and check if the email already exists.
	    try:
	        sheet = get_google_sheet()
	        email_col = sheet.col_values(2)  # Assumes column 2 contains the email addresses.
	    except Exception as e:
	        print(f"Error accessing Google Sheet: {str(e)}")
	        return f"Error accessing Google Sheet: {str(e)}"

	    # Skip header row. Rows written by earlier versions of this function hold
	    # the sanitize_input() form of the address (which drops "_" and "+"), so
	    # compare against that form as well.
	    previous_emails = email_col[1:]
	    if email in previous_emails or sanitize_input(email) in previous_emails:
	        return f"Submission already received for {email}. You can only submit once."

	    # The validated address only contains characters allowed by validate_email,
	    # so it is stored as entered; sanitize_input would corrupt it.
	    name = sanitize_input(name or "")
	    system_prompt_1 = sanitize_prompt(system_prompt_1 or "")
	    system_prompt_2 = sanitize_prompt(system_prompt_2 or "")
	    system_prompt_3 = sanitize_prompt(system_prompt_3 or "")

	    # Reserve the email before the slow evaluation so that a second click made
	    # while this request is still running is rejected by the in-memory check.
	    submitted_emails.add(email)
	    saved = False
	    try:
	        score = 0
	        verdicts = []  # For storing each question's verdict in the sheet.
	        answers_list = []  # For storing each question's answer in the sheet.

	        # Process each evaluation question.
	        for item in EVALUATION_QUESTIONS:
	            docs = item["docs"].split("---") if item["docs"] else []
	            output1, answer, not_relevant = _run_pipeline(
	                item["question"], docs, system_prompt_1, system_prompt_2, system_prompt_3
	            )
	            verdict, is_correct, answer_text = _grade_answer(item["expected"], answer, not_relevant)
	            if is_correct:
	                score += 1
	            verdicts.append(verdict)
	            answers_list.append(f"{output1}\n --- \n{answer_text}\n")

	        system_prompt = f"{system_prompt_1}\n---\n{system_prompt_2}\n---\n{system_prompt_3}"

	        # Prepare the row for Google Sheets:
	        # Name, Email, System Prompt, Score, then for each test question: Verdict, Answer.
	        row = [name, email, system_prompt, str(score)]
	        for v, a in zip(verdicts, answers_list):
	            row.extend([v, a])

	        # Append the new row to the Google Sheet.
	        try:
	            sheet.append_row(row)
	        except Exception as e:
	            print(f"Error appending row to Google Sheet: {str(e)}")
	            return f"Error saving submission: {str(e)}"
	        saved = True
	    finally:
	        # Nothing was stored: release the reservation so the applicant can retry.
	        if not saved:
	            submitted_emails.discard(email)

	    return (
	        f"Thank you for your submission, {name}!\n\n"
	    )

	def build_interface():
	    """
	    Constructs the Gradio interface: task description, example workflow,
	    instructions, the submission form and its click handler.

	    The submit button is only disabled after a successful submission; when
	    submit_prompt returns a validation or error message the button stays
	    active so the applicant can correct the input and try again.

	    Returns the gr.Blocks instance (not yet launched).
	    """
	    with gr.Blocks() as demo:
	        gr.Markdown("""
	# Applicant Task: Target Company & Law Firm Identification

	This task involves processing a user query to determine the relevance to the intended task, followed by analyzing textual data to extract information about law firms representing parties (Buyer, Seller, and Third Parties) and verifying the presence of a target company. For reference, see this sample agreement: [SEC Agreement Example](https://www.sec.gov/Archives/edgar/data/28452/000119312505012401/dex101.htm)

	> Note:
	> This evaluation system uses the `gpt-4o-mini` model with a temperature setting of `0.2` for all LLM steps.

	The system is designed to sequentially leverage three LLM functions:

	### Step 1: LLM1
	- Determines if the user's query mentions any target company.
	- If no target company is found, LLM1 responds with a message wrapped in `<user_message></user_message>` XML tags to inform the user that the query is irrelevant to this task.
	- If the query contains a target company, LLM1 moves forward with a formatted acknowledgment of the identified target company.

	### Step 2: LLM2
	- Examines four separate paragraphs independently.
	- For each paragraph, extracts:
	- Buyer's representative law firm
	- Seller's representative law firm
	- Any third-party law firm present
	- Whether the target company is mentioned in the paragraph
	- Each paragraph's results are formatted and concatenated for the next step.

	### Step 3: LLM3
	- Compiles the information from all analyzed paragraphs and outputs a structured JSON object:

	```json
	{
	"buyer_firm": "string",
	"seller_firm": "string",
	"third_party": "string",
	"contains_target_firm": boolean
	}
	```

	| Field | Default Value if Missing | Type |
	| ---------------------- | ------------------------ | --------- |
	| `buyer_firm` | `"unknown"` | `string` |
	| `seller_firm` | `"unknown"` | `string` |
	| `third_party` | `"unknown"` | `string` |
	| `contains_target_firm` | `false` | `boolean` |

	The goal is to identify the representative law firms of involved parties and determine if the target company is mentioned, ensuring the results are structured and accurate.

	---

	Key Considerations:
	- The output must adhere to the prescribed JSON format for the final step.
	- Ensure the system can accurately extract and classify relevant information from the input paragraphs.
	""")
	        gr.Image("mermaid_chart.png", label="LLM Flowchart")

	        # Example Inputs and Outputs in an Accordion
	        with gr.Accordion("Example Workflow", open=False):
	            gr.Markdown("""
	User Query:
	```
	Is Kirkland & Ellis present in the agreement?
	```

	Document Provided:

	Paragraph 1:
	```
	This Stock and Asset Purchase Agreement is entered into as of October 28, 2021, among Purolite Corporation, a Delaware corporation, along with Stefan E. Brodie and Don B. Brodie (collectively referred to as the Sellers), and Ecolab Inc., a Delaware corporation, as the Purchaser. Additionally, Gibson, Dunn & Crutcher LLP, as an independent third-party representative, is engaged for specific advisory roles outlined in this Agreement.
	```

	Paragraph 2:
	```
	This Agreement shall be governed by and construed in accordance with the internal laws of the State of Delaware, without giving effect to any choice or conflict of law provision. Each clause within this Agreement shall be interpreted independently, and the invalidity of one clause shall not affect the enforceability of the remaining provisions. Headings are for convenience only and shall not affect the interpretation of this Agreement. Nothing herein shall be construed as limiting or waiving any rights or obligations under applicable law unless expressly stated.
	```

	Paragraph 3:
	```
	Such notices, demands, and other communications shall be directed to the Parties at their respective addresses. One Party may be contacted at:
	1 Ecolab Place
	St. Paul, Minnesota 55102
	Attention: General Counsel
	with a copy (which shall not constitute notice) to:
	Shearman & Sterling LLP
	599 Lexington Avenue
	New York, New York 10022
	Attention: Adam Miller
	Another Party may be reached at:
	Purolite Corporation
	2201 Renaissance Boulevard
	King of Prussia, Pennsylvania 19406
	Attention: Stefan E. Brodie; Howard Brodie
	with a copy (which shall not constitute notice) to:
	Cleary Gottlieb Steen & Hamilton LLP
	One Liberty Plaza
	New York, New York 10006
	Attention: John Reynolds; Sarah Lee
	Additional communications relating to the role of the third-party representative shall be directed to:
	Gibson, Dunn & Crutcher LLP
	200 Park Avenue
	New York, New York 10166
	Attention: Jane Smith
	```

	Paragraph 4:
	```
	All references to the singular include the plural and vice versa, and all references to any gender include all genders. The Parties agree that any ambiguities in the language of this Agreement shall not be construed against either Party. Section headings used in this Agreement are for reference only and shall not affect the meaning or interpretation of any provision.
	```

	---

	Expected Steps and Outputs:

	Step 1 (LLM1):
	- If no target company is identified:
	```
	<user_message>Query is not relevant to the intended task.</user_message>
	```
	- If a target company is identified:
	```
	The target company is Kirkland & Ellis LLP.
	```

	Step 2 (LLM2 for Paragraphs):
	- Example Input:
	```
	This Stock and Asset Purchase Agreement is entered into as of October 28, 2021, among Purolite Corporation, a Delaware corporation, along with Stefan E. Brodie and Don B. Brodie (collectively referred to as the Sellers), and Ecolab Inc., a Delaware corporation, as the Purchaser. Additionally, Gibson, Dunn & Crutcher LLP, as an independent third-party representative, is engaged for specific advisory roles outlined in this Agreement.
	```

	- Example Output:
	```
	Buyer: Ecolab Inc.
	Buyer Representative: Not stated
	Seller: Purolite Corporation
	Seller Representative: Not stated
	Third-Party Representation: Advisory roles, Gibson, Dunn & Crutcher LLP
	Target Company Mentioned: No
	```

	Step 3 (LLM3 Final Output):
	- Compiled JSON:
	```json
	{
	"buyer_firm": "Shearman & Sterling LLP",
	"seller_firm": "Cleary Gottlieb Steen & Hamilton LLP",
	"third_party": "Gibson, Dunn & Crutcher LLP",
	"contains_target_firm": false
	}
	```

	""")

	        # Challenge instructions and testing guidance
	        with gr.Accordion("Task Instructions and Testing", open=False):
	            gr.Markdown("""
	---
	Task Instructions:
	- Design prompts that ensure proper interaction between the three LLM systems, with each step contributing to the final output.
	- Ensure strict adherence to JSON formatting requirements (e.g., no extra characters that may cause JSON parsing errors).
	- Test extensively to verify accurate law firm and target company identification.

	Output Requirements:
	- Ensure final LLM3 JSON output has the following keys:
	- `"buyer_firm"`
	- `"seller_firm"`
	- `"third_party"`
	- `"contains_target_firm"`
	- Values must be accurately extracted or classified based on LLM2's parsed data.

	Hints for Crafting System Prompts:
	- Explicitly specify formatting requirements at each step.
	- Clarify the task definitions and expected classifications in each system prompt for LLM1, LLM2, and LLM3.
	- Test using diverse sample data for robustness.
	---
	""")

	        gr.Markdown("""
	---
	### Submission Instructions

	Enter your name and email below, as listed in your CV, and submit your designed prompts.

	You can only submit once, so validate your system prompts thoroughly using mock queries and example data before final submission.

	Good Luck!

	_Remember: Focus on clarity, accuracy, and structured responses to achieve a high score!_
	---
	""")

	        email_input = gr.Textbox(label="Email", placeholder="your.email@example.com")
	        name_input = gr.Textbox(label="First Name, Last Name", placeholder="John, Smith")
	        system_prompt_input_1 = gr.Textbox(
	            label="System Prompt for LLM1",
	            placeholder="Enter your system prompt here...",
	            lines=6,
	        )

	        system_prompt_input_2 = gr.Textbox(
	            label="System Prompt for LLM2",
	            placeholder="Enter your system prompt here...",
	            lines=10,
	        )

	        system_prompt_input_3 = gr.Textbox(
	            label="System Prompt for LLM3",
	            placeholder="Enter your system prompt here...",
	            lines=6,
	        )
	        gr.Markdown("""
	<div style="background-color:#fff7e6; padding:16px; border-radius:8px; border:1px solid #ffe5b4; margin-bottom:1em;">
	<b>⏳ Please note:</b><br>
	Submitting may take up to <b>120 seconds</b>.<br>
	<strong>After clicking <span style='color:#006ce1;'>Submit</span>, please wait and <span style='color:crimson;'>do not press it again</span>.</strong>
	</div>
	""")

	        submit_button = gr.Button("Submit")
	        output_text = gr.Textbox(label="Results", lines=15)
	        feedback_md = gr.Markdown("", visible=False)

	        def submit_and_disable(email, name, s1, s2, s3):
	            """
	            Runs the submission and returns updates for (results box, submit
	            button, feedback markdown).
	            """
	            message = submit_prompt(email, name, s1, s2, s3)

	            # submit_prompt reports every failure as a plain message; only its
	            # acknowledgement starts with this prefix.
	            if message.startswith("Thank you for your submission"):
	                # Feedback to be shown in the Markdown field
	                feedback = "✅ Submission received! Thank you.<br>Please wait for results to appear above. You can close the page."
	                return message, gr.update(interactive=False), gr.update(value=feedback, visible=True)

	            # Nothing was recorded: keep the button usable so the input can be
	            # corrected without reloading the page.
	            feedback = "⚠️ Your submission was not recorded. Please check the message in the Results box."
	            return message, gr.update(interactive=True), gr.update(value=feedback, visible=True)

	        submit_button.click(
	            fn=submit_and_disable,
	            inputs=[email_input, name_input, system_prompt_input_1, system_prompt_input_2, system_prompt_input_3],
	            outputs=[output_text, submit_button, feedback_md],
	        )

	    return demo

	if __name__ == "__main__":
	    # Bind to all network interfaces on port 7860 so the app can be reached
	    # from outside the host it runs on (e.g., in a container).
	    build_interface().launch(server_name="0.0.0.0", server_port=7860)