import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import logging

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

SYSTEM_INSTRUCTION = """Convert natural language queries into boolean search queries by following these rules:

1. FIRST: Remove all meta-terms from this list (they should NEVER appear in output):
   - articles, papers, research, studies
   - examining, investigating, analyzing
   - findings, documents, literature
   - publications, journals, reviews
   Example: "Research examining X" → just "X"

2. SECOND: Remove generic implied terms that don't add search value:
   - Remove words like "practices," "techniques," "methods," "approaches," "strategies"
   - Remove words like "impacts," "effects," "influences," "role," "applications"
   - For example: "sustainable agriculture practices" → "sustainable agriculture"
   - For example: "teaching methodologies" → "teaching"
   - For example: "leadership styles" → "leadership"

3. THEN: Format the remaining terms:
   CRITICAL QUOTING RULES:
   - Multi-word phrases MUST ALWAYS be in quotes - NO EXCEPTIONS
   - Examples of correct quoting:
     - Wrong: machine learning AND deep learning
     - Right: "machine learning" AND "deep learning"
     - Wrong: natural language processing
     - Right: "natural language processing"
   - Single words must NEVER have quotes (e.g., science, research, learning)
   - Use AND to connect required concepts
   - Use OR with parentheses for alternatives"""
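
# Worked illustration of the rules above (hand-derived from the instruction text,
# not actual model output):
#   "Research papers examining machine learning applications in healthcare"
#   -> "machine learning" AND healthcare
# The meta-terms ("research papers", "examining") and the implied term "applications"
# are dropped, the multi-word phrase is quoted, and the remaining concepts are joined
# with AND.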

def load_model():
    """Load the model and set up tokenizer."""
    logger.info("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        "Zwounds/boolean-search-model",
        device_map="cpu",
        torch_dtype=torch.float32
    )
    tokenizer = AutoTokenizer.from_pretrained("Zwounds/boolean-search-model")
    tokenizer.use_default_system_prompt = False
    logger.info("Model loaded successfully")
    return model, tokenizer
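
# Note: the model is pinned to CPU in float32, which matches CPU-only Space hardware.
# On a GPU instance one could instead load it with device_map="auto" and
# torch_dtype=torch.float16 (an alternative configuration, not what this app uses).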

def extract_response(output: str) -> str:
    """Extract the response part from the output."""
    start_marker = "<|start_header_id|>assistant<|end_header_id|>"
    end_marker = "<|eot_id|>"
    start_idx = output.find(start_marker)
    if start_idx != -1:
        start_idx += len(start_marker)
        end_idx = output.find(end_marker, start_idx)
        if end_idx != -1:
            return output[start_idx:end_idx].strip()
    return output.strip()

def get_boolean_query(query: str, model=None, tokenizer=None) -> str:
    """Generate boolean query from natural language."""
    # Format the conversation
    conversation = [
        {"role": "system", "content": SYSTEM_INSTRUCTION},
        {"role": "user", "content": query}
    ]

    # Format into chat template
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate response
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    return extract_response(tokenizer.batch_decode(outputs)[0])
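
# Example usage sketch (assumes load_model() has already been called):
#   model, tokenizer = load_model()
#   result = get_boolean_query("Studies on remote work and employee productivity", model, tokenizer)
#   # Per SYSTEM_INSTRUCTION the expected form is: "remote work" AND "employee productivity"
#   # (actual output depends on the model)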

# Example queries demonstrating various cases
examples = [
    # Testing removal of meta-terms
    ["Find research papers examining the long-term effects of meditation on brain structure"],
    # Testing removal of generic implied terms (practices, techniques, methods)
    ["Articles about deep learning techniques for natural language processing tasks"],
    # Testing removal of impact/effect terms
    ["Studies on the impact of early childhood nutrition on cognitive development"],
    # Testing handling of technology applications
    ["Information on virtual reality applications in architectural design and urban planning"],
    # Testing proper OR relationship with parentheses
    ["Research on electric vehicles adoption in urban environments or rural communities"],
    # Testing proper quoting of multi-word concepts only
    ["Articles on biodiversity loss in coral reefs and rainforest ecosystems"],
    # Testing removal of strategy/approach terms
    ["Studies about different teaching approaches for children with learning disabilities"],
    # Testing complex OR relationships
    ["Research examining social media influence on political polarization or public discourse"],
    # Testing implied terms in specific industries
    ["Articles about implementation strategies for blockchain in supply chain management or financial services"],
    # Testing qualifiers that don't add search value
    ["Research on effective leadership styles in multicultural organizations"],
    # Testing removal of multiple implied terms
    ["Studies on the effects of microplastic pollution techniques on marine ecosystem health"],
    # Testing domain-specific implied terms
    ["Articles about successful cybersecurity protection methods for critical infrastructure"],
    # Testing generalized vs specific concepts
    ["Research papers on quantum computing algorithms for cryptography or optimization problems"],
    # Testing implied terms in outcome descriptions
    ["Studies examining the relationship between sleep quality and academic performance outcomes"],
    # Testing complex nesting of concepts
    ["Articles about renewable energy integration challenges in developing countries or island nations"]
]

# Load model globally
logger.info("Initializing model...")
model, tokenizer = load_model()

# Create Gradio interface
title = "Natural Language to Boolean Search"
description = """Convert natural language queries into boolean search expressions. The model will:

1. Remove search-related terms (like 'articles', 'research', etc.)
2. Handle generic implied terms (like 'practices', 'methods')
3. Format concepts using proper boolean syntax:
   - Multi-word phrases in quotes
   - Single words without quotes
   - AND to connect required concepts
   - OR with parentheses for alternatives
"""

demo = gr.Interface(
    fn=lambda x: get_boolean_query(x, model, tokenizer),
    inputs=[
        gr.Textbox(
            label="Enter your natural language query",
            placeholder="e.g., I'm looking for information about climate change and renewable energy"
        )
    ],
    outputs=gr.Textbox(label="Boolean Search Query"),
    title=title,
    description=description,
    examples=examples,
    theme=gr.themes.Soft()
)

if __name__ == "__main__":
    demo.launch()
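
# Optional: when running locally one could call demo.launch(share=True) for a temporary
# public link or demo.launch(debug=True) for verbose error output (standard Gradio
# options, not used by this Space).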