# slotmatch/extractor.py
# Uploaded by GenAIDevTOProd ("Upload folder using huggingface_hub").
# Commit 8983b2d (verified).
from slotmatch.schema import SchemaValidator
from slotmatch.utils import extract_value_by_regex, fuzzy_match_key, compute_confidence
class SlotExtractor:
    """Extract the slots named by a schema from a piece of text.

    The schema is passed through ``SchemaValidator`` and the mapping returned
    by its ``get_schema()`` is used as ``key -> expected type``. Each expected
    type is called on the raw extracted value to coerce it (``bool`` is
    special-cased, see ``_coerce_type``).
    """

    # An optionally quoted key token (word characters and '-') followed by
    # optional whitespace and a ':' or '=' separator.
    _KEY_PATTERN = r'["\']?([\w-]+)["\']?\s*[:=]'

    def __init__(self, schema: dict):
        """Validate *schema* and cache its keys in schema order.

        Args:
            schema: Slot schema handed to ``SchemaValidator``.
        """
        self.validator = SchemaValidator(schema)
        self.schema = self.validator.get_schema()
        self.schema_keys = list(self.schema.keys())

    def extract(self, text: str) -> dict:
        """Extract a value and confidence for every schema key.

        For each key the strategies are tried in order:

        1. ``extract_value_by_regex`` with the schema key itself.
        2. ``fuzzy_match_key`` against the key-like tokens found in *text*,
           then ``extract_value_by_regex`` with the matched token; the
           confidence is scaled by the fuzzy score.
        3. Fallback: value ``None`` with confidence ``0.0``.

        Args:
            text: Text to search.

        Returns:
            A dict mapping every schema key to
            ``{"value": ..., "confidence": ...}``.
        """
        result = {}
        # The candidate tokens depend only on ``text``, so scan at most once
        # per call, and only if some key actually needs the fuzzy path.
        candidate_keys = None

        for expected_key in self.schema_keys:
            # 1. Try regex directly
            raw_value = extract_value_by_regex(text, expected_key)
            if raw_value is not None:
                result[expected_key] = {
                    "value": self._coerce_type(raw_value, self.schema[expected_key]),
                    "confidence": compute_confidence("regex"),
                }
                continue

            # 2. Try fuzzy match
            if candidate_keys is None:
                candidate_keys = self._get_all_keys_from_text(text)
            fuzzy_key, score = fuzzy_match_key(expected_key, candidate_keys)
            if fuzzy_key:
                raw_value = extract_value_by_regex(text, fuzzy_key)
                if raw_value is not None:
                    result[expected_key] = {
                        "value": self._coerce_type(raw_value, self.schema[expected_key]),
                        "confidence": compute_confidence("fuzzy") * score,
                    }
                    continue

            # 3. Fallback
            result[expected_key] = {
                "value": None,
                "confidence": 0.0,
            }
        return result

    def _get_all_keys_from_text(self, text: str) -> list:
        """Return the distinct key-like tokens in *text*, in first-seen order.

        A token is a run of word characters or '-' (optionally quoted) that is
        followed by ':' or '='. Order is preserved so that the candidates given
        to the fuzzy matcher are the same on every run; a ``set`` would order
        them by randomised string hashes.
        """
        import re

        return list(dict.fromkeys(re.findall(self._KEY_PATTERN, text)))

    def _coerce_type(self, value, expected_type):
        """Best-effort conversion of *value* to *expected_type*.

        For ``bool`` the value is treated as text and is ``True`` only when,
        after stripping whitespace and lower-casing, it is one of ``'true'``,
        ``'yes'`` or ``'1'``. For any other type ``expected_type(value)`` is
        returned.

        Returns:
            The coerced value, or *value* unchanged if the conversion fails.
        """
        try:
            if expected_type is bool:
                return value.strip().lower() in ('true', 'yes', '1')
            return expected_type(value)
        except Exception:
            # Deliberate best-effort: schema types are arbitrary callables and
            # may raise anything. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            return value  # fallback to original