Spaces:
Paused
Paused
# prescription_validation/fuzzy_match.py
import sqlite3 | |
import re | |
from rapidfuzz.distance import Levenshtein | |
from config import DB_PATH, LEV_THRESH | |
class RxLookup:
    """Fuzzy-match a free-text token against the names in the ``drugs`` table.

    On construction the ``name`` and ``cui`` columns of every row in ``drugs``
    are loaded into memory and each name is normalised once with
    :meth:`_clean_token`. :meth:`match` then compares a normalised query token
    against those normalised names using Levenshtein edit distance.

    The sqlite connection stays open on ``self.conn`` until :meth:`close` is
    called; the instance can also be used as a context manager.
    """

    # Digits, optional whitespace, then a unit / form-factor marker
    # (e.g. "500mg", "10 tab", "30#"). Applied to already-lowercased text.
    # NOTE(review): the alternation is not anchored with a word boundary, so
    # "10 tablets" loses "10 tab" and leaves "lets" — confirm this is intended.
    _DOSAGE_RE = re.compile(r'(\d+)\s*(mg|ml|mcg|tab|cap|#)')
    # Any character that is not a lowercase ASCII letter.
    _NON_ALPHA_RE = re.compile(r'[^a-z]')

    def __init__(self, db_path: str = DB_PATH):
        """Open the database at *db_path* and cache the drug list.

        Args:
            db_path: Path handed to ``sqlite3.connect``.

        Raises:
            Whatever ``sqlite3`` raises if the query fails, or whatever
            normalising a stored name raises; the connection is closed
            before the error propagates.
        """
        self.conn = sqlite3.connect(db_path)
        try:
            self.conn.row_factory = sqlite3.Row
            self.drugs = self.conn.execute("SELECT name, cui FROM drugs").fetchall()
            # Normalise every stored name once, here, instead of on each
            # match() call. Kept index-aligned with ``self.drugs``; if
            # ``self.drugs`` is replaced later this list must be rebuilt too.
            self._cleaned_names = [
                self._clean_token(row["name"]) for row in self.drugs
            ]
        except Exception:
            # Do not leak the connection when initialisation fails.
            self.conn.close()
            raise

    def close(self) -> None:
        """Close the underlying sqlite connection."""
        self.conn.close()

    def __enter__(self) -> "RxLookup":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def _clean_token(self, token: str) -> str:
        """Return *token* lowercased with dosage groups and non-letters removed.

        Steps, in order: lowercase the text; delete every "digits + optional
        whitespace + unit" group (units: mg, ml, mcg, tab, cap, #); delete
        every remaining character outside ``a``-``z``.
        """
        cleaned = token.lower()
        cleaned = self._DOSAGE_RE.sub('', cleaned)
        cleaned = self._NON_ALPHA_RE.sub('', cleaned)
        return cleaned

    def match(self, token: str) -> tuple[str | None, str | None]:
        """Return the ``(name, cui)`` of the closest drug, or ``(None, None)``.

        The token is normalised with :meth:`_clean_token` and compared with
        every normalised drug name. The row with the smallest edit distance
        wins; on ties the earliest row in ``self.drugs`` is kept. The winner
        is returned only if ``distance / len(cleaned token)`` is strictly
        below ``LEV_THRESH``.

        Args:
            token: Raw text to look up.

        Returns:
            The original (un-normalised) ``name`` and ``cui`` values of the
            best row, or ``(None, None)`` when the token is empty, normalises
            to an empty string, there are no drugs, or the best candidate is
            not close enough.
        """
        if not token:
            return (None, None)

        cleaned_token = self._clean_token(token)
        if not cleaned_token:
            # Nothing but digits/units/punctuation: no basis for a match,
            # and it would make the length-normalisation below divide by zero.
            return (None, None)

        best_match = None
        min_distance = float('inf')
        for row, cleaned_db_name in zip(self.drugs, self._cleaned_names):
            distance = Levenshtein.distance(cleaned_token, cleaned_db_name)
            if distance < min_distance:
                min_distance = distance
                best_match = (row["name"], row["cui"])
                if distance == 0:
                    # Exact match: no later row can be strictly closer.
                    break

        # NOTE(review): the distance is normalised by the query length only,
        # not by the longer of the two strings — confirm LEV_THRESH was tuned
        # with that in mind.
        if best_match is not None and min_distance / len(cleaned_token) < LEV_THRESH:
            return best_match
        return (None, None)