# Spaces: Sleeping  (hosting-page status text captured along with the source; not part of the module)
# src/processing/llm_extractor.py
import json
import logging
from typing import Any, Dict, List, Optional

from openai import AzureOpenAI

from utils.cost_tracker import cost_tracker
logger = logging.getLogger(__name__)


class AzureO1MedicationExtractor:
    """Ask an Azure OpenAI chat deployment which document elements are medication lists.

    The extractor summarises every entry of a document's ``texts`` list, sends
    that summary to the model together with redaction instructions, and
    returns the model's JSON answer. It only reads the document; it reports
    which element indices the model proposed for removal.
    """

    # Completion-token budget requested for every chat-completions call.
    MAX_COMPLETION_TOKENS = 100000

    # System message sent ahead of the user prompt.
    _SYSTEM_MESSAGE = (
        "You are a helpful assistant that analyzes medical documents and "
        "identifies formal medication lists for redaction."
    )

    # User prompt. ``{text_analysis}`` is the only substitution field; the
    # doubled braces render as literal braces in the JSON example.
    _PROMPT_TEMPLATE = """
You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the formal medication lists that should be redacted, while preserving ALL other content including medical history tables.
**CRITICAL: You should ONLY remove formal medication lists with explicit medication names, dosages, and frequencies.**
**What to REMOVE (medication lists only):**
1. **Current medication list** - sections with headers like "Huidige thuismedicatie", "Current medications", "Medicatie"
2. **Discharge medication list** - sections with headers like "Als verdere behandeling stellen wij voor", "Thuismedicatie", "Discharge medications"
**What medication lists look like:**
- Header: "Huidige thuismedicatie" or similar
- Followed by multiple lines with medication names, dosages, frequencies
- Example: "Pantomed 20mg Tablet Oral - 1 tablet - 2 maal daags"
- Example: "Forlax 10g Zakje Oral - 2 zakje - 1 maal daags (zo nodig)"
**What to ABSOLUTELY NEVER REMOVE:**
1. **Medical history tables** - Tables with "Datum" and "Bespreking" columns containing dates and medical events
2. **Treatment history** - Narrative descriptions of medical procedures, treatments, or events
3. **Clinical discussions** - Any text discussing medical conditions, procedures, or clinical decisions
4. **Tables with dates and procedures** - Any table format showing timeline of medical events
5. **Individual medication mentions in clinical text** - References to medications within clinical narratives
**EXAMPLES OF CONTENT TO NEVER REMOVE:**
- Tables like: "| Datum | Bespreking |" followed by medical events
- "| 07/07/2017 | Niertransplantatie met donornier..."
- "| 15/8/2017 | Uitgestelde transplantfunctie..."
- "| 26/03/2018 | plaatsing peritoneaal dialysekatheter..."
- Any text describing medical procedures, surgeries, or treatments
- Clinical narratives mentioning medications in context (e.g., "behandeling met Sotrovimab")
**KEY DISTINGUISHING FEATURES:**
- **Medication lists**: Standalone sections with drug names + dosages + frequencies
- **Medical history**: Tables or narratives describing medical events, procedures, surgeries
- **Clinical text**: Discussions of treatment decisions, medical events, or conditions
**If you see a table with dates and medical procedures, it is MEDICAL HISTORY, not a medication list.**
**If you see clinical text discussing treatments or procedures, it is CLINICAL DISCUSSION, not a medication list.**
Document structure:
{text_analysis}
**Analysis Instructions:**
1. Look ONLY for formal medication sections with clear headers (e.g., "Thuismedicatie", "Huidige thuismedicatie")
2. Identify sections that contain LISTS of medications with dosages and frequencies
3. **NEVER identify medical history tables as medication lists**
4. **NEVER identify clinical discussions as medication lists**
5. Be extremely conservative - if in doubt, do NOT remove
6. Focus ONLY on standalone medication documentation sections
Return your analysis as a JSON object with this exact structure:
{{
"indices_to_remove": [list of integer indices - ONLY formal medication lists],
"reasoning": {{
"justification": "explanation of why only formal medication lists were selected for removal",
"confidence": "high/medium/low"
}}
}}
"""

    def __init__(
        self,
        endpoint: str,
        api_key: str,
        api_version: str,
        deployment: str,
        model_name: Optional[str] = None,
    ):
        """Create the Azure OpenAI client and remember which deployment to call.

        Args:
            endpoint: Passed to ``AzureOpenAI`` as ``azure_endpoint``.
            api_key: Passed to ``AzureOpenAI`` as ``api_key``.
            api_version: Passed to ``AzureOpenAI`` as ``api_version``.
            deployment: Sent as ``model`` on every chat-completions request.
            model_name: Name reported to the cost tracker; falls back to
                ``deployment`` when omitted or empty.
        """
        self.client = AzureOpenAI(
            api_version=api_version,
            azure_endpoint=endpoint,
            api_key=api_key,
        )
        self.deployment = deployment
        self.model_name = model_name or deployment

    @staticmethod
    def _empty_result() -> Dict[str, Any]:
        """Return a fresh "remove nothing" result used whenever the call or parsing fails."""
        return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}

    @staticmethod
    def _valid_indices(proposed: List[Any], element_count: int) -> List[int]:
        """Keep only unique integer indices inside ``[0, element_count)``, in order."""
        accepted: List[int] = []
        seen = set()
        for idx in proposed:
            # bool is a subclass of int, so a JSON true/false would otherwise
            # slip through as index 1/0.
            if isinstance(idx, bool) or not isinstance(idx, int):
                logger.error("Ignoring non-integer index %r", idx)
                continue
            # The lower bound matters: a negative index would address an
            # element counted from the end of the list.
            if not 0 <= idx < element_count:
                logger.error("Index %s is out of bounds (max: %s)", idx, element_count - 1)
                continue
            if idx in seen:
                continue
            seen.add(idx)
            accepted.append(idx)
        return accepted

    def extract_medication_sections(self, doc_json: Dict[str, Any]) -> Dict[str, Any]:
        """Ask the model which entries of ``doc_json["texts"]`` are formal medication lists.

        Args:
            doc_json: Document dictionary. Each entry of its ``texts`` list is
                read with ``.get`` for ``text``, ``label``, ``level`` and
                ``parent``; a missing or ``None`` ``texts`` is treated as empty.

        Returns:
            The JSON object returned by the model, with ``indices_to_remove``
            replaced by the validated list of indices (unique integers that
            address an existing entry of ``texts``). If the API call fails, or
            the reply is not a JSON object, the result is
            ``{"indices_to_remove": [], "reasoning": {"confidence": "low"}}``.
        """
        text_analysis = [
            {
                "index": position,
                "text": element.get("text", ""),
                "label": element.get("label", ""),
                "level": element.get("level", 0),
                "parent": element.get("parent", {}),
            }
            for position, element in enumerate(doc_json.get("texts") or [])
        ]
        prompt = self._PROMPT_TEMPLATE.format(text_analysis=text_analysis)
        logger.info("Prompt length: %s", len(prompt))
        logger.info("Number of text elements: %s", len(text_analysis))

        # Deliberately broad: any failure of the service call (or of the usage
        # bookkeeping that follows it) degrades to "remove nothing".
        try:
            response = self.client.chat.completions.create(
                messages=[
                    {"role": "system", "content": self._SYSTEM_MESSAGE},
                    {"role": "user", "content": prompt},
                ],
                max_completion_tokens=self.MAX_COMPLETION_TOKENS,
                model=self.deployment,
                response_format={"type": "json_object"},
            )
            # Record token usage and cost
            usage = getattr(response, "usage", None)
            if usage:
                cost_tracker.record_usage(
                    prompt_tokens=usage.prompt_tokens,
                    completion_tokens=usage.completion_tokens,
                    model=self.model_name,
                )
                logger.info(
                    "API call completed - Input: %s, Output: %s, Total: %s tokens",
                    usage.prompt_tokens,
                    usage.completion_tokens,
                    usage.total_tokens,
                )
        except Exception as e:
            logger.error("Exception during LLM call: %s", e, exc_info=True)
            return self._empty_result()

        # Deliberately broad as well: a missing choice, a None content or
        # malformed JSON all mean the reply is unusable.
        try:
            content = response.choices[0].message.content
            # NOTE(review): the raw reply and the element texts logged below
            # may contain patient data - confirm INFO level is acceptable.
            logger.info("Raw LLM response: %r", content)
            result = json.loads(content)
        except Exception as e:
            logger.error("Failed to parse LLM response: %s", e)
            return self._empty_result()

        if not isinstance(result, dict):
            logger.error("Failed to parse LLM response: expected a JSON object, got %r", result)
            return self._empty_result()

        proposed = result.get("indices_to_remove", [])
        if not isinstance(proposed, list):
            logger.error("'indices_to_remove' is not a list (%r); treating it as empty", proposed)
            proposed = []
        logger.info("LLM suggested removing %s elements: %s", len(proposed), proposed)

        # Hand back only indices that really address an element, so callers
        # never act on a negative, out-of-range or non-integer value.
        indices_to_remove = self._valid_indices(proposed, len(text_analysis))
        result["indices_to_remove"] = indices_to_remove

        if not indices_to_remove:
            logger.info("No elements will be removed")
            return result

        logger.info("DETAILED ANALYSIS OF LLM SUGGESTIONS:")
        logger.info("=" * 60)
        for idx in indices_to_remove:
            element = text_analysis[idx]
            logger.info("Index %s (%s): '%s'", idx, element["label"], element["text"])
        logger.info("=" * 60)

        # Log the reasoning if provided
        reasoning = result.get("reasoning", {})
        if reasoning:
            logger.info("LLM reasoning: %s", reasoning)
        logger.info("Final removal list: %s elements will be removed", len(indices_to_remove))
        return result