Spaces:

levalencia
/

docling

Sleeping

App Files Files Community

docling / src /processing /llm_extractor.py

levalencia

Add reportlab dependency for PDF generation and enhance document processing

5d3ebd9 about 2 months ago

raw

history blame contribute delete

7.93 kB

	# src/processing/llm_extractor.py


	import json
	import logging
	from typing import Any, Dict, Optional

	from openai import AzureOpenAI
	from utils.cost_tracker import cost_tracker

	logger = logging.getLogger(__name__)

	class AzureO1MedicationExtractor:
	    """Identify formal medication-list elements in a document via Azure OpenAI.

	    Every entry of the document's ``texts`` array is sent to a chat-completions
	    deployment, which is asked for the indices of the entries that belong to
	    formal medication lists so that a caller can redact them.
	    """

	    # Upper bound on completion tokens requested from the deployment.
	    MAX_COMPLETION_TOKENS = 100000

	    def __init__(
	        self,
	        endpoint: str,
	        api_key: str,
	        api_version: str,
	        deployment: str,
	        model_name: Optional[str] = None,
	    ):
	        """Create the Azure OpenAI client.

	        Args:
	            endpoint: Azure OpenAI endpoint URL.
	            api_key: API key for that endpoint.
	            api_version: Azure OpenAI API version string.
	            deployment: Deployment name; passed as ``model`` on every request.
	            model_name: Name reported to the cost tracker. Falls back to
	                ``deployment`` when omitted or empty.
	        """
	        self.client = AzureOpenAI(
	            api_version=api_version,
	            azure_endpoint=endpoint,
	            api_key=api_key,
	        )
	        self.deployment = deployment
	        self.model_name = model_name or deployment

	    @staticmethod
	    def _empty_result() -> Dict[str, Any]:
	        """Return a fresh "remove nothing" result used when the LLM step fails."""
	        return {"indices_to_remove": [], "reasoning": {"confidence": "low"}}

	    @staticmethod
	    def _sanitize_indices(raw_indices: Any, element_count: int):
	        """Split the LLM's index list into ``(valid, rejected)``.

	        Valid indices are unique integers in ``[0, element_count)``, kept in
	        their original order. Everything else (negative or out-of-range
	        numbers, floats, strings, booleans, a non-list payload) is rejected.
	        """
	        if raw_indices is None:
	            return [], []
	        if not isinstance(raw_indices, list):
	            return [], [raw_indices]

	        valid = []
	        rejected = []
	        seen = set()
	        for idx in raw_indices:
	            # bool is a subclass of int; True/False are never real positions.
	            is_position = isinstance(idx, int) and not isinstance(idx, bool)
	            if not is_position or not 0 <= idx < element_count:
	                rejected.append(idx)
	            elif idx not in seen:
	                seen.add(idx)
	                valid.append(idx)
	        return valid, rejected

	    @staticmethod
	    def _build_prompt(text_analysis) -> str:
	        """Render the user prompt with the document structure embedded."""
	        # The string body is deliberately not indented: every character between
	        # the triple quotes is sent to the model.
	        return f"""
	You are a medical document analysis expert specializing in discharge letters. Your task is to identify ONLY the formal medication lists that should be redacted, while preserving ALL other content including medical history tables.

	CRITICAL: You should ONLY remove formal medication lists with explicit medication names, dosages, and frequencies.

	What to REMOVE (medication lists only):
	1. Current medication list - sections with headers like "Huidige thuismedicatie", "Current medications", "Medicatie"
	2. Discharge medication list - sections with headers like "Als verdere behandeling stellen wij voor", "Thuismedicatie", "Discharge medications"

	What medication lists look like:
	- Header: "Huidige thuismedicatie" or similar
	- Followed by multiple lines with medication names, dosages, frequencies
	- Example: "Pantomed 20mg Tablet Oral - 1 tablet - 2 maal daags"
	- Example: "Forlax 10g Zakje Oral - 2 zakje - 1 maal daags (zo nodig)"

	What to ABSOLUTELY NEVER REMOVE:
	1. Medical history tables - Tables with "Datum" and "Bespreking" columns containing dates and medical events
	2. Treatment history - Narrative descriptions of medical procedures, treatments, or events
	3. Clinical discussions - Any text discussing medical conditions, procedures, or clinical decisions
	4. Tables with dates and procedures - Any table format showing timeline of medical events
	5. Individual medication mentions in clinical text - References to medications within clinical narratives

	EXAMPLES OF CONTENT TO NEVER REMOVE:
	- Tables like: "\\| Datum \\| Bespreking \\|" followed by medical events
	- "\\| 07/07/2017 \\| Niertransplantatie met donornier..."
	- "\\| 15/8/2017 \\| Uitgestelde transplantfunctie..."
	- "\\| 26/03/2018 \\| plaatsing peritoneaal dialysekatheter..."
	- Any text describing medical procedures, surgeries, or treatments
	- Clinical narratives mentioning medications in context (e.g., "behandeling met Sotrovimab")

	KEY DISTINGUISHING FEATURES:
	- Medication lists: Standalone sections with drug names + dosages + frequencies
	- Medical history: Tables or narratives describing medical events, procedures, surgeries
	- Clinical text: Discussions of treatment decisions, medical events, or conditions

	If you see a table with dates and medical procedures, it is MEDICAL HISTORY, not a medication list.
	If you see clinical text discussing treatments or procedures, it is CLINICAL DISCUSSION, not a medication list.

	Document structure:
	{text_analysis}

	Analysis Instructions:
	1. Look ONLY for formal medication sections with clear headers (e.g., "Thuismedicatie", "Huidige thuismedicatie")
	2. Identify sections that contain LISTS of medications with dosages and frequencies
	3. NEVER identify medical history tables as medication lists
	4. NEVER identify clinical discussions as medication lists
	5. Be extremely conservative - if in doubt, do NOT remove
	6. Focus ONLY on standalone medication documentation sections

	Return your analysis as a JSON object with this exact structure:
	{{
	"indices_to_remove": [list of integer indices - ONLY formal medication lists],
	"reasoning": {{
	"justification": "explanation of why only formal medication lists were selected for removal",
	"confidence": "high/medium/low"
	}}
	}}
	"""

	    def _record_usage(self, response: Any) -> None:
	        """Report token usage to the cost tracker without ever raising."""
	        usage = getattr(response, "usage", None)
	        if not usage:
	            return
	        try:
	            cost_tracker.record_usage(
	                prompt_tokens=usage.prompt_tokens,
	                completion_tokens=usage.completion_tokens,
	                model=self.model_name,
	            )
	            logger.info(f"API call completed - Input: {usage.prompt_tokens}, "
	                        f"Output: {usage.completion_tokens}, "
	                        f"Total: {usage.total_tokens} tokens")
	        except Exception as e:
	            # Bookkeeping problems must not discard a successful LLM answer.
	            logger.warning(f"Could not record token usage: {e}", exc_info=True)

	    def extract_medication_sections(self, doc_json: Dict[str, Any]) -> Dict[str, Any]:
	        """Ask the LLM which entries of ``doc_json["texts"]`` are medication lists.

	        Args:
	            doc_json: Document dict. Only its ``"texts"`` list is read; from each
	                entry the ``text``, ``label``, ``level`` and ``parent`` keys are
	                forwarded to the model (presumably a Docling JSON export -
	                confirm against the caller).

	        Returns:
	            The JSON object returned by the model, with ``"indices_to_remove"``
	            always present and reduced to unique integer positions that exist
	            in ``texts``. If the document has no text elements, no request is
	            made and an empty list is returned. If the request fails or the
	            answer is not a JSON object, the fallback
	            ``{"indices_to_remove": [], "reasoning": {"confidence": "low"}}``
	            is returned.

	        Raises:
	            AttributeError: If an entry of ``texts`` does not provide ``.get``.
	        """
	        # A missing or null "texts" entry is treated as an empty document.
	        texts = doc_json.get("texts") or []
	        text_analysis = [
	            {
	                "index": i,
	                "text": text_elem.get("text", ""),
	                "label": text_elem.get("label", ""),
	                "level": text_elem.get("level", 0),
	                "parent": text_elem.get("parent", {}),
	            }
	            for i, text_elem in enumerate(texts)
	        ]

	        if not text_analysis:
	            # Nothing can be selected, so do not spend an API call on it.
	            logger.info("Document has no text elements; skipping LLM call")
	            return {
	                "indices_to_remove": [],
	                "reasoning": {
	                    "justification": "document contains no text elements",
	                    "confidence": "high",
	                },
	            }

	        prompt = self._build_prompt(text_analysis)
	        logger.info(f"Prompt length: {len(prompt)}")
	        logger.info(f"Number of text elements: {len(text_analysis)}")

	        try:
	            response = self.client.chat.completions.create(
	                messages=[
	                    {
	                        "role": "system",
	                        "content": "You are a helpful assistant that analyzes medical documents and identifies formal medication lists for redaction.",
	                    },
	                    {
	                        "role": "user",
	                        "content": prompt,
	                    },
	                ],
	                max_completion_tokens=self.MAX_COMPLETION_TOKENS,
	                model=self.deployment,
	                response_format={"type": "json_object"},
	            )
	        except Exception as e:
	            logger.error(f"Exception during LLM call: {e}", exc_info=True)
	            return self._empty_result()

	        self._record_usage(response)

	        try:
	            content = response.choices[0].message.content
	            # NOTE(review): the raw response may quote document text; confirm
	            # that INFO-level logs are an acceptable place for it.
	            logger.info(f"Raw LLM response: {content!r}")
	            # json.loads raises TypeError for None content and ValueError
	            # (JSONDecodeError) for malformed JSON.
	            result = json.loads(content)
	        except (AttributeError, IndexError, TypeError, ValueError) as e:
	            logger.error(f"Failed to parse LLM response: {e}")
	            return self._empty_result()

	        if not isinstance(result, dict):
	            logger.error(
	                f"Failed to parse LLM response: expected a JSON object, got {type(result).__name__}"
	            )
	            return self._empty_result()

	        indices_to_remove, rejected = self._sanitize_indices(
	            result.get("indices_to_remove"), len(text_analysis)
	        )
	        if rejected:
	            logger.error(
	                f"Ignoring invalid indices from LLM (valid range: 0-{len(text_analysis) - 1}): {rejected}"
	            )
	        # Callers only ever see positions that exist in the document.
	        result["indices_to_remove"] = indices_to_remove

	        logger.info(f"LLM suggested removing {len(indices_to_remove)} elements: {indices_to_remove}")

	        if not indices_to_remove:
	            logger.info("No elements will be removed")
	            return result

	        logger.info("DETAILED ANALYSIS OF LLM SUGGESTIONS:")
	        logger.info("=" * 60)
	        for idx in indices_to_remove:
	            element = text_analysis[idx]
	            text_content = element.get("text", "")
	            text_label = element.get("label", "")
	            # NOTE(review): this writes the selected document text to the log;
	            # confirm that is acceptable for content meant to be redacted.
	            logger.info(f"Index {idx} ({text_label}): '{text_content}'")
	        logger.info("=" * 60)

	        reasoning = result.get("reasoning", {})
	        if reasoning:
	            logger.info(f"LLM reasoning: {reasoning}")

	        logger.info(f"Final removal list: {len(indices_to_remove)} elements will be removed")
	        return result