"""
Markdown response parser implementation.
This module provides a parser that extracts structured information
from markdown-formatted LLM responses, including citations and formatting.
Architecture Notes:
- Direct implementation (no adapter needed)
- Pure text parsing algorithms
- Handles various markdown conventions
- Robust citation extraction
"""
import re
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from ..base import ResponseParser, Citation, Document, ParsingError, ConfigurableComponent
logger = logging.getLogger(__name__)
class MarkdownParser(ResponseParser, ConfigurableComponent):
"""
Parser for markdown-formatted responses.
Features:
- Extract main answer text
- Parse inline citations [1], [Document 1], etc.
- Handle footnote-style citations
- Preserve formatting (headers, lists, code blocks)
- Extract confidence statements
Configuration:
- extract_citations: Whether to extract citations (default: True)
- citation_patterns: Regex patterns for citations (customizable)
- preserve_formatting: Keep markdown formatting (default: True)
- extract_sections: Parse into sections by headers (default: False)
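    Example (illustrative sketch of typical usage; values shown are indicative):
        parser = MarkdownParser()
        parsed = parser.parse("The measured latency is 42 ms [1].")
        parsed['format']                      # 'markdown'
        parsed['metadata']['citation_count']  # 1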
"""
# Default citation patterns
DEFAULT_CITATION_PATTERNS = [
r'\[(\d+)\]', # [1], [2], etc.
r'\[Document\s+(\d+)\]', # [Document 1], [Document 2]
r'\[Document\s+(\d+),\s*Page\s+\d+\]', # [Document 1, Page 1], [Document 2, Page 15]
r'\[Doc\s+(\d+)\]', # [Doc 1], [Doc 2]
r'\[\^(\d+)\]', # Footnote style [^1]
        r'([¹²³⁴⁵⁶⁷⁸⁹])',  # Unicode superscript citations, e.g. ¹ or ²
]
def __init__(self,
extract_citations: bool = True,
preserve_formatting: bool = True,
extract_sections: bool = False,
citation_patterns: Optional[List[str]] = None,
config: Optional[Dict[str, Any]] = None):
"""
Initialize markdown parser.
Args:
extract_citations: Whether to extract citations
preserve_formatting: Keep markdown formatting
extract_sections: Parse into sections by headers
citation_patterns: Custom citation regex patterns
config: Additional configuration
"""
# Merge config
parser_config = {
'extract_citations': extract_citations,
'preserve_formatting': preserve_formatting,
'extract_sections': extract_sections,
'citation_patterns': citation_patterns or self.DEFAULT_CITATION_PATTERNS,
**(config or {})
}
super().__init__(parser_config)
self.extract_citations_enabled = parser_config['extract_citations']
self.preserve_formatting = parser_config['preserve_formatting']
self.extract_sections = parser_config['extract_sections']
# Compile citation patterns
self.citation_patterns = [
re.compile(pattern) for pattern in parser_config['citation_patterns']
]
def parse(self, raw_response: str) -> Dict[str, Any]:
"""
Parse the raw LLM response into structured format.
Args:
raw_response: Raw text from LLM
Returns:
Structured dictionary with parsed content
Raises:
ParsingError: If parsing fails
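        Example (illustrative; result keys under the default configuration):
            parsed = parser.parse("Use the debug UART for tracing [1].")
            # parsed['answer']       -> the cleaned markdown text
            # parsed['raw_response'] -> the original input string
            # parsed['format']       -> 'markdown'
            # parsed['metadata']     -> {'word_count': 7, 'citation_count': 1}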
"""
if not raw_response:
raise ParsingError("Empty response to parse")
try:
# Clean response
cleaned = self._clean_response(raw_response)
# Extract main components
result = {
'answer': cleaned,
'raw_response': raw_response,
'format': 'markdown',
'metadata': {}
}
# Extract sections if requested
if self.extract_sections:
sections = self._extract_sections(cleaned)
result['sections'] = sections
result['answer'] = self._merge_sections(sections)
# Extract confidence if present
confidence = self._extract_confidence(cleaned)
if confidence is not None:
result['confidence'] = confidence
# Extract any metadata
metadata = self._extract_metadata(cleaned)
result['metadata'].update(metadata)
return result
        except Exception as e:
            logger.error(f"Failed to parse response: {e}")
            raise ParsingError(f"Markdown parsing failed: {e}") from e
def extract_citations(self, response: Dict[str, Any], context: List[Document]) -> List[Citation]:
"""
Extract citations from the parsed response.
Args:
response: Parsed response dictionary
context: Original context documents
Returns:
List of extracted citations
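        Example (illustrative; `docs` stands in for the retrieved Document list):
            parsed = parser.parse("See the timing table [1] and the errata [2].")
            citations = parser.extract_citations(parsed, context=docs)
            # With at least two documents in `docs`:
            # citations[0].source_id -> 'doc_0', citations[0].text -> '[1]'
            # citations[1].source_id -> 'doc_1', citations[1].text -> '[2]'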
"""
if not self.extract_citations_enabled:
return []
answer_text = response.get('answer', '')
citations = []
# Find all citation markers in the text
for pattern in self.citation_patterns:
for match in pattern.finditer(answer_text):
citation_marker = match.group(0)
citation_id = match.group(1) if match.groups() else match.group(0)
# Try to resolve to document
doc_index = self._resolve_citation_index(citation_id)
if doc_index is not None and 0 <= doc_index < len(context):
# Create citation object
citation = Citation(
source_id=f"doc_{doc_index}",
text=citation_marker,
start_pos=match.start(),
end_pos=match.end(),
confidence=0.9 # High confidence for explicit citations
)
citations.append(citation)
# Remove duplicates while preserving order
seen = set()
unique_citations = []
for citation in citations:
key = (citation.source_id, citation.text)
if key not in seen:
seen.add(key)
unique_citations.append(citation)
logger.debug(f"Extracted {len(unique_citations)} unique citations")
return unique_citations
def get_parser_info(self) -> Dict[str, Any]:
"""Get information about the parser."""
return {
'type': 'markdown',
'parser_class': self.__class__.__name__,
'extract_citations': self.extract_citations_enabled,
'preserve_formatting': self.preserve_formatting,
'extract_sections': self.extract_sections,
'citation_patterns': len(self.citation_patterns),
'capabilities': {
'handles_markdown': True,
'extracts_structure': self.extract_sections,
'preserves_formatting': self.preserve_formatting
}
}
def _clean_response(self, response: str) -> str:
"""
Clean the response while preserving formatting.
Args:
response: Raw response text
Returns:
Cleaned response
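        Example (illustrative; markers are only stripped when preserve_formatting is False):
            _clean_response("The flag is **required** (use `--verbose`).")
            # -> 'The flag is required (use --verbose).'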
"""
# Remove leading/trailing whitespace
cleaned = response.strip()
# Remove any markdown artifacts if not preserving
if not self.preserve_formatting:
# Remove code blocks
cleaned = re.sub(r'```[\s\S]*?```', '', cleaned)
            # Strip inline code markers but keep the code text
            cleaned = re.sub(r'`[^`]+`', lambda m: m.group(0)[1:-1], cleaned)
            # Strip emphasis markers (bold/italic) but keep the text
cleaned = re.sub(r'\*\*([^*]+)\*\*', r'\1', cleaned)
cleaned = re.sub(r'\*([^*]+)\*', r'\1', cleaned)
cleaned = re.sub(r'__([^_]+)__', r'\1', cleaned)
cleaned = re.sub(r'_([^_]+)_', r'\1', cleaned)
return cleaned
def _extract_sections(self, text: str) -> Dict[str, str]:
"""
Extract sections based on markdown headers.
Args:
text: Markdown text
Returns:
Dictionary of section_name -> content
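        Example (illustrative):
            _extract_sections("Intro text\n## Answer\n42 ms\n## Notes\nSee the spec.")
            # -> {'main': 'Intro text', 'Answer': '42 ms', 'Notes': 'See the spec.'}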
"""
sections = {}
current_section = "main"
current_content = []
lines = text.split('\n')
for line in lines:
# Check for headers
header_match = re.match(r'^#+\s+(.+)$', line)
if header_match:
# Save previous section
if current_content:
sections[current_section] = '\n'.join(current_content).strip()
# Start new section
current_section = header_match.group(1).strip()
current_content = []
else:
current_content.append(line)
# Save last section
if current_content:
sections[current_section] = '\n'.join(current_content).strip()
return sections
def _merge_sections(self, sections: Dict[str, str]) -> str:
"""
Merge sections back into a single answer.
Args:
sections: Dictionary of sections
Returns:
Merged text
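        Example (illustrative; the 'answer' section is emitted before the rest):
            _merge_sections({'notes': 'See the spec.', 'answer': '42 ms'})
            # -> '42 ms\n\nSee the spec.'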
"""
# Prioritize certain sections
priority_sections = ['answer', 'response', 'main', 'summary']
merged = []
        # Add priority sections first; header names keep their original casing
        # in _extract_sections, so match them case-insensitively here
        for priority_name in priority_sections:
            for section_name, content in sections.items():
                if section_name.lower() == priority_name and content:
                    merged.append(content)
        # Add remaining sections
        for section_name, content in sections.items():
            if section_name.lower() not in priority_sections and content:
                merged.append(content)
return '\n\n'.join(merged)
def _extract_confidence(self, text: str) -> Optional[float]:
"""
Extract confidence score if mentioned in text.
Args:
text: Response text
Returns:
Confidence score or None
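        Example (illustrative; scores are normalized to the 0-1 range):
            _extract_confidence("Overall confidence: 85%")  # -> 0.85
            _extract_confidence("No score stated here")     # -> None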
"""
# Look for confidence patterns
confidence_patterns = [
r'confidence:?\s*(\d+(?:\.\d+)?)\s*%',
r'confidence:?\s*(\d+(?:\.\d+)?)',
r'(\d+(?:\.\d+)?)\s*%\s*confident',
]
for pattern in confidence_patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
value = float(match.group(1))
# Normalize to 0-1 range
if value > 1:
value = value / 100
return min(max(value, 0.0), 1.0)
except ValueError:
continue
return None
def _extract_metadata(self, text: str) -> Dict[str, Any]:
"""
Extract any metadata from the response.
Args:
text: Response text
Returns:
Metadata dictionary
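        Example (illustrative):
            _extract_metadata("I am not sure, see [1] and [2].")
            # -> {'word_count': 8, 'uncertainty_detected': True, 'citation_count': 2}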
"""
metadata = {}
# Extract word count
words = text.split()
metadata['word_count'] = len(words)
# Check for specific markers
if re.search(r'uncertain|not sure|unclear', text, re.IGNORECASE):
metadata['uncertainty_detected'] = True
if re.search(r'no information|not found|not available', text, re.IGNORECASE):
metadata['no_answer_detected'] = True
# Count citations
citation_count = 0
for pattern in self.citation_patterns:
citation_count += len(pattern.findall(text))
metadata['citation_count'] = citation_count
return metadata
def _resolve_citation_index(self, citation_id: str) -> Optional[int]:
"""
Resolve citation ID to document index.
Args:
citation_id: Citation identifier (e.g., "1", "2")
Returns:
Zero-based document index or None
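        Example (illustrative):
            _resolve_citation_index("3")  # -> 2
            _resolve_citation_index("b")  # -> 1
            _resolve_citation_index("x")  # -> None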
"""
        # Normalize Unicode superscript digits (e.g. '¹' -> '1') before parsing
        citation_id = citation_id.translate(str.maketrans('⁰¹²³⁴⁵⁶⁷⁸⁹', '0123456789'))
        try:
            # Try to parse as an integer and convert to a 0-based index
            index = int(citation_id) - 1
            return index
except ValueError:
# Handle special cases
if citation_id.lower() in ['a', 'b', 'c', 'd', 'e']:
return ord(citation_id.lower()) - ord('a')
return None