"""
Markdown response parser implementation.
This module provides a parser that extracts structured information
from markdown-formatted LLM responses, including citations and formatting.
Architecture Notes:
- Direct implementation (no adapter needed)
- Pure text parsing algorithms
- Handles various markdown conventions
- Robust citation extraction
"""
import re
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from ..base import ResponseParser, Citation, Document, ParsingError, ConfigurableComponent
logger = logging.getLogger(__name__)
class MarkdownParser(ResponseParser, ConfigurableComponent):
"""
Parser for markdown-formatted responses.
Features:
- Extract main answer text
- Parse inline citations [1], [Document 1], etc.
- Handle footnote-style citations
- Preserve formatting (headers, lists, code blocks)
- Extract confidence statements
Configuration:
- extract_citations: Whether to extract citations (default: True)
- citation_patterns: Regex patterns for citations (customizable)
- preserve_formatting: Keep markdown formatting (default: True)
- extract_sections: Parse into sections by headers (default: False)
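    Example (illustrative sketch of typical usage; values shown are indicative):
        parser = MarkdownParser()
        parsed = parser.parse("The measured latency is 42 ms [1].")
        parsed['format']                      # 'markdown'
        parsed['metadata']['citation_count']  # 1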
"""
# Default citation patterns
DEFAULT_CITATION_PATTERNS = [
r'\[(\d+)\]', # [1], [2], etc.
r'\[Document\s+(\d+)\]', # [Document 1], [Document 2]
r'\[Document\s+(\d+),\s*Page\s+\d+\]', # [Document 1, Page 1], [Document 2, Page 15]
r'\[Doc\s+(\d+)\]', # [Doc 1], [Doc 2]
r'\[\^(\d+)\]', # Footnote style [^1]
        r'([¹²³⁴⁵⁶⁷⁸⁹])',  # Unicode superscript citations, e.g. ¹ or ²
]
def __init__(self,
extract_citations: bool = True,
preserve_formatting: bool = True,
extract_sections: bool = False,
citation_patterns: Optional[List[str]] = None,
config: Optional[Dict[str, Any]] = None):
"""
Initialize markdown parser.
Args:
extract_citations: Whether to extract citations
preserve_formatting: Keep markdown formatting
extract_sections: Parse into sections by headers
citation_patterns: Custom citation regex patterns
config: Additional configuration
"""
# Merge config
parser_config = {
'extract_citations': extract_citations,
'preserve_formatting': preserve_formatting,
'extract_sections': extract_sections,
'citation_patterns': citation_patterns or self.DEFAULT_CITATION_PATTERNS,
**(config or {})
}
super().__init__(parser_config)
self.extract_citations_enabled = parser_config['extract_citations']
self.preserve_formatting = parser_config['preserve_formatting']
self.extract_sections = parser_config['extract_sections']
# Compile citation patterns
self.citation_patterns = [
re.compile(pattern) for pattern in parser_config['citation_patterns']
]
def parse(self, raw_response: str) -> Dict[str, Any]:
"""
Parse the raw LLM response into structured format.
Args:
raw_response: Raw text from LLM
Returns:
Structured dictionary with parsed content
Raises:
ParsingError: If parsing fails
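        Example (illustrative; result keys under the default configuration):
            parsed = parser.parse("Use the debug UART for tracing [1].")
            # parsed['answer']       -> the cleaned markdown text
            # parsed['raw_response'] -> the original input string
            # parsed['format']       -> 'markdown'
            # parsed['metadata']     -> {'word_count': 7, 'citation_count': 1}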
"""
if not raw_response:
raise ParsingError("Empty response to parse")
try:
# Clean response
cleaned = self._clean_response(raw_response)
# Extract main components
result = {
'answer': cleaned,
'raw_response': raw_response,
'format': 'markdown',
'metadata': {}
}
# Extract sections if requested
if self.extract_sections:
sections = self._extract_sections(cleaned)
result['sections'] = sections
result['answer'] = self._merge_sections(sections)
# Extract confidence if present
confidence = self._extract_confidence(cleaned)
if confidence is not None:
result['confidence'] = confidence
# Extract any metadata
metadata = self._extract_metadata(cleaned)
result['metadata'].update(metadata)
return result
        except Exception as e:
            logger.error(f"Failed to parse response: {e}")
            raise ParsingError(f"Markdown parsing failed: {e}") from e
def extract_citations(self, response: Dict[str, Any], context: List[Document]) -> List[Citation]:
"""
Extract citations from the parsed response.
Args:
response: Parsed response dictionary
context: Original context documents
Returns:
List of extracted citations
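        Example (illustrative; `docs` stands in for the retrieved Document list):
            parsed = parser.parse("See the timing table [1] and the errata [2].")
            citations = parser.extract_citations(parsed, context=docs)
            # With at least two documents in `docs`:
            # citations[0].source_id -> 'doc_0', citations[0].text -> '[1]'
            # citations[1].source_id -> 'doc_1', citations[1].text -> '[2]'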
"""
if not self.extract_citations_enabled:
return []
answer_text = response.get('answer', '')
citations = []
# Find all citation markers in the text
for pattern in self.citation_patterns:
for match in pattern.finditer(answer_text):
citation_marker = match.group(0)
citation_id = match.group(1) if match.groups() else match.group(0)
# Try to resolve to document
doc_index = self._resolve_citation_index(citation_id)
if doc_index is not None and 0 <= doc_index < len(context):
# Create citation object
citation = Citation(
source_id=f"doc_{doc_index}",
text=citation_marker,
start_pos=match.start(),
end_pos=match.end(),
confidence=0.9 # High confidence for explicit citations
)
citations.append(citation)
# Remove duplicates while preserving order
seen = set()
unique_citations = []
for citation in citations:
key = (citation.source_id, citation.text)
if key not in seen:
seen.add(key)
unique_citations.append(citation)
logger.debug(f"Extracted {len(unique_citations)} unique citations")
return unique_citations
def get_parser_info(self) -> Dict[str, Any]:
"""Get information about the parser."""
return {
'type': 'markdown',
'parser_class': self.__class__.__name__,
'extract_citations': self.extract_citations_enabled,
'preserve_formatting': self.preserve_formatting,
'extract_sections': self.extract_sections,
'citation_patterns': len(self.citation_patterns),
'capabilities': {
'handles_markdown': True,
'extracts_structure': self.extract_sections,
'preserves_formatting': self.preserve_formatting
}
}
def _clean_response(self, response: str) -> str:
"""
Clean the response while preserving formatting.
Args:
response: Raw response text
Returns:
Cleaned response
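        Example (illustrative; markers are only stripped when preserve_formatting is False):
            _clean_response("The flag is **required** (use `--verbose`).")
            # -> 'The flag is required (use --verbose).'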
"""
# Remove leading/trailing whitespace
cleaned = response.strip()
# Remove any markdown artifacts if not preserving
if not self.preserve_formatting:
# Remove code blocks
cleaned = re.sub(r'```[\s\S]*?```', '', cleaned)
            # Strip inline code markers but keep the code text
            cleaned = re.sub(r'`[^`]+`', lambda m: m.group(0)[1:-1], cleaned)
            # Strip emphasis markers (bold/italic) but keep the text
cleaned = re.sub(r'\*\*([^*]+)\*\*', r'\1', cleaned)
cleaned = re.sub(r'\*([^*]+)\*', r'\1', cleaned)
cleaned = re.sub(r'__([^_]+)__', r'\1', cleaned)
cleaned = re.sub(r'_([^_]+)_', r'\1', cleaned)
return cleaned
def _extract_sections(self, text: str) -> Dict[str, str]:
"""
Extract sections based on markdown headers.
Args:
text: Markdown text
Returns:
Dictionary of section_name -> content
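        Example (illustrative):
            _extract_sections("Intro text\n## Answer\n42 ms\n## Notes\nSee the spec.")
            # -> {'main': 'Intro text', 'Answer': '42 ms', 'Notes': 'See the spec.'}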
"""
sections = {}
current_section = "main"
current_content = []
lines = text.split('\n')
for line in lines:
# Check for headers
header_match = re.match(r'^#+\s+(.+)$', line)
if header_match:
# Save previous section
if current_content:
sections[current_section] = '\n'.join(current_content).strip()
# Start new section
current_section = header_match.group(1).strip()
current_content = []
else:
current_content.append(line)
# Save last section
if current_content:
sections[current_section] = '\n'.join(current_content).strip()
return sections
def _merge_sections(self, sections: Dict[str, str]) -> str:
"""
Merge sections back into a single answer.
Args:
sections: Dictionary of sections
Returns:
Merged text
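        Example (illustrative; the 'answer' section is emitted before the rest):
            _merge_sections({'notes': 'See the spec.', 'answer': '42 ms'})
            # -> '42 ms\n\nSee the spec.'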
"""
# Prioritize certain sections
priority_sections = ['answer', 'response', 'main', 'summary']
merged = []
        # Add priority sections first; header names keep their original casing
        # in _extract_sections, so match them case-insensitively here
        for priority_name in priority_sections:
            for section_name, content in sections.items():
                if section_name.lower() == priority_name and content:
                    merged.append(content)
        # Add remaining sections
        for section_name, content in sections.items():
            if section_name.lower() not in priority_sections and content:
                merged.append(content)
return '\n\n'.join(merged)
def _extract_confidence(self, text: str) -> Optional[float]:
"""
Extract confidence score if mentioned in text.
Args:
text: Response text
Returns:
Confidence score or None
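        Example (illustrative; scores are normalized to the 0-1 range):
            _extract_confidence("Overall confidence: 85%")  # -> 0.85
            _extract_confidence("No score stated here")     # -> None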
"""
# Look for confidence patterns
confidence_patterns = [
r'confidence:?\s*(\d+(?:\.\d+)?)\s*%',
r'confidence:?\s*(\d+(?:\.\d+)?)',
r'(\d+(?:\.\d+)?)\s*%\s*confident',
]
for pattern in confidence_patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
value = float(match.group(1))
# Normalize to 0-1 range
if value > 1:
value = value / 100
return min(max(value, 0.0), 1.0)
except ValueError:
continue
return None
def _extract_metadata(self, text: str) -> Dict[str, Any]:
"""
Extract any metadata from the response.
Args:
text: Response text
Returns:
Metadata dictionary
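        Example (illustrative):
            _extract_metadata("I am not sure, see [1] and [2].")
            # -> {'word_count': 8, 'uncertainty_detected': True, 'citation_count': 2}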
"""
metadata = {}
# Extract word count
words = text.split()
metadata['word_count'] = len(words)
# Check for specific markers
if re.search(r'uncertain|not sure|unclear', text, re.IGNORECASE):
metadata['uncertainty_detected'] = True
if re.search(r'no information|not found|not available', text, re.IGNORECASE):
metadata['no_answer_detected'] = True
# Count citations
citation_count = 0
for pattern in self.citation_patterns:
citation_count += len(pattern.findall(text))
metadata['citation_count'] = citation_count
return metadata
def _resolve_citation_index(self, citation_id: str) -> Optional[int]:
"""
Resolve citation ID to document index.
Args:
citation_id: Citation identifier (e.g., "1", "2")
Returns:
Zero-based document index or None
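        Example (illustrative):
            _resolve_citation_index("3")  # -> 2
            _resolve_citation_index("b")  # -> 1
            _resolve_citation_index("x")  # -> None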
"""
        # Normalize Unicode superscript digits (e.g. '¹' -> '1') before parsing
        citation_id = citation_id.translate(str.maketrans('⁰¹²³⁴⁵⁶⁷⁸⁹', '0123456789'))
        try:
            # Try to parse as an integer and convert to a 0-based index
            index = int(citation_id) - 1
            return index
except ValueError:
# Handle special cases
if citation_id.lower() in ['a', 'b', 'c', 'd', 'e']:
return ord(citation_id.lower()) - ord('a')
return None