Spaces:
Sleeping
Sleeping
File size: 12,913 Bytes
5e1a30c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 |
"""
Markdown response parser implementation.
This module provides a parser that extracts structured information
from markdown-formatted LLM responses, including citations and formatting.
Architecture Notes:
- Direct implementation (no adapter needed)
- Pure text parsing algorithms
- Handles various markdown conventions
- Robust citation extraction
"""
import re
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from ..base import ResponseParser, Citation, Document, ParsingError, ConfigurableComponent
logger = logging.getLogger(__name__)
class MarkdownParser(ResponseParser, ConfigurableComponent):
    """
    Parser for markdown-formatted LLM responses.

    Features:
    - Extract the main answer text
    - Parse inline citations such as [1], [Document 1] or [Doc 1]
    - Handle footnote-style citations such as [^1]
    - Preserve formatting (headers, lists, code blocks), or strip code and
      emphasis markup when formatting is not preserved
    - Extract confidence statements

    Configuration:
    - extract_citations: Whether to extract citations (default: True)
    - citation_patterns: Regex patterns for citations (customizable;
      DEFAULT_CITATION_PATTERNS is used when none are given)
    - preserve_formatting: Keep markdown formatting (default: True)
    - extract_sections: Parse into sections by headers (default: False)
    """
# Default citation patterns. Every pattern exposes the citation number as
# capture group 1 so callers can treat all of them uniformly.
DEFAULT_CITATION_PATTERNS = [
    r'\[(\d+)\]',                           # [1], [2], etc.
    r'\[Document\s+(\d+)\]',                # [Document 1], [Document 2]
    r'\[Document\s+(\d+),\s*Page\s+\d+\]',  # [Document 1, Page 1], [Document 2, Page 15]
    r'\[Doc\s+(\d+)\]',                     # [Doc 1], [Doc 2]
    r'\[\^(\d+)\]',                         # Footnote style [^1]
    # Unicode superscripts. This has to be a character class: a bare literal
    # of the ten digits would only match that exact ten-character run and
    # never a real marker such as "¹" or "¹²".
    r'([⁰¹²³⁴⁵⁶⁷⁸⁹]+)',
]
def __init__(self,
             extract_citations: bool = True,
             preserve_formatting: bool = True,
             extract_sections: bool = False,
             citation_patterns: Optional[List[str]] = None,
             config: Optional[Dict[str, Any]] = None):
    """
    Set up the markdown parser.

    Args:
        extract_citations: Whether citations should be extracted
        preserve_formatting: Whether markdown formatting is kept
        extract_sections: Whether the text is split into sections by headers
        citation_patterns: Custom citation regex patterns; the class
            defaults are used when omitted or empty
        config: Additional configuration; its entries are applied last and
            therefore override the keyword arguments above
    """
    # Keyword arguments form the baseline configuration.
    settings: Dict[str, Any] = dict(
        extract_citations=extract_citations,
        preserve_formatting=preserve_formatting,
        extract_sections=extract_sections,
        citation_patterns=citation_patterns or self.DEFAULT_CITATION_PATTERNS,
    )
    # Explicit config entries win over the keyword arguments.
    settings.update(config or {})

    super().__init__(settings)

    self.extract_citations_enabled = settings['extract_citations']
    self.preserve_formatting = settings['preserve_formatting']
    self.extract_sections = settings['extract_sections']

    # Compile the patterns once so every later scan reuses them.
    self.citation_patterns = list(map(re.compile, settings['citation_patterns']))
def parse(self, raw_response: str) -> Dict[str, Any]:
    """
    Parse the raw LLM response into a structured dictionary.

    The result always contains 'answer', 'raw_response', 'format' and
    'metadata'. 'sections' is added when section extraction is enabled, and
    'confidence' is added when a confidence statement is found.

    Args:
        raw_response: Raw text from the LLM

    Returns:
        Structured dictionary with the parsed content

    Raises:
        ParsingError: If the response is empty or any parsing step fails
    """
    if not raw_response:
        raise ParsingError("Empty response to parse")

    try:
        cleaned = self._clean_response(raw_response)

        result: Dict[str, Any] = {
            'answer': cleaned,
            'raw_response': raw_response,
            'format': 'markdown',
            'metadata': {},
        }

        # With section extraction on, the answer is rebuilt from the
        # section bodies (priority sections first).
        if self.extract_sections:
            sections = self._extract_sections(cleaned)
            result['sections'] = sections
            result['answer'] = self._merge_sections(sections)

        # Confidence and metadata are always derived from the cleaned text,
        # not from the merged answer.
        confidence = self._extract_confidence(cleaned)
        if confidence is not None:
            result['confidence'] = confidence

        result['metadata'].update(self._extract_metadata(cleaned))
        return result
    except Exception as e:
        logger.error("Failed to parse response: %s", e)
        # Chain explicitly so the original failure is kept as __cause__.
        raise ParsingError(f"Markdown parsing failed: {e}") from e
def extract_citations(self, response: Dict[str, Any], context: List[Document]) -> List[Citation]:
    """
    Extract citations from the parsed response.

    Every configured pattern is scanned over the answer text. A marker is
    kept only if it resolves to a valid index into ``context``. Results are
    de-duplicated on (source id, marker text) and returned in discovery
    order (pattern by pattern, then by position in the text).

    Args:
        response: Parsed response dictionary (the 'answer' entry is scanned)
        context: Original context documents

    Returns:
        List of unique extracted citations; empty when extraction is
        disabled, the answer is missing/empty, or there is no context
    """
    if not self.extract_citations_enabled:
        return []

    # A missing/None answer or an empty context can never produce a
    # resolvable citation, so bail out before touching the regexes.
    answer_text = response.get('answer') or ''
    if not answer_text or not context:
        return []

    unique_citations: List[Citation] = []
    seen = set()

    for pattern in self.citation_patterns:
        for match in pattern.finditer(answer_text):
            citation_marker = match.group(0)
            citation_id = match.group(1) if match.groups() else citation_marker
            if citation_id is None:
                # Custom pattern whose optional group did not participate.
                continue

            doc_index = self._resolve_citation_index(citation_id)
            if doc_index is None or not 0 <= doc_index < len(context):
                continue

            source_id = f"doc_{doc_index}"
            key = (source_id, citation_marker)
            if key in seen:
                continue
            seen.add(key)

            unique_citations.append(
                Citation(
                    source_id=source_id,
                    text=citation_marker,
                    start_pos=match.start(),
                    end_pos=match.end(),
                    confidence=0.9,  # High confidence for explicit citations
                )
            )

    logger.debug("Extracted %d unique citations", len(unique_citations))
    return unique_citations
def get_parser_info(self) -> Dict[str, Any]:
    """
    Describe this parser and its current settings.

    Returns:
        Dictionary with the parser type, class name, the active option
        values, the number of citation patterns and a capability summary
    """
    keeps_formatting = self.preserve_formatting
    splits_sections = self.extract_sections

    capabilities = {
        'handles_markdown': True,
        'extracts_structure': splits_sections,
        'preserves_formatting': keeps_formatting,
    }

    info: Dict[str, Any] = {}
    info['type'] = 'markdown'
    info['parser_class'] = self.__class__.__name__
    info['extract_citations'] = self.extract_citations_enabled
    info['preserve_formatting'] = keeps_formatting
    info['extract_sections'] = splits_sections
    info['citation_patterns'] = len(self.citation_patterns)
    info['capabilities'] = capabilities
    return info
def _clean_response(self, response: str) -> str:
"""
Clean the response while preserving formatting.
Args:
response: Raw response text
Returns:
Cleaned response
"""
# Remove leading/trailing whitespace
cleaned = response.strip()
# Remove any markdown artifacts if not preserving
if not self.preserve_formatting:
# Remove code blocks
cleaned = re.sub(r'```[\s\S]*?```', '', cleaned)
# Remove inline code
cleaned = re.sub(r'`[^`]+`', lambda m: m.group(0)[1:-1], cleaned)
# Remove emphasis
cleaned = re.sub(r'\*\*([^*]+)\*\*', r'\1', cleaned)
cleaned = re.sub(r'\*([^*]+)\*', r'\1', cleaned)
cleaned = re.sub(r'__([^_]+)__', r'\1', cleaned)
cleaned = re.sub(r'_([^_]+)_', r'\1', cleaned)
return cleaned
def _extract_sections(self, text: str) -> Dict[str, str]:
"""
Extract sections based on markdown headers.
Args:
text: Markdown text
Returns:
Dictionary of section_name -> content
"""
sections = {}
current_section = "main"
current_content = []
lines = text.split('\n')
for line in lines:
# Check for headers
header_match = re.match(r'^#+\s+(.+)$', line)
if header_match:
# Save previous section
if current_content:
sections[current_section] = '\n'.join(current_content).strip()
# Start new section
current_section = header_match.group(1).strip()
current_content = []
else:
current_content.append(line)
# Save last section
if current_content:
sections[current_section] = '\n'.join(current_content).strip()
return sections
def _merge_sections(self, sections: Dict[str, str]) -> str:
"""
Merge sections back into a single answer.
Args:
sections: Dictionary of sections
Returns:
Merged text
"""
# Prioritize certain sections
priority_sections = ['answer', 'response', 'main', 'summary']
merged = []
# Add priority sections first
for section_name in priority_sections:
if section_name in sections and sections[section_name]:
merged.append(sections[section_name])
# Add remaining sections
for section_name, content in sections.items():
if section_name not in priority_sections and content:
merged.append(content)
return '\n\n'.join(merged)
def _extract_confidence(self, text: str) -> Optional[float]:
"""
Extract confidence score if mentioned in text.
Args:
text: Response text
Returns:
Confidence score or None
"""
# Look for confidence patterns
confidence_patterns = [
r'confidence:?\s*(\d+(?:\.\d+)?)\s*%',
r'confidence:?\s*(\d+(?:\.\d+)?)',
r'(\d+(?:\.\d+)?)\s*%\s*confident',
]
for pattern in confidence_patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
try:
value = float(match.group(1))
# Normalize to 0-1 range
if value > 1:
value = value / 100
return min(max(value, 0.0), 1.0)
except ValueError:
continue
return None
def _extract_metadata(self, text: str) -> Dict[str, Any]:
"""
Extract any metadata from the response.
Args:
text: Response text
Returns:
Metadata dictionary
"""
metadata = {}
# Extract word count
words = text.split()
metadata['word_count'] = len(words)
# Check for specific markers
if re.search(r'uncertain|not sure|unclear', text, re.IGNORECASE):
metadata['uncertainty_detected'] = True
if re.search(r'no information|not found|not available', text, re.IGNORECASE):
metadata['no_answer_detected'] = True
# Count citations
citation_count = 0
for pattern in self.citation_patterns:
citation_count += len(pattern.findall(text))
metadata['citation_count'] = citation_count
return metadata
def _resolve_citation_index(self, citation_id: str) -> Optional[int]:
"""
Resolve citation ID to document index.
Args:
citation_id: Citation identifier (e.g., "1", "2")
Returns:
Zero-based document index or None
"""
try:
# Try to parse as integer
index = int(citation_id) - 1 # Convert to 0-based
return index
except ValueError:
# Handle special cases
if citation_id.lower() in ['a', 'b', 'c', 'd', 'e']:
return ord(citation_id.lower()) - ord('a')
return None |