File size: 12,913 Bytes
5e1a30c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
"""
Markdown response parser implementation.

This module provides a parser that extracts structured information
from markdown-formatted LLM responses, including citations and formatting.

Architecture Notes:
- Direct implementation (no adapter needed)
- Pure text parsing algorithms
- Handles various markdown conventions
- Robust citation extraction
"""

import re
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass

from ..base import ResponseParser, Citation, Document, ParsingError, ConfigurableComponent

logger = logging.getLogger(__name__)


class MarkdownParser(ResponseParser, ConfigurableComponent):
    """
    Parser for markdown-formatted responses.
    
    Features:
    - Extract main answer text
    - Parse inline citations [1], [Document 1], etc.
    - Handle footnote-style citations
    - Preserve formatting (headers, lists, code blocks)
    - Extract confidence statements
    
    Configuration:
    - extract_citations: Whether to extract citations (default: True)
    - citation_patterns: Regex patterns for citations (customizable)
    - preserve_formatting: Keep markdown formatting (default: True)
    - extract_sections: Parse into sections by headers (default: False)
    """
    
    # Default citation patterns. Each pattern exposes the reference number as
    # capture group 1; extract_citations() reads that group to resolve the
    # marker to a context document.
    DEFAULT_CITATION_PATTERNS = [
        r'\[(\d+)\]',                    # [1], [2], etc.
        r'\[Document\s+(\d+)\]',          # [Document 1], [Document 2]
        r'\[Document\s+(\d+),\s*Page\s+\d+\]',  # [Document 1, Page 1], [Document 2, Page 15]
        r'\[Doc\s+(\d+)\]',               # [Doc 1], [Doc 2]
        r'\[\^(\d+)\]',                   # Footnote style [^1]
        # Unicode superscript markers such as ¹ or ¹². This must be a character
        # class: the bare ten-character sequence only matches that exact run of
        # all ten superscripts and therefore never matches a real marker.
        r'([⁰¹²³⁴⁵⁶⁷⁸⁹]+)',
    ]
    
    def __init__(self,
                 extract_citations: bool = True,
                 preserve_formatting: bool = True,
                 extract_sections: bool = False,
                 citation_patterns: Optional[List[str]] = None,
                 config: Optional[Dict[str, Any]] = None):
        """
        Set up the markdown parser from keyword options and an optional config.

        The keyword arguments form the baseline configuration; any key present
        in ``config`` overrides the corresponding keyword value.

        Args:
            extract_citations: Whether to extract citations
            preserve_formatting: Keep markdown formatting
            extract_sections: Parse into sections by headers
            citation_patterns: Custom citation regex patterns; the class
                defaults are used when this is None or empty
            config: Additional configuration, merged over the keyword values
        """
        # Baseline from keyword arguments, then let `config` win on conflicts.
        settings: Dict[str, Any] = dict(
            extract_citations=extract_citations,
            preserve_formatting=preserve_formatting,
            extract_sections=extract_sections,
            citation_patterns=citation_patterns or self.DEFAULT_CITATION_PATTERNS,
        )
        if config:
            settings.update(config)

        super().__init__(settings)

        # Read the flags back from the merged settings so overrides coming
        # from `config` are honoured.
        self.extract_citations_enabled = settings['extract_citations']
        self.preserve_formatting = settings['preserve_formatting']
        self.extract_sections = settings['extract_sections']

        # Compile every citation pattern once, up front.
        self.citation_patterns = list(map(re.compile, settings['citation_patterns']))
    
    def parse(self, raw_response: str) -> Dict[str, Any]:
        """
        Parse the raw LLM response into structured format.
        
        Args:
            raw_response: Raw text from LLM
            
        Returns:
            Structured dictionary with parsed content
            
        Raises:
            ParsingError: If parsing fails
        """
        if not raw_response:
            raise ParsingError("Empty response to parse")
        
        try:
            # Clean response
            cleaned = self._clean_response(raw_response)
            
            # Extract main components
            result = {
                'answer': cleaned,
                'raw_response': raw_response,
                'format': 'markdown',
                'metadata': {}
            }
            
            # Extract sections if requested
            if self.extract_sections:
                sections = self._extract_sections(cleaned)
                result['sections'] = sections
                result['answer'] = self._merge_sections(sections)
            
            # Extract confidence if present
            confidence = self._extract_confidence(cleaned)
            if confidence is not None:
                result['confidence'] = confidence
            
            # Extract any metadata
            metadata = self._extract_metadata(cleaned)
            result['metadata'].update(metadata)
            
            return result
            
        except Exception as e:
            logger.error(f"Failed to parse response: {str(e)}")
            raise ParsingError(f"Markdown parsing failed: {str(e)}")
    
    def extract_citations(self, response: Dict[str, Any], context: List[Document]) -> List[Citation]:
        """
        Extract citations from the parsed response.
        
        Args:
            response: Parsed response dictionary
            context: Original context documents
            
        Returns:
            List of extracted citations
        """
        if not self.extract_citations_enabled:
            return []
        
        answer_text = response.get('answer', '')
        citations = []
        
        # Find all citation markers in the text
        for pattern in self.citation_patterns:
            for match in pattern.finditer(answer_text):
                citation_marker = match.group(0)
                citation_id = match.group(1) if match.groups() else match.group(0)
                
                # Try to resolve to document
                doc_index = self._resolve_citation_index(citation_id)
                if doc_index is not None and 0 <= doc_index < len(context):
                    # Create citation object
                    citation = Citation(
                        source_id=f"doc_{doc_index}",
                        text=citation_marker,
                        start_pos=match.start(),
                        end_pos=match.end(),
                        confidence=0.9  # High confidence for explicit citations
                    )
                    citations.append(citation)
        
        # Remove duplicates while preserving order
        seen = set()
        unique_citations = []
        for citation in citations:
            key = (citation.source_id, citation.text)
            if key not in seen:
                seen.add(key)
                unique_citations.append(citation)
        
        logger.debug(f"Extracted {len(unique_citations)} unique citations")
        return unique_citations
    
    def get_parser_info(self) -> Dict[str, Any]:
        """Get information about the parser."""
        return {
            'type': 'markdown',
            'parser_class': self.__class__.__name__,
            'extract_citations': self.extract_citations_enabled,
            'preserve_formatting': self.preserve_formatting,
            'extract_sections': self.extract_sections,
            'citation_patterns': len(self.citation_patterns),
            'capabilities': {
                'handles_markdown': True,
                'extracts_structure': self.extract_sections,
                'preserves_formatting': self.preserve_formatting
            }
        }
    
    def _clean_response(self, response: str) -> str:
        """
        Clean the response while preserving formatting.
        
        Args:
            response: Raw response text
            
        Returns:
            Cleaned response
        """
        # Remove leading/trailing whitespace
        cleaned = response.strip()
        
        # Remove any markdown artifacts if not preserving
        if not self.preserve_formatting:
            # Remove code blocks
            cleaned = re.sub(r'```[\s\S]*?```', '', cleaned)
            # Remove inline code
            cleaned = re.sub(r'`[^`]+`', lambda m: m.group(0)[1:-1], cleaned)
            # Remove emphasis
            cleaned = re.sub(r'\*\*([^*]+)\*\*', r'\1', cleaned)
            cleaned = re.sub(r'\*([^*]+)\*', r'\1', cleaned)
            cleaned = re.sub(r'__([^_]+)__', r'\1', cleaned)
            cleaned = re.sub(r'_([^_]+)_', r'\1', cleaned)
        
        return cleaned
    
    def _extract_sections(self, text: str) -> Dict[str, str]:
        """
        Extract sections based on markdown headers.
        
        Args:
            text: Markdown text
            
        Returns:
            Dictionary of section_name -> content
        """
        sections = {}
        current_section = "main"
        current_content = []
        
        lines = text.split('\n')
        for line in lines:
            # Check for headers
            header_match = re.match(r'^#+\s+(.+)$', line)
            if header_match:
                # Save previous section
                if current_content:
                    sections[current_section] = '\n'.join(current_content).strip()
                
                # Start new section
                current_section = header_match.group(1).strip()
                current_content = []
            else:
                current_content.append(line)
        
        # Save last section
        if current_content:
            sections[current_section] = '\n'.join(current_content).strip()
        
        return sections
    
    def _merge_sections(self, sections: Dict[str, str]) -> str:
        """
        Merge sections back into a single answer.
        
        Args:
            sections: Dictionary of sections
            
        Returns:
            Merged text
        """
        # Prioritize certain sections
        priority_sections = ['answer', 'response', 'main', 'summary']
        
        merged = []
        
        # Add priority sections first
        for section_name in priority_sections:
            if section_name in sections and sections[section_name]:
                merged.append(sections[section_name])
        
        # Add remaining sections
        for section_name, content in sections.items():
            if section_name not in priority_sections and content:
                merged.append(content)
        
        return '\n\n'.join(merged)
    
    def _extract_confidence(self, text: str) -> Optional[float]:
        """
        Extract confidence score if mentioned in text.
        
        Args:
            text: Response text
            
        Returns:
            Confidence score or None
        """
        # Look for confidence patterns
        confidence_patterns = [
            r'confidence:?\s*(\d+(?:\.\d+)?)\s*%',
            r'confidence:?\s*(\d+(?:\.\d+)?)',
            r'(\d+(?:\.\d+)?)\s*%\s*confident',
        ]
        
        for pattern in confidence_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    value = float(match.group(1))
                    # Normalize to 0-1 range
                    if value > 1:
                        value = value / 100
                    return min(max(value, 0.0), 1.0)
                except ValueError:
                    continue
        
        return None
    
    def _extract_metadata(self, text: str) -> Dict[str, Any]:
        """
        Extract any metadata from the response.
        
        Args:
            text: Response text
            
        Returns:
            Metadata dictionary
        """
        metadata = {}
        
        # Extract word count
        words = text.split()
        metadata['word_count'] = len(words)
        
        # Check for specific markers
        if re.search(r'uncertain|not sure|unclear', text, re.IGNORECASE):
            metadata['uncertainty_detected'] = True
        
        if re.search(r'no information|not found|not available', text, re.IGNORECASE):
            metadata['no_answer_detected'] = True
        
        # Count citations
        citation_count = 0
        for pattern in self.citation_patterns:
            citation_count += len(pattern.findall(text))
        metadata['citation_count'] = citation_count
        
        return metadata
    
    def _resolve_citation_index(self, citation_id: str) -> Optional[int]:
        """
        Resolve citation ID to document index.
        
        Args:
            citation_id: Citation identifier (e.g., "1", "2")
            
        Returns:
            Zero-based document index or None
        """
        try:
            # Try to parse as integer
            index = int(citation_id) - 1  # Convert to 0-based
            return index
        except ValueError:
            # Handle special cases
            if citation_id.lower() in ['a', 'b', 'c', 'd', 'e']:
                return ord(citation_id.lower()) - ord('a')
            return None