"""Markdown parsing module for fabric-to-espanso.""" from typing import Tuple, List, Optional, Set from pathlib import Path import regex import logging from .exceptions import ParsingError from .config import config logger = logging.getLogger('fabric_to_espanso') def create_section_pattern(keywords: Set[str]) -> regex.Pattern: keyword_pattern = '|'.join(regex.escape(kw) for kw in keywords) return regex.compile( rf'^#\s+.*(?:{keyword_pattern}).*$\n?(?:(?!^#).*\n?)*', regex.MULTILINE | regex.IGNORECASE ) def parse_markdown_file( file_path: str | Path, keywords: Optional[Set[str]] = None ) -> Tuple[str, Optional[str]]: """Extract sections with specified keywords from markdown file. Args: file_path: Path to markdown file keywords: Set of keywords to match in headings. If None, uses defaults from config Returns: Tuple of (full_content, extracted_sections) If no sections match, returns (full_content, None) Raises: ParsingError: If file reading or parsing fails """ try: # Use provided keywords or defaults from config keywords = keywords or set(config.base_words) # Create regex pattern for keywords in headings and text section_pattern = create_section_pattern(keywords) # Read file content path = Path(file_path) try: content = path.read_text(encoding='utf-8') except Exception as e: raise ParsingError(f"Failed to read {path}: {str(e)}") from e # Find all matching headings section_matches = list(section_pattern.findall(content)) # If no matches found, return full content if not section_matches: logger.debug(f"No matching sections found in {path.name}") return content, None # Join sections with double newline extracted = '\n\n'.join(section_matches) logger.debug(f"Extracted {len(section_matches)} sections from {path.name}") return content, extracted except Exception as e: logger.error(f"Error parsing {file_path}: {str(e)}", exc_info=True) if isinstance(e, ParsingError): raise raise ParsingError(f"Unexpected error parsing {file_path}: {str(e)}") from e def main(): # Example usage try: # Custom keywords can be passed as second argument result = parse_markdown_file('document.md') # result = extract_sections('document.md', {'Identity', 'Purpose', 'Scope'}) print(result) except Exception as e: print(f"An error occurred: {e}") if __name__ == '__main__': main()