""" Entity extraction for technical documents in Epic 2 Week 2. This module provides entity extraction capabilities for RISC-V technical documents, using spaCy for natural language processing and custom patterns for technical terms. """ import logging import time from typing import List, Dict, Any, Optional, Set, Tuple import re from dataclasses import dataclass try: import spacy from spacy.matcher import Matcher from spacy.tokens import Doc, Span SPACY_AVAILABLE = True except ImportError: spacy = None Span = None Doc = None Matcher = None SPACY_AVAILABLE = False from src.core.interfaces import Document from .config.graph_config import EntityExtractionConfig logger = logging.getLogger(__name__) @dataclass class Entity: """Represents an extracted entity with metadata.""" text: str label: str start_pos: int end_pos: int confidence: float document_id: str context: str = "" def __hash__(self) -> int: return hash((self.text.lower(), self.label, self.document_id)) def __eq__(self, other) -> bool: if not isinstance(other, Entity): return False return ( self.text.lower() == other.text.lower() and self.label == other.label and self.document_id == other.document_id ) class EntityExtractionError(Exception): """Raised when entity extraction operations fail.""" pass class EntityExtractor: """ Entity extractor for RISC-V technical documents. This class uses spaCy for natural language processing and custom patterns to identify technical entities in RISC-V documentation, including: - Technical concepts (TECH) - Protocols and standards (PROTOCOL) - Architectures and implementations (ARCH) - Extensions and specifications (EXTENSION) The extractor is optimized for technical documentation and provides high-accuracy entity recognition with configurable confidence thresholds. """ def __init__(self, config: EntityExtractionConfig): """ Initialize the entity extractor. Args: config: Entity extraction configuration """ self.config = config self.nlp = None self.matcher = None self.custom_patterns = self._get_risc_v_patterns() self.stats = { "documents_processed": 0, "entities_extracted": 0, "processing_time": 0.0, "model_load_time": 0.0 } # Initialize spaCy model self._initialize_model() def _initialize_model(self) -> None: """Initialize spaCy model and custom patterns.""" if spacy is None: raise EntityExtractionError("spaCy is not installed. Install with: pip install spacy") start_time = time.time() try: # Load spaCy model self.nlp = spacy.load(self.config.model) logger.info(f"Loaded spaCy model: {self.config.model}") # Initialize matcher for custom patterns self.matcher = Matcher(self.nlp.vocab) self._add_custom_patterns() self.stats["model_load_time"] = time.time() - start_time logger.info(f"Entity extractor initialized in {self.stats['model_load_time']:.3f}s") except OSError as e: if "Can't find model" in str(e): logger.warning(f"spaCy model '{self.config.model}' not found. Attempting to download...") try: # Attempt to download the model import subprocess import sys result = subprocess.run([ sys.executable, "-m", "spacy", "download", self.config.model ], capture_output=True, text=True, timeout=300) # 5 minute timeout if result.returncode == 0: logger.info(f"Successfully downloaded spaCy model '{self.config.model}'") # Try loading again self.nlp = spacy.load(self.config.model) logger.info(f"Loaded spaCy model: {self.config.model}") else: logger.error(f"Failed to download spaCy model: {result.stderr}") raise EntityExtractionError( f"spaCy model '{self.config.model}' not found and auto-download failed. " f"Manual install: python -m spacy download {self.config.model}" ) from e except (subprocess.TimeoutExpired, subprocess.CalledProcessError, Exception) as download_error: logger.error(f"Model download failed: {download_error}") raise EntityExtractionError( f"spaCy model '{self.config.model}' not found and auto-download failed. " f"Manual install: python -m spacy download {self.config.model}. " f"Download error: {download_error}" ) from e else: raise EntityExtractionError(f"Failed to load spaCy model: {str(e)}") from e except Exception as e: raise EntityExtractionError(f"Failed to initialize entity extractor: {str(e)}") from e def _get_risc_v_patterns(self) -> Dict[str, List[List[Dict[str, Any]]]]: """ Get RISC-V specific entity patterns. Returns: Dictionary mapping entity types to spaCy patterns """ patterns = { "TECH": [ # RISC-V technical terms [{"LOWER": "risc-v"}], [{"LOWER": "riscv"}], [{"LOWER": "isa"}], [{"LOWER": "instruction"}, {"LOWER": "set"}], [{"LOWER": "instruction"}, {"LOWER": "set"}, {"LOWER": "architecture"}], [{"LOWER": "vector"}, {"LOWER": "extension"}], [{"LOWER": "atomic"}, {"LOWER": "operations"}], [{"LOWER": "privilege"}, {"LOWER": "levels"}], [{"LOWER": "csr"}, {"LOWER": "registers"}], [{"LOWER": "control"}, {"LOWER": "status"}, {"LOWER": "register"}], [{"LOWER": "pipeline"}], [{"LOWER": "microarchitecture"}], [{"LOWER": "cache"}, {"LOWER": "coherence"}], [{"LOWER": "memory"}, {"LOWER": "management"}], [{"LOWER": "virtual"}, {"LOWER": "memory"}], [{"LOWER": "page"}, {"LOWER": "table"}], [{"LOWER": "interrupt"}, {"LOWER": "handling"}], [{"LOWER": "exception"}, {"LOWER": "handling"}], [{"LOWER": "floating"}, {"LOWER": "point"}], [{"LOWER": "compressed"}, {"LOWER": "instructions"}], [{"LOWER": "bit"}, {"LOWER": "manipulation"}], ], "PROTOCOL": [ # Communication protocols and standards [{"LOWER": "axi"}], [{"LOWER": "ahb"}], [{"LOWER": "apb"}], [{"LOWER": "amba"}], [{"LOWER": "tilelink"}], [{"LOWER": "debug"}, {"LOWER": "transport"}, {"LOWER": "module"}], [{"LOWER": "jtag"}], [{"LOWER": "openocd"}], [{"LOWER": "gdb"}], [{"LOWER": "trace"}, {"LOWER": "encoder"}], [{"LOWER": "performance"}, {"LOWER": "counters"}], [{"LOWER": "pmp"}], # Physical Memory Protection [{"LOWER": "pma"}], # Physical Memory Attributes ], "ARCH": [ # Architecture implementations and designs [{"LOWER": "rv32i"}], [{"LOWER": "rv64i"}], [{"LOWER": "rv32gc"}], [{"LOWER": "rv64gc"}], [{"LOWER": "rv32e"}], [{"LOWER": "zicsr"}], [{"LOWER": "zifencei"}], [{"LOWER": "zmmul"}], [{"LOWER": "rocket"}, {"LOWER": "chip"}], [{"LOWER": "boom"}], [{"LOWER": "ariane"}], [{"LOWER": "cva6"}], [{"LOWER": "ibex"}], [{"LOWER": "vexriscv"}], [{"LOWER": "picorv32"}], [{"LOWER": "syntacore"}], [{"LOWER": "scr1"}], [{"LOWER": "sifive"}], [{"LOWER": "berkeley"}], [{"LOWER": "lowrisc"}], ], "EXTENSION": [ # RISC-V extensions [{"LOWER": "m"}, {"LOWER": "extension"}], [{"LOWER": "a"}, {"LOWER": "extension"}], [{"LOWER": "f"}, {"LOWER": "extension"}], [{"LOWER": "d"}, {"LOWER": "extension"}], [{"LOWER": "c"}, {"LOWER": "extension"}], [{"LOWER": "v"}, {"LOWER": "extension"}], [{"LOWER": "h"}, {"LOWER": "extension"}], [{"LOWER": "s"}, {"LOWER": "extension"}], [{"LOWER": "n"}, {"LOWER": "extension"}], [{"LOWER": "p"}, {"LOWER": "extension"}], [{"LOWER": "b"}, {"LOWER": "extension"}], [{"LOWER": "k"}, {"LOWER": "extension"}], [{"LOWER": "j"}, {"LOWER": "extension"}], [{"LOWER": "zb"}], [{"LOWER": "zk"}], [{"LOWER": "zf"}], [{"TEXT": {"REGEX": r"^rv\d+[a-z]+$"}}], # RV32I, RV64GC, etc. ] } # Add user-defined custom patterns if self.config.custom_patterns: for entity_type, custom_patterns in self.config.custom_patterns.items(): if entity_type in patterns: # Convert string patterns to spaCy patterns for pattern_text in custom_patterns: pattern = [{"LOWER": token.lower()} for token in pattern_text.split()] patterns[entity_type].append(pattern) else: patterns[entity_type] = [] for pattern_text in custom_patterns: pattern = [{"LOWER": token.lower()} for token in pattern_text.split()] patterns[entity_type].append(pattern) return patterns def _add_custom_patterns(self) -> None: """Add custom patterns to the spaCy matcher.""" try: for entity_type, patterns in self.custom_patterns.items(): if entity_type in self.config.entity_types: for i, pattern in enumerate(patterns): pattern_id = f"{entity_type}_{i}" self.matcher.add(pattern_id, [pattern]) logger.info(f"Added {len(self.custom_patterns)} custom pattern sets") except Exception as e: logger.warning(f"Failed to add some custom patterns: {str(e)}") def extract_entities(self, documents: List[Document]) -> Dict[str, List[Entity]]: """ Extract entities from a list of documents. Args: documents: List of documents to process Returns: Dictionary mapping document IDs to extracted entities """ if not documents: return {} start_time = time.time() try: all_entities = {} # Process documents in batches for efficiency batch_size = self.config.batch_size for i in range(0, len(documents), batch_size): batch = documents[i:i + batch_size] batch_entities = self._extract_batch(batch) all_entities.update(batch_entities) # Update statistics processing_time = time.time() - start_time self.stats["documents_processed"] += len(documents) self.stats["processing_time"] += processing_time self.stats["entities_extracted"] += sum(len(entities) for entities in all_entities.values()) logger.info( f"Extracted entities from {len(documents)} documents in {processing_time:.3f}s " f"({len(documents)/processing_time:.1f} docs/sec)" ) return all_entities except Exception as e: logger.error(f"Entity extraction failed: {str(e)}") raise EntityExtractionError(f"Failed to extract entities: {str(e)}") from e def _extract_batch(self, documents: List[Document]) -> Dict[str, List[Entity]]: """ Extract entities from a batch of documents. Args: documents: Batch of documents to process Returns: Dictionary mapping document IDs to extracted entities """ batch_entities = {} for document in documents: try: entities = self._extract_from_document(document) doc_id = document.metadata.get("id", "unknown") batch_entities[doc_id] = entities except Exception as e: doc_id = document.metadata.get("id", "unknown") logger.warning(f"Failed to extract entities from document {doc_id}: {str(e)}") batch_entities[doc_id] = [] return batch_entities def _extract_from_document(self, document: Document) -> List[Entity]: """ Extract entities from a single document. Args: document: Document to process Returns: List of extracted entities """ if not document.content or not document.content.strip(): return [] # Process text with spaCy doc = self.nlp(document.content) entities = [] # Extract named entities from spaCy NER if hasattr(doc, 'ents'): for ent in doc.ents: if self._is_relevant_entity(ent): entity = Entity( text=ent.text.strip(), label=self._normalize_label(ent.label_), start_pos=ent.start_char, end_pos=ent.end_char, confidence=self._calculate_confidence(ent), document_id=document.metadata.get("id", "unknown"), context=self._extract_context(doc, ent) ) entities.append(entity) # Extract custom pattern matches matches = self.matcher(doc) for match_id, start, end in matches: span = doc[start:end] label = self._get_label_from_match_id(self.nlp.vocab.strings[match_id]) if label and self._meets_confidence_threshold(span): entity = Entity( text=span.text.strip(), label=label, start_pos=span.start_char, end_pos=span.end_char, confidence=self._calculate_pattern_confidence(span), document_id=document.metadata.get("id", "unknown"), context=self._extract_context(doc, span) ) entities.append(entity) # Remove duplicates and apply confidence filtering entities = self._deduplicate_entities(entities) entities = [e for e in entities if e.confidence >= self.config.confidence_threshold] return entities def _is_relevant_entity(self, ent: Any) -> bool: """Check if a spaCy entity is relevant for technical extraction.""" # Map spaCy labels to our entity types relevant_labels = { "ORG": "TECH", # Organizations (often tech companies) "PRODUCT": "TECH", # Products (often technical products) "MISC": "TECH", # Miscellaneous (often technical terms) "GPE": "ARCH", # Geopolitical entities (sometimes architectures) } return ent.label_ in relevant_labels def _normalize_label(self, spacy_label: str) -> str: """Normalize spaCy labels to our entity types.""" label_mapping = { "ORG": "TECH", "PRODUCT": "TECH", "MISC": "TECH", "GPE": "ARCH", } return label_mapping.get(spacy_label, "TECH") def _calculate_confidence(self, ent: Any) -> float: """Calculate confidence score for a spaCy entity.""" # Base confidence on entity properties base_confidence = 0.7 # Boost confidence for longer entities (more specific) length_bonus = min(len(ent.text.split()) * 0.1, 0.2) # Boost confidence for uppercase entities (likely acronyms) if ent.text.isupper() and len(ent.text) > 1: acronym_bonus = 0.1 else: acronym_bonus = 0.0 # Check if it matches our patterns pattern_bonus = 0.1 if self._matches_technical_pattern(ent.text) else 0.0 return min(base_confidence + length_bonus + acronym_bonus + pattern_bonus, 1.0) def _calculate_pattern_confidence(self, span: Any) -> float: """Calculate confidence score for pattern matches.""" # Pattern matches have higher base confidence base_confidence = 0.8 # Exact technical term matches get highest confidence if self._is_exact_technical_term(span.text): return 0.95 return base_confidence def _matches_technical_pattern(self, text: str) -> bool: """Check if text matches common technical patterns.""" technical_patterns = [ r'^rv\d+[a-z]*$', # RV32I, RV64GC, etc. r'^[a-z]+\d+$', # Technical IDs r'^[A-Z]{2,}$', # Acronyms ] for pattern in technical_patterns: if re.match(pattern, text, re.IGNORECASE): return True return False def _is_exact_technical_term(self, text: str) -> bool: """Check if text is an exact technical term.""" exact_terms = { "risc-v", "riscv", "isa", "csr", "mmu", "alu", "fpu", "rv32i", "rv64i", "rv32gc", "rv64gc", "axi", "ahb", "apb" } return text.lower() in exact_terms def _get_label_from_match_id(self, match_id: str) -> Optional[str]: """Extract entity label from matcher ID.""" try: return match_id.split('_')[0] except (IndexError, AttributeError): return None def _meets_confidence_threshold(self, span: Any) -> bool: """Check if span meets confidence threshold.""" # Simple heuristics for pattern matches if len(span.text) < 2: return False if span.text.isdigit(): return False return True def _extract_context(self, doc: Any, entity: Any) -> str: """Extract surrounding context for an entity.""" context_window = 50 # Characters on each side start = max(0, entity.start_char - context_window) end = min(len(doc.text), entity.end_char + context_window) return doc.text[start:end].strip() def _deduplicate_entities(self, entities: List[Entity]) -> List[Entity]: """Remove duplicate entities, keeping the highest confidence ones.""" seen = {} for entity in entities: key = (entity.text.lower(), entity.label, entity.document_id) if key not in seen or entity.confidence > seen[key].confidence: seen[key] = entity return list(seen.values()) def get_entity_types(self) -> List[str]: """Get list of supported entity types.""" return self.config.entity_types def get_statistics(self) -> Dict[str, Any]: """ Get extraction statistics. Returns: Dictionary with extraction statistics """ stats = self.stats.copy() if stats["documents_processed"] > 0: stats["avg_entities_per_document"] = stats["entities_extracted"] / stats["documents_processed"] stats["avg_processing_time_per_document"] = stats["processing_time"] / stats["documents_processed"] else: stats["avg_entities_per_document"] = 0.0 stats["avg_processing_time_per_document"] = 0.0 return stats def reset_statistics(self) -> None: """Reset extraction statistics.""" self.stats = { "documents_processed": 0, "entities_extracted": 0, "processing_time": 0.0, "model_load_time": self.stats["model_load_time"] # Keep model load time }