| |
|
| |
|
| | import spacy
|
| | from collections import Counter
|
| | from spacy import displacy
|
| | import re
|
| | from streamlit.components.v1 import html
|
| | import base64
|
| |
|
| | from collections import Counter
|
| | import re
|
| | from ..utils.widget_utils import generate_unique_key
|
| |
|
| | import logging
|
| | logger = logging.getLogger(__name__)
|
| |
|
| |
|
| |
|
# Highlight colour (CSS hex string) for each coarse part-of-speech label, as
# found in ``token.pos_``. Labels that are not listed here (for example
# 'PUNCT') have no entry; get_repeated_words_colors falls back to '#FFFFFF'.
POS_COLORS = {
    'ADJ': '#FFA07A', 'ADP': '#98FB98', 'ADV': '#87CEFA', 'AUX': '#DDA0DD',
    'CCONJ': '#F0E68C', 'DET': '#FFB6C1', 'INTJ': '#FF6347', 'NOUN': '#90EE90',
    'NUM': '#FAFAD2', 'PART': '#D3D3D3', 'PRON': '#FFA500', 'PROPN': '#20B2AA',
    'SCONJ': '#DEB887', 'SYM': '#7B68EE', 'VERB': '#FF69B4', 'X': '#A9A9A9',
}
|
| |
|
# Display names for each part-of-speech label, grouped by two-letter language
# code ('es', 'en', 'fr', 'pt'). Every language maps the same set of labels
# that POS_COLORS uses as keys.
POS_TRANSLATIONS = {
    'es': {
        'ADJ': 'Adjetivo',
        'ADP': 'Preposición',
        'ADV': 'Adverbio',
        'AUX': 'Auxiliar',
        'CCONJ': 'Conjunción Coordinante',
        'DET': 'Determinante',
        'INTJ': 'Interjección',
        'NOUN': 'Sustantivo',
        'NUM': 'Número',
        'PART': 'Partícula',
        'PRON': 'Pronombre',
        'PROPN': 'Nombre Propio',
        'SCONJ': 'Conjunción Subordinante',
        'SYM': 'Símbolo',
        'VERB': 'Verbo',
        'X': 'Otro',
    },
    'en': {
        'ADJ': 'Adjective',
        'ADP': 'Preposition',
        'ADV': 'Adverb',
        'AUX': 'Auxiliary',
        'CCONJ': 'Coordinating Conjunction',
        'DET': 'Determiner',
        'INTJ': 'Interjection',
        'NOUN': 'Noun',
        'NUM': 'Number',
        'PART': 'Particle',
        'PRON': 'Pronoun',
        'PROPN': 'Proper Noun',
        'SCONJ': 'Subordinating Conjunction',
        'SYM': 'Symbol',
        'VERB': 'Verb',
        'X': 'Other',
    },
    'fr': {
        'ADJ': 'Adjectif',
        'ADP': 'Préposition',
        'ADV': 'Adverbe',
        'AUX': 'Auxiliaire',
        'CCONJ': 'Conjonction de Coordination',
        'DET': 'Déterminant',
        'INTJ': 'Interjection',
        'NOUN': 'Nom',
        'NUM': 'Nombre',
        'PART': 'Particule',
        'PRON': 'Pronom',
        'PROPN': 'Nom Propre',
        'SCONJ': 'Conjonction de Subordination',
        'SYM': 'Symbole',
        'VERB': 'Verbe',
        'X': 'Autre',
    },
    'pt': {
        'ADJ': 'Adjetivo',
        'ADP': 'Preposição',
        'ADV': 'Advérbio',
        'AUX': 'Auxiliar',
        'CCONJ': 'Conjunção Coordenativa',
        'DET': 'Determinante',
        'INTJ': 'Interjeição',
        'NOUN': 'Substantivo',
        'NUM': 'Número',
        'PART': 'Partícula',
        'PRON': 'Pronome',
        'PROPN': 'Nome Próprio',
        'SCONJ': 'Conjunção Subordinativa',
        'SYM': 'Símbolo',
        'VERB': 'Verbo',
        'X': 'Outro',
    },
}
|
| |
|
| |
|
def get_repeated_words_colors(doc):
    """Map every repeated word in *doc* to a highlight colour.

    A word is the lower-cased ``token.text``; tokens whose ``pos_`` is
    'PUNCT' are left out of the frequency count. Words seen more than once
    are "repeated".

    Args:
        doc: Re-iterable sequence of tokens exposing ``.text`` and ``.pos_``
            (e.g. a spaCy ``Doc``). It is iterated twice.

    Returns:
        dict: lower-cased word -> colour taken from ``POS_COLORS`` for the
        POS of the word's *last* occurrence, or '#FFFFFF' when that POS has
        no entry. Keys appear in order of first occurrence.
    """
    frequencies = Counter(
        tok.text.lower() for tok in doc if tok.pos_ != 'PUNCT'
    )
    repeated_forms = {form for form, seen in frequencies.items() if seen > 1}

    # On duplicate keys a dict comprehension keeps the first key position and
    # the last value, exactly like assigning inside a loop.
    return {
        tok.text.lower(): POS_COLORS.get(tok.pos_, '#FFFFFF')
        for tok in doc
        if tok.text.lower() in repeated_forms
    }
|
| |
|
| |
|
def highlight_repeated_words(doc, word_colors):
    """Build an HTML string of the document with repeated words highlighted.

    Each token whose lower-cased text is a key of *word_colors* is wrapped in
    a ``<span>`` carrying that colour as its background; every other token is
    emitted as plain text. Token text is HTML-escaped first, so characters
    such as ``<``, ``>`` and ``&`` coming from the analysed text are shown
    literally instead of being interpreted as markup.

    Args:
        doc: Iterable of tokens exposing ``.text`` (e.g. a spaCy ``Doc``).
        word_colors: Mapping of lower-cased token text to a CSS colour
            string, as produced by ``get_repeated_words_colors``.

    Returns:
        str: The (escaped) token texts joined by single spaces.
    """
    # Imported locally: this module binds the top-level name ``html`` to the
    # Streamlit component helper, so the stdlib module is not reachable by
    # that name here.
    from html import escape

    pieces = []
    for token in doc:
        # quote=False: the text lands in element content, never inside an
        # attribute, so only &, < and > need escaping.
        safe_text = escape(token.text, quote=False)
        color = word_colors.get(token.text.lower())
        if color is None:
            pieces.append(safe_text)
        else:
            pieces.append(
                f'<span style="background-color: {color};">{safe_text}</span>'
            )
    return ' '.join(pieces)
|
| |
|
| |
|
| |
|
def generate_arc_diagram(doc):
    """Render a dependency arc diagram for every sentence of *doc*.

    Each sentence from ``doc.sents`` is passed to ``displacy.render`` with
    ``style="dep"`` and a fixed set of layout options. A sentence that fails
    to render is logged and skipped; the remaining sentences are still
    processed.

    Args:
        doc: Document processed by spaCy (must expose ``.sents``).

    Returns:
        list | None: One rendered diagram per successfully rendered sentence,
        in sentence order. ``None`` if an error occurs outside the
        per-sentence rendering (for example while iterating ``doc.sents``).
    """
    render_options = {
        "compact": False,
        "color": "#ffffff",
        "bg": "#0d6efd",
        "font": "Arial",
        "offset_x": 50,
        "distance": 100,
        "arrow_spacing": 12,
        "arrow_width": 2,
        "arrow_stroke": 2,
        "word_spacing": 25,
        "maxZoom": 2,
    }
    diagrams = []

    try:
        for sentence in doc.sents:
            try:
                markup = displacy.render(
                    sentence, style="dep", options=render_options
                )
                diagrams.append(markup)
            except Exception as e:
                # Best effort: one bad sentence must not drop the others.
                logger.error(f"Error al renderizar oración: {str(e)}")
        return diagrams
    except Exception as e:
        logger.error(f"Error general en generate_arc_diagram: {str(e)}")
        return None
|
| |
|
| |
|
| |
|
def get_detailed_pos_analysis(doc):
    """Summarise how often each part-of-speech label occurs in *doc*.

    Args:
        doc: Sized sequence of tokens exposing ``.text`` and ``.pos_``
            (e.g. a spaCy ``Doc``).

    Returns:
        list[dict]: One entry per distinct ``pos_`` value, sorted by
        descending count (ties keep first-seen order). Each entry has:
        'pos' (the label), 'count', 'percentage' (share of ``len(doc)``,
        rounded to two decimals) and 'examples' (texts of up to the first
        five tokens carrying that label).
    """
    token_total = len(doc)
    tally = Counter()
    samples = {}

    # One pass gathers both the counts and the first five example texts.
    for tok in doc:
        label = tok.pos_
        tally[label] += 1
        bucket = samples.setdefault(label, [])
        if len(bucket) < 5:
            bucket.append(tok.text)

    rows = [
        {
            'pos': label,
            'count': seen,
            'percentage': round((seen / token_total) * 100, 2),
            'examples': samples[label],
        }
        for label, seen in tally.items()
    ]
    rows.sort(key=lambda row: row['count'], reverse=True)
    return rows
|
| |
|
| |
|
def get_morphological_analysis(doc):
    """Collect per-token morphological details for the content words of *doc*.

    Only tokens whose ``pos_`` is 'NOUN', 'VERB', 'ADJ' or 'ADV' are
    reported; all other tokens are skipped.

    Args:
        doc: Iterable of tokens (e.g. a spaCy ``Doc``).

    Returns:
        list[dict]: One entry per selected token, in document order, with the
        keys 'text', 'lemma', 'pos', 'tag', 'dep', 'shape', 'is_alpha',
        'is_stop' and 'morph' (the token's ``morph`` converted with ``str``).
    """
    selected_pos = ('NOUN', 'VERB', 'ADJ', 'ADV')
    return [
        {
            'text': tok.text,
            'lemma': tok.lemma_,
            'pos': tok.pos_,
            'tag': tok.tag_,
            'dep': tok.dep_,
            'shape': tok.shape_,
            'is_alpha': tok.is_alpha,
            'is_stop': tok.is_stop,
            'morph': str(tok.morph),
        }
        for tok in doc
        if tok.pos_ in selected_pos
    ]
|
| |
|
| |
|
def get_sentence_structure_analysis(doc):
    """Describe the structure of each sentence in *doc*.

    Args:
        doc: Object exposing ``.sents``; each sentence must be sized,
            iterable over tokens, and expose ``.text`` and ``.root``
            (e.g. spaCy ``Span`` objects).

    Returns:
        list[dict]: One entry per sentence with the keys 'text', 'root'
        (text of the root token), 'root_pos', 'num_tokens', 'num_words'
        (tokens with a truthy ``is_alpha``), 'subjects' (tokens whose
        ``dep_`` contains "subj"), 'objects' (tokens whose ``dep_`` contains
        "obj") and 'verbs' (tokens whose ``pos_`` is "VERB").
    """
    report = []
    for sentence in doc.sents:
        head = sentence.root
        alpha_total = 0
        subject_texts, object_texts, verb_texts = [], [], []

        # Single walk over the sentence; the three tests are independent, so
        # one token may land in more than one list.
        for tok in sentence:
            if tok.is_alpha:
                alpha_total += 1
            if "subj" in tok.dep_:
                subject_texts.append(tok.text)
            if "obj" in tok.dep_:
                object_texts.append(tok.text)
            if tok.pos_ == "VERB":
                verb_texts.append(tok.text)

        report.append({
            'text': sentence.text,
            'root': head.text,
            'root_pos': head.pos_,
            'num_tokens': len(sentence),
            'num_words': alpha_total,
            'subjects': subject_texts,
            'objects': object_texts,
            'verbs': verb_texts,
        })
    return report
|
| |
|
| |
|
def perform_advanced_morphosyntactic_analysis(text, nlp):
    """Run the complete morphosyntactic analysis over *text*.

    The text is processed with *nlp* and the resulting document is handed to
    each analysis helper of this module.

    Args:
        text: Raw text to analyse.
        nlp: Loaded spaCy language pipeline; its ``lang`` attribute is logged
            and it is called on *text* to obtain the document.

    Returns:
        dict | None: On success, a dict with the keys 'doc',
        'pos_analysis', 'morphological_analysis', 'sentence_structure',
        'arc_diagrams', 'repeated_words' and 'highlighted_text'. ``None`` if
        any step raises; the failure is logged together with its traceback.
    """
    try:
        model_lang = nlp.lang
        logger.info(f"Realizando análisis con modelo de idioma: {model_lang}")

        doc = nlp(text)

        # Computed once and shared: the highlighter only reads this mapping,
        # so rebuilding it for 'highlighted_text' would be duplicated work.
        repeated_words = get_repeated_words_colors(doc)

        return {
            'doc': doc,
            'pos_analysis': get_detailed_pos_analysis(doc),
            'morphological_analysis': get_morphological_analysis(doc),
            'sentence_structure': get_sentence_structure_analysis(doc),
            'arc_diagrams': generate_arc_diagram(doc),
            'repeated_words': repeated_words,
            'highlighted_text': highlight_repeated_words(doc, repeated_words),
        }
    except Exception as e:
        # Boundary handler: callers receive None instead of an exception.
        # logger.exception keeps the traceback so the failing step can be
        # identified from the logs.
        logger.exception(f"Error en análisis morfosintáctico: {str(e)}")
        return None
|
| |
|
| |
|
# Public API of this module: the analysis entry point, its helper functions
# and the two POS lookup tables.
__all__ = [
    'perform_advanced_morphosyntactic_analysis',
    'get_repeated_words_colors',
    'highlight_repeated_words',
    'generate_arc_diagram',
    'get_detailed_pos_analysis',
    'get_morphological_analysis',
    'get_sentence_structure_analysis',
    'POS_COLORS',
    'POS_TRANSLATIONS',
]
|
| |
|