CoffeBank committed
Commit 15639f6 · 1 Parent(s): 20e51d1
Files changed (2)
  1. .gitignore +2 -0
  2. text_analysis.py +174 -0
.gitignore CHANGED
@@ -1,2 +1,4 @@
  __pycache__/model_utils.cpython-310.pyc
  demo/__pycache__/binary_classifier_demo.cpython-310.pyc
+ __pycache__/feature_extraction.cpython-310.pyc
+ NN_classifier/__pycache__/simple_binary_classifier.cpython-310.pyc
text_analysis.py ADDED
@@ -0,0 +1,174 @@
+ import spacy
+ from collections import Counter
+
+ nlp = spacy.load("ru_core_news_lg")
+
+ def analyze_text(text):
+     doc = nlp(text)
+
+     # Basic token statistics
+     tokens = [token.text for token in doc]
+     words = [token.text for token in doc if token.is_alpha]
+     unique_words = set(words)
+     stop_words = [token.text for token in doc if token.is_stop]
+     avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
+
+     # Morphology: part-of-speech distribution and lemmas
+     pos_counts = Counter([token.pos_ for token in doc if token.is_alpha])
+     lemmas = [token.lemma_ for token in doc if token.is_alpha]
+     unique_lemmas = set(lemmas)
+
+     # Syntactic dependency labels
+     dependencies = Counter([token.dep_ for token in doc if token.dep_ != ""])
+
+     # noun_chunks is not implemented for every language; probe it safely
+     has_noun_chunks = False
+     try:
+         next(doc.noun_chunks, None)
+         has_noun_chunks = True
+     except NotImplementedError:
+         pass
+
+     # Named entities
+     entities = [(ent.text, ent.label_) for ent in doc.ents]
+     entity_counts = Counter([ent.label_ for ent in doc.ents])
+
+     # Lexical diversity: type-token ratio
+     ttr = len(unique_words) / len(words) if words else 0
+
+     def calculate_simplified_mtld(text_tokens, ttr_threshold=0.72):
+         # Simplified MTLD: cut the token stream into segments whose TTR has
+         # dropped to the threshold, then report tokens per segment.
+         if len(text_tokens) < 10:
+             return 0
+
+         segments = []
+         current_segment = []
+
+         for token in text_tokens:
+             current_segment.append(token)
+             current_ttr = len(set(current_segment)) / len(current_segment)
+
+             if current_ttr <= ttr_threshold and len(current_segment) >= 10:
+                 segments.append(current_segment)
+                 current_segment = []
+
+         if current_segment:
+             segments.append(current_segment)
+
+         if not segments:
+             return 0
+
+         return len(text_tokens) / len(segments)
+
+     mtld = calculate_simplified_mtld(words)
+
+     # Sentence-level structure
+     sentences = list(doc.sents)
+     sentence_lengths = [len(sent) for sent in sentences]
+     avg_sentence_length = sum(sentence_lengths) / len(sentences) if sentences else 0
+
+     words_per_sentence = len(words) / len(sentences) if sentences else 0
+
+     def count_syllables_ru(word):
+         # Approximate syllable count for Russian: one syllable per vowel letter
+         return len([c for c in word.lower() if c in 'аеёиоуыэюя'])
+
+     # Readability (Flesch-Kincaid formula with coefficients adapted for Russian)
+     syllables = sum(count_syllables_ru(word) for word in words)
+     syllables_per_word = syllables / len(words) if words else 0
+     flesh_kincaid = 206.835 - 1.3 * words_per_sentence - 60.1 * syllables_per_word
+
+     long_words = [word for word in words if count_syllables_ru(word) > 4]
+     long_words_percent = len(long_words) / len(words) * 100 if words else 0
+
+     sentence_count = len(sentences)
+     question_count = sum(1 for sent in sentences if sent.text.strip().endswith('?'))
+     exclamation_count = sum(1 for sent in sentences if sent.text.strip().endswith('!'))
+
+     # Semantic coherence: average vector similarity between adjacent sentences
+     coherence_scores = []
+     if len(sentences) > 1:
+         for i in range(len(sentences) - 1):
+             if len(sentences[i]) > 0 and len(sentences[i + 1]) > 0:
+                 try:
+                     if sentences[i].vector_norm > 0 and sentences[i + 1].vector_norm > 0:
+                         sim = sentences[i].similarity(sentences[i + 1])
+                         coherence_scores.append(sim)
+                 except Exception:
+                     pass
+
+     avg_coherence = sum(coherence_scores) / len(coherence_scores) if coherence_scores else 0
+
+     analysis_results = {
+         'basic_stats': {
+             'total_tokens': len(tokens),
+             'total_words': len(words),
+             'unique_words': len(unique_words),
+             'stop_words': len(stop_words),
+             'avg_word_length': avg_word_length
+         },
+         'morphological_analysis': {
+             'pos_distribution': {pos: count for pos, count in pos_counts.items()},
+             'unique_lemmas': len(unique_lemmas),
+             'lemma_word_ratio': len(unique_lemmas) / len(unique_words) if unique_words else 0
+         },
+         'syntactic_analysis': {
+             'dependencies': {dep: count for dep, count in dependencies.most_common(10)},
+             'noun_chunks': has_noun_chunks
+         },
+         'named_entities': {
+             'total_entities': len(entities),
+             'entity_types': {label: count for label, count in entity_counts.items()}
+         },
+         'lexical_diversity': {
+             'ttr': ttr,
+             'mtld': mtld
+         },
+         'text_structure': {
+             'sentence_count': sentence_count,
+             'avg_sentence_length': avg_sentence_length,
+             'question_sentences': question_count,
+             'exclamation_sentences': exclamation_count
+         },
+         'readability': {
+             'words_per_sentence': words_per_sentence,
+             'syllables_per_word': syllables_per_word,
+             'flesh_kincaid_score': flesh_kincaid,
+             'long_words_percent': long_words_percent
+         },
+         'semantic_coherence': {
+             'avg_coherence_score': avg_coherence
+         }
+     }
+
+     return analysis_results
+
+ def show_text_analysis(analysis):
+     print("\n📊 TEXT ANALYSIS")
+
+     print("\n=== BASIC STATISTICS ===")
+     print(f"- Total tokens: {analysis['basic_stats']['total_tokens']}")
+     print(f"- Total words: {analysis['basic_stats']['total_words']}")
+     print(f"- Unique words: {analysis['basic_stats']['unique_words']}")
+     print(f"- Stop words: {analysis['basic_stats']['stop_words']}")
+     print(f"- Average word length: {analysis['basic_stats']['avg_word_length']:.2f} characters")
+
+     print("\n=== MORPHOLOGICAL ANALYSIS ===")
+     print("- POS distribution:")
+     for pos, count in analysis['morphological_analysis']['pos_distribution'].items():
+         print(f"  • {pos}: {count}")
+     print(f"- Unique lemmas: {analysis['morphological_analysis']['unique_lemmas']}")
+
+     print("\n=== SYNTACTIC ANALYSIS ===")
+     print("- Syntactic dependencies (top-5):")
+     for i, (dep, count) in enumerate(analysis['syntactic_analysis']['dependencies'].items()):
+         if i >= 5:
+             break
+         print(f"  • {dep}: {count}")
+
+     print("\n=== NAMED ENTITIES ===")
+     print(f"- Total entities: {analysis['named_entities']['total_entities']}")
+
+     print("\n=== LEXICAL DIVERSITY ===")
+     print(f"- TTR (type-token ratio): {analysis['lexical_diversity']['ttr']:.3f}")
+     print(f"- MTLD (simplified): {analysis['lexical_diversity']['mtld']:.2f}")
+
+     print("\n=== TEXT STRUCTURE ===")
+     print(f"- Sentence count: {analysis['text_structure']['sentence_count']}")
+     print(f"- Average sentence length: {analysis['text_structure']['avg_sentence_length']:.2f} tokens")
+
+     print("\n=== READABILITY ===")
+     print(f"- Flesch-Kincaid score: {analysis['readability']['flesh_kincaid_score']:.2f}")
+     print(f"- Long words percentage: {analysis['readability']['long_words_percent']:.2f}%")
+
+     print("\n=== SEMANTIC COHERENCE ===")
+     print(f"- Average coherence between sentences: {analysis['semantic_coherence']['avg_coherence_score']:.3f}")
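For context, a minimal usage sketch of the two functions added in this commit (not part of the diff; the sample sentence below is invented for illustration). It assumes the ru_core_news_lg model has been downloaded, e.g. with `python -m spacy download ru_core_news_lg`.

# Hypothetical usage example, not included in the commit.
from text_analysis import analyze_text, show_text_analysis

sample = "Москва является столицей России. Это крупнейший город страны!"
results = analyze_text(sample)

# Print the full formatted report...
show_text_analysis(results)

# ...or read individual metrics from the returned dict.
print(results['lexical_diversity']['ttr'])
print(results['readability']['flesh_kincaid_score'])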