Spaces: Running on Zero

update

Browse files
- .gitignore +2 -0
- text_analysis.py +174 -0
.gitignore CHANGED
@@ -1,2 +1,4 @@
 __pycache__/model_utils.cpython-310.pyc
 demo/__pycache__/binary_classifier_demo.cpython-310.pyc
+__pycache__/feature_extraction.cpython-310.pyc
+NN_classifier/__pycache__/simple_binary_classifier.cpython-310.pyc
text_analysis.py ADDED
@@ -0,0 +1,174 @@
+import spacy
+from collections import Counter
+
+nlp = spacy.load("ru_core_news_lg")
+
+def analyze_text(text):
+    doc = nlp(text)
+
+    tokens = [token.text for token in doc]
+    words = [token.text for token in doc if token.is_alpha]
+    unique_words = set(words)
+    stop_words = [token.text for token in doc if token.is_stop]
+    avg_word_length = sum(len(word) for word in words) / len(words) if words else 0
+
+    pos_counts = Counter([token.pos_ for token in doc if token.is_alpha])
+    lemmas = [token.lemma_ for token in doc if token.is_alpha]
+    unique_lemmas = set(lemmas)
+
+    dependencies = Counter([token.dep_ for token in doc if token.dep_ != ""])
+
+    has_noun_chunks = False
+    try:
+        next(doc.noun_chunks, None)
+        has_noun_chunks = True
+    except NotImplementedError:
+        pass
+
+    entities = [(ent.text, ent.label_) for ent in doc.ents]
+    entity_counts = Counter([ent.label_ for ent in doc.ents])
+
+    ttr = len(unique_words) / len(words) if words else 0
+
+    def calculate_simplified_mtld(text_tokens, ttr_threshold=0.72):
+        if len(text_tokens) < 10:
+            return 0
+
+        segments = []
+        current_segment = []
+
+        for token in text_tokens:
+            current_segment.append(token)
+            current_ttr = len(set(current_segment)) / len(current_segment)
+
+            if current_ttr <= ttr_threshold and len(current_segment) >= 10:
+                segments.append(current_segment)
+                current_segment = []
+
+        if current_segment:
+            segments.append(current_segment)
+
+        if not segments:
+            return 0
+
+        return len(text_tokens) / len(segments)
+
+    mtld = calculate_simplified_mtld(words)
+
+    sentences = list(doc.sents)
+    sentence_lengths = [len(sent) for sent in sentences]
+    avg_sentence_length = sum(sentence_lengths) / len(sentences) if sentences else 0
+
+    words_per_sentence = len(words) / len(sentences) if sentences else 0
+
+    def count_syllables_ru(word):
+        return len([c for c in word.lower() if c in 'аеёиоуыэюя'])
+
+    syllables = sum(count_syllables_ru(word) for word in words)
+    syllables_per_word = syllables / len(words) if words else 0
+    flesh_kincaid = 206.835 - 1.3 * words_per_sentence - 60.1 * syllables_per_word
+
+    long_words = [word for word in words if count_syllables_ru(word) > 4]
+    long_words_percent = len(long_words) / len(words) * 100 if words else 0
+
+    sentence_count = len(sentences)
+    question_count = sum(1 for sent in sentences if sent.text.strip().endswith('?'))
+    exclamation_count = sum(1 for sent in sentences if sent.text.strip().endswith('!'))
+
+    coherence_scores = []
+    if len(sentences) > 1:
+        for i in range(len(sentences)-1):
+            if len(sentences[i]) > 0 and len(sentences[i+1]) > 0:
+                try:
+                    if sentences[i].vector_norm > 0 and sentences[i+1].vector_norm > 0:
+                        sim = sentences[i].similarity(sentences[i+1])
+                        coherence_scores.append(sim)
+                except:
+                    pass
+
+    avg_coherence = sum(coherence_scores) / len(coherence_scores) if coherence_scores else 0
+
+    analysis_results = {
+        'basic_stats': {
+            'total_tokens': len(tokens),
+            'total_words': len(words),
+            'unique_words': len(unique_words),
+            'stop_words': len(stop_words),
+            'avg_word_length': avg_word_length
+        },
+        'morphological_analysis': {
+            'pos_distribution': {pos: count for pos, count in pos_counts.items()},
+            'unique_lemmas': len(unique_lemmas),
+            'lemma_word_ratio': len(unique_lemmas) / len(unique_words) if unique_words else 0
+        },
+        'syntactic_analysis': {
+            'dependencies': {dep: count for dep, count in dependencies.most_common(10)},
+            'noun_chunks': has_noun_chunks
+        },
+        'named_entities': {
+            'total_entities': len(entities),
+            'entity_types': {label: count for label, count in entity_counts.items()}
+        },
+        'lexical_diversity': {
+            'ttr': ttr,
+            'mtld': mtld
+        },
+        'text_structure': {
+            'sentence_count': sentence_count,
+            'avg_sentence_length': avg_sentence_length,
+            'question_sentences': question_count,
+            'exclamation_sentences': exclamation_count
+        },
+        'readability': {
+            'words_per_sentence': words_per_sentence,
+            'syllables_per_word': syllables_per_word,
+            'flesh_kincaid_score': flesh_kincaid,
+            'long_words_percent': long_words_percent
+        },
+        'semantic_coherence': {
+            'avg_coherence_score': avg_coherence
+        }
+    }
+
+    return analysis_results
+
+def show_text_analysis(analysis):
+    print("\n📊 TEXT ANALYSIS")
+
+    print("\n=== BASIC STATISTICS ===")
+    print(f"- Total tokens: {analysis['basic_stats']['total_tokens']}")
+    print(f"- Total words: {analysis['basic_stats']['total_words']}")
+    print(f"- Unique words: {analysis['basic_stats']['unique_words']}")
+    print(f"- Stop words: {analysis['basic_stats']['stop_words']}")
+    print(f"- Average word length: {analysis['basic_stats']['avg_word_length']:.2f} characters")
+
+    print("\n=== MORPHOLOGICAL ANALYSIS ===")
+    print("- POS distribution:")
+    for pos, count in analysis['morphological_analysis']['pos_distribution'].items():
+        print(f" • {pos}: {count}")
+    print(f"- Unique lemmas: {analysis['morphological_analysis']['unique_lemmas']}")
+
+    print("\n=== SYNTACTIC ANALYSIS ===")
+    print("- Syntactic dependencies (top-5):")
+    for i, (dep, count) in enumerate(analysis['syntactic_analysis']['dependencies'].items()):
+        if i >= 5:
+            break
+        print(f" • {dep}: {count}")
+
+    print("\n=== NAMED ENTITIES ===")
+    print(f"- Total entities: {analysis['named_entities']['total_entities']}")
+
+    print("\n=== LEXICAL DIVERSITY ===")
+    print(f"- TTR (type-token ratio): {analysis['lexical_diversity']['ttr']:.3f}")
+    print(f"- MTLD (simplified): {analysis['lexical_diversity']['mtld']:.2f}")
+
+    print("\n=== TEXT STRUCTURE ===")
+    print(f"- Sentence count: {analysis['text_structure']['sentence_count']}")
+    print(f"- Average sentence length: {analysis['text_structure']['avg_sentence_length']:.2f} tokens")
+
+    print("\n=== READABILITY ===")
+    print(f"- Flesch-Kincaid score: {analysis['readability']['flesh_kincaid_score']:.2f}")
+    print(f"- Long words percentage: {analysis['readability']['long_words_percent']:.2f}%")
+
+    print(f"\n=== SEMANTIC COHERENCE ===")
+    print(f"- Average coherence between sentences: {analysis['semantic_coherence']['avg_coherence_score']:.3f}")