CoffeBank committed on
Commit
5d2f9ad
·
1 Parent(s): 95b0158
Files changed (1) hide show
  1. demo/binary_classifier_demo.py +100 -22
demo/binary_classifier_demo.py CHANGED
@@ -97,93 +97,171 @@ def run_binary_classifier(text, show_analysis=False):
97
  features = result['features']
98
  text_analysis = result['text_analysis']
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  analysis_md = "## Анализ текста\n\n"
101
 
102
  # Basic statistics
103
  analysis_md += "### Основная статистика\n"
104
  for key, value in text_analysis.get('basic_stats', {}).items():
 
105
  if isinstance(value, float):
106
- analysis_md += f"- {key}: {value:.2f}\n"
107
  else:
108
- analysis_md += f"- {key}: {value}\n"
109
  analysis_md += "\n"
110
 
111
  # Morphological analysis
112
  analysis_md += "### Морфологический анализ\n"
113
  morph_analysis = text_analysis.get('morphological_analysis', {})
114
  for key, value in morph_analysis.items():
 
115
  if key == 'pos_distribution':
116
- analysis_md += "- Распределение частей речи:\n"
117
  for pos, count in value.items():
118
- analysis_md += f" - {pos}: {count}\n"
 
 
 
 
 
 
 
 
 
 
 
119
  elif isinstance(value, float):
120
- analysis_md += f"- {key}: {value:.3f}\n"
121
  else:
122
- analysis_md += f"- {key}: {value}\n"
123
  analysis_md += "\n"
124
 
125
  # Syntactic analysis
126
  analysis_md += "### Синтаксический анализ\n"
127
  synt_analysis = text_analysis.get('syntactic_analysis', {})
128
  for key, value in synt_analysis.items():
 
129
  if key == 'dependencies':
130
- analysis_md += "- Зависимости:\n"
131
  for dep, count in value.items():
132
- analysis_md += f" - {dep}: {count}\n"
 
 
 
 
 
 
 
 
133
  elif isinstance(value, float):
134
- analysis_md += f"- {key}: {value:.3f}\n"
135
  else:
136
- analysis_md += f"- {key}: {value}\n"
137
  analysis_md += "\n"
138
 
139
  # Named entities
140
  analysis_md += "### Именованные сущности\n"
141
  entities = text_analysis.get('named_entities', {})
142
  for key, value in entities.items():
 
143
  if key == 'entity_types':
144
- analysis_md += "- Типы сущностей:\n"
145
  for ent, count in value.items():
146
- analysis_md += f" - {ent}: {count}\n"
 
 
 
 
147
  elif isinstance(value, float):
148
- analysis_md += f"- {key}: {value:.3f}\n"
149
  else:
150
- analysis_md += f"- {key}: {value}\n"
151
  analysis_md += "\n"
152
 
153
  # Lexical diversity
154
  analysis_md += "### Лексическое разнообразие\n"
155
  for key, value in text_analysis.get('lexical_diversity', {}).items():
 
156
  if isinstance(value, float):
157
- analysis_md += f"- {key}: {value:.3f}\n"
158
  else:
159
- analysis_md += f"- {key}: {value}\n"
160
  analysis_md += "\n"
161
 
162
  # Text structure
163
  analysis_md += "### Структура текста\n"
164
  for key, value in text_analysis.get('text_structure', {}).items():
 
165
  if isinstance(value, float):
166
- analysis_md += f"- {key}: {value:.2f}\n"
167
  else:
168
- analysis_md += f"- {key}: {value}\n"
169
  analysis_md += "\n"
170
 
171
  # Readability
172
  analysis_md += "### Читабельность\n"
173
  for key, value in text_analysis.get('readability', {}).items():
 
174
  if isinstance(value, float):
175
- analysis_md += f"- {key}: {value:.2f}\n"
176
  else:
177
- analysis_md += f"- {key}: {value}\n"
178
  analysis_md += "\n"
179
 
180
  # Semantic coherence
181
  analysis_md += "### Семантическая связность\n"
182
  for key, value in text_analysis.get('semantic_coherence', {}).items():
 
183
  if isinstance(value, float):
184
- analysis_md += f"- {key}: {value:.3f}\n"
185
  else:
186
- analysis_md += f"- {key}: {value}\n"
187
 
188
  return gr.Markdown(result_md), gr.Markdown(analysis_md) if analysis_md else None, text
189
 
 
97
  features = result['features']
98
  text_analysis = result['text_analysis']
99
 
100
+ basic_stats_dict = {
101
+ 'total_tokens': 'Количество токенов',
102
+ 'total_words': 'Количество слов',
103
+ 'unique_words': 'Количество уникальных слов',
104
+ 'stop_words': 'Количество стоп-слов',
105
+ 'avg_word_length': 'Средняя длина слова (символов)'
106
+ }
107
+
108
+ morph_dict = {
109
+ 'pos_distribution': 'Распределение частей речи',
110
+ 'unique_lemmas': 'Количество уникальных лемм',
111
+ 'lemma_word_ratio': 'Отношение лемм к словам'
112
+ }
113
+
114
+ synt_dict = {
115
+ 'dependencies': 'Зависимости между словами',
116
+ 'noun_chunks': 'Количество именных групп'
117
+ }
118
+
119
+ entities_dict = {
120
+ 'total_entities': 'Общее количество именованных сущностей',
121
+ 'entity_types': 'Типы именованных сущностей'
122
+ }
123
+
124
+ diversity_dict = {
125
+ 'ttr': 'TTR (отношение типов к токенам)',
126
+ 'mtld': 'MTLD (мера лексического разнообразия)'
127
+ }
128
+
129
+ structure_dict = {
130
+ 'sentence_count': 'Количество предложений',
131
+ 'avg_sentence_length': 'Средняя длина предложения (токенов)',
132
+ 'question_sentences': 'Количество вопросительных предложений',
133
+ 'exclamation_sentences': 'Количество восклицательных предложений'
134
+ }
135
+
136
+ readability_dict = {
137
+ 'words_per_sentence': 'Слов на предложение',
138
+ 'syllables_per_word': 'Слогов на слово',
139
+ 'flesh_kincaid_score': 'Индекс читабельности Флеша-Кинкейда',
140
+ 'long_words_percent': 'Процент длинных слов'
141
+ }
142
+
143
+ semantic_dict = {
144
+ 'avg_coherence_score': 'Средняя связность между предложениями'
145
+ }
146
+
147
  analysis_md = "## Анализ текста\n\n"
148
 
149
  # Basic statistics
150
  analysis_md += "### Основная статистика\n"
151
  for key, value in text_analysis.get('basic_stats', {}).items():
152
+ label = basic_stats_dict.get(key, key)
153
  if isinstance(value, float):
154
+ analysis_md += f"- {label}: {value:.2f}\n"
155
  else:
156
+ analysis_md += f"- {label}: {value}\n"
157
  analysis_md += "\n"
158
 
159
  # Morphological analysis
160
  analysis_md += "### Морфологический анализ\n"
161
  morph_analysis = text_analysis.get('morphological_analysis', {})
162
  for key, value in morph_analysis.items():
163
+ label = morph_dict.get(key, key)
164
  if key == 'pos_distribution':
165
+ analysis_md += f"- {label}:\n"
166
  for pos, count in value.items():
167
+ pos_name = pos
168
+ if pos == 'NOUN': pos_name = 'Существительные'
169
+ elif pos == 'VERB': pos_name = 'Глаголы'
170
+ elif pos == 'ADJ': pos_name = 'Прилагательные'
171
+ elif pos == 'ADV': pos_name = 'Наречия'
172
+ elif pos == 'PROPN': pos_name = 'Имена собственные'
173
+ elif pos == 'DET': pos_name = 'Определители'
174
+ elif pos == 'ADP': pos_name = 'Предлоги'
175
+ elif pos == 'PRON': pos_name = 'Местоимения'
176
+ elif pos == 'CCONJ': pos_name = 'Сочинительные союзы'
177
+ elif pos == 'SCONJ': pos_name = 'Подчинительные союзы'
178
+ analysis_md += f" - {pos_name}: {count}\n"
179
  elif isinstance(value, float):
180
+ analysis_md += f"- {label}: {value:.3f}\n"
181
  else:
182
+ analysis_md += f"- {label}: {value}\n"
183
  analysis_md += "\n"
184
 
185
  # Syntactic analysis
186
  analysis_md += "### Синтаксический анализ\n"
187
  synt_analysis = text_analysis.get('syntactic_analysis', {})
188
  for key, value in synt_analysis.items():
189
+ label = synt_dict.get(key, key)
190
  if key == 'dependencies':
191
+ analysis_md += f"- {label}:\n"
192
  for dep, count in value.items():
193
+ dep_name = dep
194
+ if dep == 'nsubj': dep_name = 'Подлежащие'
195
+ elif dep == 'obj': dep_name = 'Дополнения'
196
+ elif dep == 'amod': dep_name = 'Определения'
197
+ elif dep == 'nmod': dep_name = 'Именные модификаторы'
198
+ elif dep == 'ROOT': dep_name = 'Корневые узлы'
199
+ elif dep == 'punct': dep_name = 'Пунктуация'
200
+ elif dep == 'case': dep_name = 'Падежные маркеры'
201
+ analysis_md += f" - {dep_name}: {count}\n"
202
  elif isinstance(value, float):
203
+ analysis_md += f"- {label}: {value:.3f}\n"
204
  else:
205
+ analysis_md += f"- {label}: {value}\n"
206
  analysis_md += "\n"
207
 
208
  # Named entities
209
  analysis_md += "### Именованные сущности\n"
210
  entities = text_analysis.get('named_entities', {})
211
  for key, value in entities.items():
212
+ label = entities_dict.get(key, key)
213
  if key == 'entity_types':
214
+ analysis_md += f"- {label}:\n"
215
  for ent, count in value.items():
216
+ ent_name = ent
217
+ if ent == 'PER': ent_name = 'Люди'
218
+ elif ent == 'LOC': ent_name = 'Локации'
219
+ elif ent == 'ORG': ent_name = 'Организации'
220
+ analysis_md += f" - {ent_name}: {count}\n"
221
  elif isinstance(value, float):
222
+ analysis_md += f"- {label}: {value:.3f}\n"
223
  else:
224
+ analysis_md += f"- {label}: {value}\n"
225
  analysis_md += "\n"
226
 
227
  # Lexical diversity
228
  analysis_md += "### Лексическое разнообразие\n"
229
  for key, value in text_analysis.get('lexical_diversity', {}).items():
230
+ label = diversity_dict.get(key, key)
231
  if isinstance(value, float):
232
+ analysis_md += f"- {label}: {value:.3f}\n"
233
  else:
234
+ analysis_md += f"- {label}: {value}\n"
235
  analysis_md += "\n"
236
 
237
  # Text structure
238
  analysis_md += "### Структура текста\n"
239
  for key, value in text_analysis.get('text_structure', {}).items():
240
+ label = structure_dict.get(key, key)
241
  if isinstance(value, float):
242
+ analysis_md += f"- {label}: {value:.2f}\n"
243
  else:
244
+ analysis_md += f"- {label}: {value}\n"
245
  analysis_md += "\n"
246
 
247
  # Readability
248
  analysis_md += "### Читабельность\n"
249
  for key, value in text_analysis.get('readability', {}).items():
250
+ label = readability_dict.get(key, key)
251
  if isinstance(value, float):
252
+ analysis_md += f"- {label}: {value:.2f}\n"
253
  else:
254
+ analysis_md += f"- {label}: {value}\n"
255
  analysis_md += "\n"
256
 
257
  # Semantic coherence
258
  analysis_md += "### Семантическая связность\n"
259
  for key, value in text_analysis.get('semantic_coherence', {}).items():
260
+ label = semantic_dict.get(key, key)
261
  if isinstance(value, float):
262
+ analysis_md += f"- {label}: {value:.3f}\n"
263
  else:
264
+ analysis_md += f"- {label}: {value}\n"
265
 
266
  return gr.Markdown(result_md), gr.Markdown(analysis_md) if analysis_md else None, text
267