from collections import defaultdict
import json

from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document

from my_logging import log_message
from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
def create_table_content(table_data):
    """Render a parsed table dict into a plain-text block for indexing.

    The 'Данные таблицы:' and 'Строка N:' markers emitted here are parsed
    later by chunk_table_document, so the two functions must stay in sync.
    """
    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')

    content = f"Таблица: {table_num}\n"
    content += f"Название: {table_title}\n"
    content += f"Документ: {doc_id}\n"
    content += f"Раздел: {section}\n"

    headers = table_data.get('headers', [])
    if headers:
        content += f"\nЗаголовки: {' | '.join(headers)}\n"

    if 'data' in table_data and isinstance(table_data['data'], list):
        content += "\nДанные таблицы:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):
            if isinstance(row, dict):
                # Keep only non-empty cells, joined as "column: value" pairs.
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
                content += f"Строка {row_idx}: {row_text}\n"
    return content
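
# A minimal sketch of the input shape create_table_content expects; the sample
# dict below is hypothetical and only mirrors the keys the function reads.
def _demo_create_table_content():
    sample_table = {
        "document_id": "DOC-001",  # hypothetical identifiers, for illustration only
        "table_number": "2.1",
        "table_title": "Demo",
        "section": "2",
        "headers": ["Parameter", "Value"],
        "data": [
            {"Parameter": "Mass", "Value": "10 kg"},
            {"Parameter": "Length", "Value": "2 m"},
        ],
    }
    print(create_table_content(sample_table))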
def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
    """Split an oversized table Document into chunks, repeating the header in each."""
    lines = doc.text.strip().split('\n')
    header_lines = []
    data_rows = []
    in_data = False

    # Everything up to and including the 'Данные таблицы:' marker is header;
    # after it, only lines starting with 'Строка' count as data rows.
    for line in lines:
        if line.startswith('Данные таблицы:'):
            in_data = True
            header_lines.append(line)
        elif in_data and line.startswith('Строка'):
            data_rows.append(line)
        elif not in_data:
            header_lines.append(line)

    header = '\n'.join(header_lines) + '\n'
    if not data_rows:
        return [doc]

    chunks = []
    current_rows = []
    current_size = len(header)
    for row in data_rows:
        row_size = len(row) + 1  # +1 for the newline
        # Check both limits (chunk size and row count) before adding the row
        if ((current_size + row_size > max_chunk_size
                or len(current_rows) >= max_rows_per_chunk) and current_rows):
            chunk_text = header + '\n'.join(current_rows)
            chunks.append(chunk_text)
            log_message(f"Created a table chunk of {len(chunk_text)} characters with {len(current_rows)} rows")
            current_rows = []
            current_size = len(header)
        current_rows.append(row)
        current_size += row_size
        log_message(f"Added a row to the current chunk, current size {current_size} characters")

    # Add the final chunk
    if current_rows:
        chunk_text = header + '\n'.join(current_rows)
        chunks.append(chunk_text)
        log_message(f"Created the final table chunk of {len(chunk_text)} characters with {len(current_rows)} rows")

    # Wrap each chunk in a Document, carrying over the table metadata
    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_doc = Document(
            text=chunk_text,
            metadata={
                "type": "table",
                "table_number": doc.metadata.get('table_number'),
                "document_id": doc.metadata.get('document_id'),
                "section": doc.metadata.get('section'),
                "chunk_id": i,
                "total_chunks": len(chunks),
                "is_chunked": True
            }
        )
        chunked_docs.append(chunk_doc)
    return chunked_docs
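
# A minimal sketch of the chunking path, assuming MAX_ROWS_TABLE from config is
# small enough that 100 generated rows overflow one chunk; all values here are
# hypothetical.
def _demo_chunk_table_document():
    text = create_table_content({
        "document_id": "DOC-001",
        "table_number": "2.1",
        "table_title": "Demo",
        "section": "2",
        "headers": ["Parameter", "Value"],
        "data": [{"Parameter": f"p{i}", "Value": str(i)} for i in range(100)],
    })
    base_doc = Document(
        text=text,
        metadata={"type": "table", "table_number": "2.1",
                  "document_id": "DOC-001", "section": "2"},
    )
    for chunk in chunk_table_document(base_doc):
        print(chunk.metadata["chunk_id"], "/", chunk.metadata["total_chunks"],
              "-", len(chunk.text), "characters")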
# def table_to_document(table_data, document_id=None):
#     if not isinstance(table_data, dict):
#         return []
#     doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
#     table_num = table_data.get('table_number', 'Неизвестно')
#     table_title = table_data.get('table_title', 'Неизвестно')
#     section = table_data.get('section', 'Неизвестно')
#     table_rows = table_data.get('data', [])
#     if not table_rows:
#         return []
#     # Build table content
#     content = f"Таблица: {table_num}\n"
#     content += f"Название: {table_title}\n"
#     content += f"Документ: {doc_id}\n"
#     content += f"Раздел: {section}\n"
#     headers = table_data.get('headers', [])
#     if headers:
#         content += f"\nЗаголовки: {' | '.join(headers)}\n"
#     content += "\nДанные таблицы:\n"
#     for row_idx, row in enumerate(table_rows, start=1):
#         if isinstance(row, dict):
#             row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
#             content += f"Строка {row_idx}: {row_text}\n"
#     # Create base document
#     base_doc = Document(
#         text=content,
#         metadata={
#             "type": "table",
#             "table_number": table_num,
#             "document_id": doc_id,
#             "section": section
#         }
#     )
#     if len(content) > 4000:
#         chunks = chunk_table_document(base_doc)
#         log_message(f"Table {table_num} was split into {len(chunks)} chunks")
#         return chunks  # reuse the result instead of chunking twice
#     return [base_doc]
# def load_table_data(repo_id, hf_token, table_data_dir):
#     try:
#         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
#         table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
#         log_message(f"Found {len(table_files)} JSON table files")
#         table_documents = []
#         stats = {
#             'total_tables': 0,
#             'total_size': 0,
#             'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
#         }
#         for file_path in table_files:
#             try:
#                 local_path = hf_hub_download(
#                     repo_id=repo_id,
#                     filename=file_path,
#                     local_dir='',
#                     repo_type="dataset",
#                     token=hf_token
#                 )
#                 log_message(f"\nProcessing file: {file_path}")
#                 with open(local_path, 'r', encoding='utf-8') as f:
#                     table_data = json.load(f)
#                 if isinstance(table_data, dict):
#                     document_id = table_data.get('document', 'unknown')
#                     if 'sheets' in table_data:
#                         sorted_sheets = sorted(
#                             table_data['sheets'],
#                             key=lambda sheet: sheet.get('table_number', '')
#                         )
#                         for sheet in sorted_sheets:
#                             sheet['document'] = document_id
#                             docs_list = table_to_document(sheet, document_id)
#                             table_documents.extend(docs_list)
#                             for doc in docs_list:
#                                 stats['total_tables'] += 1
#                                 size = len(doc.text)  # metadata carries no 'content_size' key
#                                 stats['total_size'] += size
#                                 stats['by_document'][document_id]['count'] += 1
#                                 stats['by_document'][document_id]['size'] += size
#                             log_message(f"Added table {sheet.get('table_number', 'unknown')} from document {document_id}, {size} characters")
#                     else:
#                         docs_list = table_to_document(table_data, document_id)
#                         table_documents.extend(docs_list)
#                         for doc in docs_list:
#                             stats['total_tables'] += 1
#                             size = len(doc.text)
#                             stats['total_size'] += size
#                             stats['by_document'][document_id]['count'] += 1
#                             stats['by_document'][document_id]['size'] += size
#             except Exception as e:
#                 log_message(f"❌ ERROR in file {file_path}: {str(e)}")
#                 continue
#         # Log summary statistics
#         log_message("\n" + "=" * 60)
#         log_message("TABLE STATISTICS")
#         log_message("=" * 60)
#         log_message(f"Tables added in total: {stats['total_tables']}")
#         log_message(f"Total size: {stats['total_size']:,} characters")
#         log_message(f"Average table size: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} characters")
#         log_message("\nBy document:")
#         for doc_id, doc_stats in sorted(stats['by_document'].items()):
#             log_message(f"  • {doc_id}: {doc_stats['count']} tables, "
#                         f"{doc_stats['size']:,} characters")
#         log_message("=" * 60)
#         return table_documents
#     except Exception as e:
#         log_message(f"❌ CRITICAL ERROR loading table data: {str(e)}")
#         return []