from collections import defaultdict
import json

from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document

from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
from my_logging import log_message

# NOTE: defaultdict, json, hf_hub_download and list_repo_files are only used
# by the commented-out loader functions at the bottom of this file.

def create_table_content(table_data):
    """Flatten a parsed table dict into searchable plain text.

    Field labels are kept in Russian on purpose: they must match the language
    of the indexed corpus and the markers parsed by chunk_table_document().
    """
    doc_id = table_data.get('document_id', table_data.get('document', 'Неизвестно'))
    table_num = table_data.get('table_number', 'Неизвестно')
    table_title = table_data.get('table_title', 'Неизвестно')
    section = table_data.get('section', 'Неизвестно')
    content = f"Таблица: {table_num}\n"
    content += f"Название: {table_title}\n"
    content += f"Документ: {doc_id}\n"
    content += f"Раздел: {section}\n"
    headers = table_data.get('headers', [])
    if headers:
        content += f"\nЗаголовки: {' | '.join(headers)}\n"
    if 'data' in table_data and isinstance(table_data['data'], list):
        content += "\nДанные таблицы:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):
            if isinstance(row, dict):
                # Skip empty cells so rows stay compact.
                row_text = " | ".join(f"{k}: {v}" for k, v in row.items() if v)
                content += f"Строка {row_idx}: {row_text}\n"
    return content
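
# Example of the flattened text this produces, for a hypothetical input
# {'document': 'doc-1', 'table_number': '3', 'headers': ['Имя', 'Значение'],
#  'data': [{'Имя': 'X', 'Значение': '42'}]}:
#
#     Таблица: 3
#     Название: Неизвестно
#     Документ: doc-1
#     Раздел: Неизвестно
#
#     Заголовки: Имя | Значение
#
#     Данные таблицы:
#     Строка 1: Имя: X | Значение: 42
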
def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
    """Split a flat table Document into size- and row-bounded chunks.

    The header block is repeated in every chunk so each piece remains
    self-describing for retrieval.
    """
    lines = doc.text.strip().split('\n')
    header_lines = []
    data_rows = []
    in_data = False
    # Separate the header block from the data rows; the markers below must
    # match the literals emitted by create_table_content().
    for line in lines:
        if line.startswith('Данные таблицы:'):
            in_data = True
            header_lines.append(line)
        elif in_data and line.startswith('Строка'):
            data_rows.append(line)
        elif not in_data:
            header_lines.append(line)
    header = '\n'.join(header_lines) + '\n'
    if not data_rows:
        return [doc]

    chunks = []
    current_rows = []
    current_size = len(header)
    for row in data_rows:
        row_size = len(row) + 1  # +1 for the joining newline
        # Close the current chunk when either the size or the row-count limit
        # would be exceeded.
        limit_hit = (current_size + row_size > max_chunk_size
                     or len(current_rows) >= max_rows_per_chunk)
        if limit_hit and current_rows:
            chunk_text = header + '\n'.join(current_rows)
            chunks.append(chunk_text)
            log_message(f"Created table chunk: {len(chunk_text)} characters, {len(current_rows)} rows")
            current_rows = []
            current_size = len(header)
        current_rows.append(row)
        current_size += row_size
        log_message(f"Appended row to the current chunk, running size {current_size} characters")
    # Flush the final chunk
    if current_rows:
        chunk_text = header + '\n'.join(current_rows)
        chunks.append(chunk_text)
        log_message(f"Created final table chunk: {len(chunk_text)} characters, {len(current_rows)} rows")
    # Wrap each chunk in a Document, carrying over the table metadata
    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_doc = Document(
            text=chunk_text,
            metadata={
                "type": "table",
                "table_number": doc.metadata.get('table_number'),
                "document_id": doc.metadata.get('document_id'),
                "section": doc.metadata.get('section'),
                "chunk_id": i,
                "total_chunks": len(chunks),
                "is_chunked": True
            }
        )
        chunked_docs.append(chunk_doc)
    return chunked_docs
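
# Minimal smoke test: a sketch with hypothetical values, not part of the
# production pipeline. Running the module directly builds one table document
# and forces the chunking path with deliberately small limits.
if __name__ == "__main__":
    sample_table = {
        "document": "demo-doc",
        "table_number": "1",
        "table_title": "Демо",
        "section": "1.1",
        "headers": ["Параметр", "Значение"],
        "data": [{"Параметр": f"p{i}", "Значение": str(i)} for i in range(40)],
    }
    sample_doc = Document(
        text=create_table_content(sample_table),
        metadata={
            "type": "table",
            "table_number": sample_table["table_number"],
            "document_id": sample_table["document"],
            "section": sample_table["section"],
        },
    )
    for chunk in chunk_table_document(sample_doc, max_chunk_size=400, max_rows_per_chunk=10):
        log_message(
            f"chunk {chunk.metadata['chunk_id'] + 1}/{chunk.metadata['total_chunks']}: "
            f"{len(chunk.text)} chars"
        )
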
# def table_to_document(table_data, document_id=None):
#     if not isinstance(table_data, dict):
#         return []
#     doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Неизвестно')
#     table_num = table_data.get('table_number', 'Неизвестно')
#     table_title = table_data.get('table_title', 'Неизвестно')
#     section = table_data.get('section', 'Неизвестно')
#     table_rows = table_data.get('data', [])
#     if not table_rows:
#         return []
#     # Build table content
#     content = f"Таблица: {table_num}\n"
#     content += f"Название: {table_title}\n"
#     content += f"Документ: {doc_id}\n"
#     content += f"Раздел: {section}\n"
#     headers = table_data.get('headers', [])
#     if headers:
#         content += f"\nЗаголовки: {' | '.join(headers)}\n"
#     content += "\nДанные таблицы:\n"
#     for row_idx, row in enumerate(table_rows, start=1):
#         if isinstance(row, dict):
#             row_text = " | ".join(f"{k}: {v}" for k, v in row.items() if v)
#             content += f"Строка {row_idx}: {row_text}\n"
#     # Create base document
#     base_doc = Document(
#         text=content,
#         metadata={
#             "type": "table",
#             "table_number": table_num,
#             "document_id": doc_id,
#             "section": section
#         }
#     )
#     if len(content) > 4000:
#         # Chunk once and return the result (the original chunked twice).
#         chunks = chunk_table_document(base_doc)
#         log_message(f"Table {table_num} split into {len(chunks)} chunks")
#         return chunks
#     return [base_doc]
# def load_table_data(repo_id, hf_token, table_data_dir):
#     try:
#         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
#         table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
#         log_message(f"Found {len(table_files)} table JSON files")
#         table_documents = []
#         stats = {
#             'total_tables': 0,
#             'total_size': 0,
#             'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
#         }
#         for file_path in table_files:
#             try:
#                 # Download into the default HF cache (the original passed
#                 # local_dir='', which resolves to the working directory).
#                 local_path = hf_hub_download(
#                     repo_id=repo_id,
#                     filename=file_path,
#                     repo_type="dataset",
#                     token=hf_token
#                 )
#                 log_message(f"\nProcessing file: {file_path}")
#                 with open(local_path, 'r', encoding='utf-8') as f:
#                     table_data = json.load(f)
#                 if isinstance(table_data, dict):
#                     document_id = table_data.get('document', 'unknown')
#                     if 'sheets' in table_data:
#                         sorted_sheets = sorted(
#                             table_data['sheets'],
#                             key=lambda sheet: sheet.get('table_number', '')
#                         )
#                         for sheet in sorted_sheets:
#                             sheet['document'] = document_id
#                             docs_list = table_to_document(sheet, document_id)
#                             table_documents.extend(docs_list)
#                             for doc in docs_list:
#                                 stats['total_tables'] += 1
#                                 # Measure the text directly: documents carry no
#                                 # 'content_size' metadata field, so the original
#                                 # metadata lookup always returned 0.
#                                 size = len(doc.text)
#                                 stats['total_size'] += size
#                                 stats['by_document'][document_id]['count'] += 1
#                                 stats['by_document'][document_id]['size'] += size
#                             sheet_size = sum(len(d.text) for d in docs_list)
#                             log_message(f"Added table {sheet.get('table_number', 'unknown')} from document {document_id}, size {sheet_size} characters")
#                     else:
#                         docs_list = table_to_document(table_data, document_id)
#                         table_documents.extend(docs_list)
#                         for doc in docs_list:
#                             stats['total_tables'] += 1
#                             size = len(doc.text)
#                             stats['total_size'] += size
#                             stats['by_document'][document_id]['count'] += 1
#                             stats['by_document'][document_id]['size'] += size
#             except Exception as e:
#                 log_message(f"❌ ERROR in file {file_path}: {e}")
#                 continue
#         # Log summary statistics
#         log_message("\n" + "=" * 60)
#         log_message("TABLE STATISTICS")
#         log_message("=" * 60)
#         log_message(f"Total tables added: {stats['total_tables']}")
#         log_message(f"Total size: {stats['total_size']:,} characters")
#         avg_size = stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0
#         log_message(f"Average table size: {avg_size:,} characters")
#         log_message("\nBy document:")
#         for doc_id, doc_stats in sorted(stats['by_document'].items()):
#             log_message(f"  • {doc_id}: {doc_stats['count']} tables, "
#                         f"{doc_stats['size']:,} characters")
#         log_message("=" * 60)
#         return table_documents
#     except Exception as e:
#         log_message(f"❌ CRITICAL ERROR loading table data: {e}")
#         return []