# table_prep.py
# Table preparation for the RAG_AIEXP_01 index: renders parsed tables as text
# and splits oversized ones (repo defaults: max chunk size 4000 chars, 15 rows).
import json
from collections import defaultdict

from huggingface_hub import hf_hub_download, list_repo_files
from llama_index.core import Document

from config import MAX_CHARS_TABLE, MAX_ROWS_TABLE
from my_logging import log_message

# json, defaultdict and the huggingface_hub helpers are only needed by the
# commented-out loader at the bottom of this file.
def create_table_content(table_data):
    """Render a parsed table (dict) as plain text suitable for indexing."""
    doc_id = table_data.get('document_id', table_data.get('document', 'Unknown'))
    table_num = table_data.get('table_number', 'Unknown')
    table_title = table_data.get('table_title', 'Unknown')
    section = table_data.get('section', 'Unknown')

    content = f"Table: {table_num}\n"
    content += f"Title: {table_title}\n"
    content += f"Document: {doc_id}\n"
    content += f"Section: {section}\n"

    headers = table_data.get('headers', [])
    if headers:
        content += f"\nHeaders: {' | '.join(headers)}\n"

    if 'data' in table_data and isinstance(table_data['data'], list):
        content += "\nTable data:\n"
        for row_idx, row in enumerate(table_data['data'], start=1):
            if isinstance(row, dict):
                # Keep "key: value" pairs, skipping empty cell values.
                row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
                content += f"Row {row_idx}: {row_text}\n"

    return content
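
# For reference, a minimal input such as (hypothetical values)
#   {"document_id": "DOC-001", "table_number": "1", "table_title": "Example",
#    "section": "1.1", "headers": ["param", "value"],
#    "data": [{"param": "p1", "value": "1"}]}
# renders as:
#   Table: 1
#   Title: Example
#   Document: DOC-001
#   Section: 1.1
#
#   Headers: param | value
#
#   Table data:
#   Row 1: param: p1 | value: 1
# chunk_table_document() below parses exactly this layout.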
def chunk_table_document(doc, max_chunk_size=MAX_CHARS_TABLE, max_rows_per_chunk=MAX_ROWS_TABLE):
    """Split an oversized table Document into chunks, repeating the header in each."""
    lines = doc.text.strip().split('\n')
    header_lines = []
    data_rows = []
    in_data = False

    # Separate the descriptive header (everything up to and including the
    # "Table data:" marker) from the data rows.
    for line in lines:
        if line.startswith('Table data:'):
            in_data = True
            header_lines.append(line)
        elif in_data and line.startswith('Row'):
            data_rows.append(line)
        elif not in_data:
            header_lines.append(line)

    header = '\n'.join(header_lines) + '\n'
    if not data_rows:
        return [doc]

    chunks = []
    current_rows = []
    current_size = len(header)

    for row in data_rows:
        row_size = len(row) + 1  # +1 for the newline
        # Check both limits: chunk size and row count; flush before overflow.
        if ((current_size + row_size > max_chunk_size
             or len(current_rows) >= max_rows_per_chunk) and current_rows):
            chunk_text = header + '\n'.join(current_rows)
            chunks.append(chunk_text)
            log_message(f"Created table chunk of {len(chunk_text)} chars with {len(current_rows)} rows")
            current_rows = []
            current_size = len(header)
        current_rows.append(row)
        current_size += row_size
        log_message(f"Appended row to current chunk, size now {current_size} chars")  # fires once per row

    # Add final chunk
    if current_rows:
        chunk_text = header + '\n'.join(current_rows)
        chunks.append(chunk_text)
        log_message(f"Created final table chunk of {len(chunk_text)} chars with {len(current_rows)} rows")

    # Wrap each chunk in a Document, carrying over the table metadata.
    chunked_docs = []
    for i, chunk_text in enumerate(chunks):
        chunk_doc = Document(
            text=chunk_text,
            metadata={
                "type": "table",
                "table_number": doc.metadata.get('table_number'),
                "document_id": doc.metadata.get('document_id'),
                "section": doc.metadata.get('section'),
                "chunk_id": i,
                "total_chunks": len(chunks),
                "is_chunked": True
            }
        )
        chunked_docs.append(chunk_doc)

    return chunked_docs
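
# Note: the header block (including the "Table data:" marker) is repeated at
# the top of every chunk, so each chunk stays self-describing at retrieval
# time; chunk_id/total_chunks in the metadata allow reassembly if needed.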
# def table_to_document(table_data, document_id=None):
#     if not isinstance(table_data, dict):
#         return []
#     doc_id = document_id or table_data.get('document_id') or table_data.get('document', 'Unknown')
#     table_num = table_data.get('table_number', 'Unknown')
#     table_title = table_data.get('table_title', 'Unknown')
#     section = table_data.get('section', 'Unknown')
#     table_rows = table_data.get('data', [])
#     if not table_rows:
#         return []
#
#     # Build table content (same layout as create_table_content above)
#     content = f"Table: {table_num}\n"
#     content += f"Title: {table_title}\n"
#     content += f"Document: {doc_id}\n"
#     content += f"Section: {section}\n"
#     headers = table_data.get('headers', [])
#     if headers:
#         content += f"\nHeaders: {' | '.join(headers)}\n"
#     content += "\nTable data:\n"
#     for row_idx, row in enumerate(table_rows, start=1):
#         if isinstance(row, dict):
#             row_text = " | ".join([f"{k}: {v}" for k, v in row.items() if v])
#             content += f"Row {row_idx}: {row_text}\n"
#
#     # Create base document
#     base_doc = Document(
#         text=content,
#         metadata={
#             "type": "table",
#             "table_number": table_num,
#             "document_id": doc_id,
#             "section": section
#         }
#     )
#     if len(content) > MAX_CHARS_TABLE:
#         chunks = chunk_table_document(base_doc)
#         log_message(f"Table {table_num} split into {len(chunks)} chunks")
#         return chunks
#     return [base_doc]
# def load_table_data(repo_id, hf_token, table_data_dir):
#     try:
#         files = list_repo_files(repo_id=repo_id, repo_type="dataset", token=hf_token)
#         table_files = [f for f in files if f.startswith(table_data_dir) and f.endswith('.json')]
#         log_message(f"Found {len(table_files)} table JSON files")
#
#         table_documents = []
#         stats = {
#             'total_tables': 0,
#             'total_size': 0,
#             'by_document': defaultdict(lambda: {'count': 0, 'size': 0})
#         }
#
#         for file_path in table_files:
#             try:
#                 local_path = hf_hub_download(
#                     repo_id=repo_id,
#                     filename=file_path,
#                     repo_type="dataset",
#                     token=hf_token
#                 )
#                 log_message(f"\nProcessing file: {file_path}")
#                 with open(local_path, 'r', encoding='utf-8') as f:
#                     table_data = json.load(f)
#
#                 if isinstance(table_data, dict):
#                     document_id = table_data.get('document', 'unknown')
#                     if 'sheets' in table_data:
#                         sorted_sheets = sorted(
#                             table_data['sheets'],
#                             key=lambda sheet: sheet.get('table_number', '')
#                         )
#                         for sheet in sorted_sheets:
#                             sheet['document'] = document_id
#                             docs_list = table_to_document(sheet, document_id)
#                             table_documents.extend(docs_list)
#                             sheet_size = 0
#                             for doc in docs_list:
#                                 size = len(doc.text)
#                                 stats['total_tables'] += 1
#                                 stats['total_size'] += size
#                                 stats['by_document'][document_id]['count'] += 1
#                                 stats['by_document'][document_id]['size'] += size
#                                 sheet_size += size
#                             log_message(f"Added table {sheet.get('table_number', 'Unknown')} from document {document_id}, {sheet_size} chars")
#                     else:
#                         docs_list = table_to_document(table_data, document_id)
#                         table_documents.extend(docs_list)
#                         for doc in docs_list:
#                             size = len(doc.text)
#                             stats['total_tables'] += 1
#                             stats['total_size'] += size
#                             stats['by_document'][document_id]['count'] += 1
#                             stats['by_document'][document_id]['size'] += size
#             except Exception as e:
#                 log_message(f"❌ ERROR in file {file_path}: {str(e)}")
#                 continue
#
#         # Log summary statistics
#         log_message("\n" + "=" * 60)
#         log_message("TABLE STATISTICS")
#         log_message("=" * 60)
#         log_message(f"Total tables added: {stats['total_tables']}")
#         log_message(f"Total size: {stats['total_size']:,} chars")
#         log_message(f"Average table size: {stats['total_size'] // stats['total_tables'] if stats['total_tables'] > 0 else 0:,} chars")
#         log_message("\nBy document:")
#         for doc_id, doc_stats in sorted(stats['by_document'].items()):
#             log_message(f"  • {doc_id}: {doc_stats['count']} tables, "
#                         f"{doc_stats['size']:,} chars")
#         log_message("=" * 60)
#         return table_documents
#     except Exception as e:
#         log_message(f"❌ CRITICAL ERROR loading table data: {str(e)}")
#         return []
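

# Minimal smoke test, runnable in isolation; the sample dict below is
# hypothetical and only mirrors the field names the parsers above expect.
if __name__ == "__main__":
    sample = {
        "document_id": "DOC-001",
        "table_number": "1",
        "table_title": "Example",
        "section": "1.1",
        "headers": ["param", "value"],
        "data": [{"param": f"p{i}", "value": str(i)} for i in range(1, 11)],
    }
    text = create_table_content(sample)
    doc = Document(
        text=text,
        metadata={"type": "table", "table_number": "1",
                  "document_id": "DOC-001", "section": "1.1"},
    )
    # Force small limits so the splitting path is exercised regardless of config.
    parts = chunk_table_document(doc, max_chunk_size=300, max_rows_per_chunk=3)
    log_message(f"Demo: produced {len(parts)} chunk(s) from a {len(text)}-char table")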