Spaces:
Running
Running
import os | |
from typing import List, Dict, Any | |
from utils.logger import logger | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
class TextProcessor:
    """Split UTF-8 text files into chunks and attach per-chunk metadata.

    Chunking is delegated to LangChain's ``RecursiveCharacterTextSplitter``,
    configured once in ``__init__`` and reused for every ``process`` call.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """Configure the underlying text splitter.

        Args:
            chunk_size: Target maximum chunk size, measured with ``len``
                (i.e. in characters).
            chunk_overlap: Overlap between consecutive chunks, in the same
                unit as ``chunk_size``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            # Lengths are counted in characters; pass a token counter here
            # to size chunks in tokens instead.
            length_function=len,
            # NOTE(review): process() only keeps the chunk text returned by
            # split_text(), so no start index reaches the chunk metadata.
            # Confirm whether this flag is still wanted.
            add_start_index=True,
        )
        logger.info(f"TextProcessor initialized with LangChain's RecursiveCharacterTextSplitter (chunk_size={chunk_size}, chunk_overlap={chunk_overlap})")

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        """Read a text file and return its chunks with metadata.

        Args:
            file_path: Path of the file to read; it is decoded as UTF-8.

        Returns:
            One dict per chunk, shaped ``{"content": str, "metadata": dict}``.
            The metadata holds ``source_id`` (the file's base name),
            ``type`` (always ``"text"``), ``chunk_id``
            (``"<stem>_chunk_text_<index>"``) and ``content_length``.
            An empty list is returned if reading or splitting fails; the
            error is logged rather than raised.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
            logger.info(f"Processing text document: {file_path}")

            split_texts = self.text_splitter.split_text(text)

            source_id = os.path.basename(file_path)
            # splitext() strips only the final extension, so "report.v1.txt"
            # and "report.v2.txt" get distinct chunk-id prefixes. Splitting
            # on the first "." collapsed both to "report" and made their
            # chunk ids collide.
            stem = os.path.splitext(source_id)[0]

            chunks = [
                {
                    "content": chunk_content,
                    "metadata": {
                        "source_id": source_id,
                        "type": "text",
                        "chunk_id": f"{stem}_chunk_text_{i}",
                        "content_length": len(chunk_content),
                    },
                }
                for i, chunk_content in enumerate(split_texts)
            ]
            logger.info(f"Generated {len(chunks)} text chunks from {file_path}")
            return chunks
        except Exception as e:
            # Best-effort by design: an unreadable or unsplittable file is
            # reported and skipped instead of aborting the caller.
            logger.error(f"Error processing text document {file_path}: {e}")
            return []