File size: 1,857 Bytes
fcb8b13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be398ac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os

from typing import List, Dict, Any
from utils.logger import logger
from langchain_text_splitters import RecursiveCharacterTextSplitter

class TextProcessor:
    """Split UTF-8 text files into chunks, each paired with metadata.

    Chunking is delegated to LangChain's ``RecursiveCharacterTextSplitter``,
    configured once in ``__init__`` and reused for every ``process`` call.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """Configure the text splitter.

        Args:
            chunk_size: Passed to the splitter as ``chunk_size``; measured
                with ``len`` (i.e. in characters).
            chunk_overlap: Passed to the splitter as ``chunk_overlap``.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            # Lengths are counted in characters; swap in a token counter
            # here if token-based sizing is needed.
            length_function=len,
            # NOTE(review): start indices appear to be recorded only by the
            # splitter's document-producing methods; process() calls
            # split_text(), which returns plain strings -- confirm whether
            # this flag is still wanted.
            add_start_index=True,
        )

        logger.info(f"TextProcessor initialized with LangChain's RecursiveCharacterTextSplitter (chunk_size={chunk_size}, chunk_overlap={chunk_overlap})")

    @staticmethod
    def _source_stem(file_path: str) -> str:
        """Return the file name without its directory and final extension.

        Only the last extension is removed, so ``report.v1.txt`` and
        ``report.v2.txt`` keep distinct stems (``report.v1`` / ``report.v2``)
        instead of both collapsing to ``report``.
        """
        return os.path.splitext(os.path.basename(file_path))[0]

    def process(self, file_path: str) -> List[Dict[str, Any]]:
        """Read a text file and return its chunks with metadata.

        Args:
            file_path: Path of the file to read; it is opened as UTF-8 text.

        Returns:
            One dict per chunk with the keys ``"content"`` (the chunk text)
            and ``"metadata"`` (``source_id``, ``type``, ``chunk_id``,
            ``content_length``). An empty list is returned if reading or
            splitting raises; the error is logged rather than propagated.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
            logger.info(f"Processing text document: {file_path}")
            split_texts = self.text_splitter.split_text(text)

            # Both values are the same for every chunk of this file, so
            # compute them once instead of once per chunk.
            source_id = os.path.basename(file_path)
            id_prefix = self._source_stem(file_path)

            chunks = [
                {
                    "content": chunk_content,
                    "metadata": {
                        "source_id": source_id,
                        "type": "text",
                        "chunk_id": f"{id_prefix}_chunk_text_{i}",
                        "content_length": len(chunk_content),
                    },
                }
                for i, chunk_content in enumerate(split_texts)
            ]
            logger.info(f"Generated {len(chunks)} text chunks from {file_path}")
            return chunks
        except Exception as e:
            # Deliberate best-effort: one unreadable document is reported
            # and yields no chunks instead of raising to the caller.
            logger.error(f"Error processing text document {file_path}: {e}")
            return []