# palisade/src/text_splitter.py
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List
import re
class VehicleManualTextSplitter:
    """Splits vehicle manual pages into overlapping chunks for retrieval."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ".", "。", "!", "?", ",", " ", ""],
            is_separator_regex=False
        )
        self.chunks = []
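
    # RecursiveCharacterTextSplitter tries the separators above in order:
    # paragraph breaks first, then line breaks, then sentence punctuation
    # (including the CJK full stop "。"), commas, spaces, and finally single
    # characters, so a chunk only gets cut mid-word as a last resort. The
    # chunk_overlap characters repeated between adjacent chunks preserve
    # context across boundaries.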

    def split_documents(self, documents: List[Document]) -> List[Document]:
        print(f"Splitting {len(documents)} pages into chunks...")
        # Preprocess before splitting
        processed_docs = self._preprocess_documents(documents)
        # Use LangChain's split_documents method
        self.chunks = self.text_splitter.split_documents(processed_docs)
        # Attach extra metadata to each chunk
        self._add_chunk_metadata()
        print(f"Created {len(self.chunks)} chunks in total")
        print(f"Average chunk size: {self._get_avg_chunk_size():.0f} chars")
        return self.chunks

    def _preprocess_documents(self, documents: List[Document]) -> List[Document]:
        processed = []
        for doc in documents:
            # Collapse runs of whitespace into a single space
            content = re.sub(r'\s+', ' ', doc.page_content)
            # Strip page-number patterns (e.g. "- 123 -", "Page 123")
            content = re.sub(r'-\s*\d+\s*-', '', content)
            content = re.sub(r'Page\s*\d+', '', content, flags=re.IGNORECASE)
            # Skip pages that are too short (table of contents, blank pages, etc.)
            if len(content.strip()) < 50:
                continue
            processed.append(Document(
                page_content=content,
                metadata=doc.metadata
            ))
        return processed
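
    # For a hypothetical raw page such as "Engine oil\n\n  check level   - 42 -",
    # _preprocess_documents yields "Engine oil check level ": the whitespace
    # runs are collapsed and the "- 42 -" page marker is removed.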

    def _add_chunk_metadata(self):
        for i, chunk in enumerate(self.chunks):
            chunk.metadata['chunk_id'] = f"chunk_{i:04d}"
            chunk.metadata['chunk_index'] = i
            # Infer which section the chunk belongs to (keyword based)
            section = self._infer_section(chunk.page_content)
            if section:
                chunk.metadata['section'] = section

    def _infer_section(self, text: str) -> str:
        text_lower = text.lower()
        # Keyword map for the manual's major sections. Section names and
        # keywords stay in Korean because they are matched against the
        # Korean manual text: 엔진=engine, 브레이크=brakes, 타이어=tires,
        # 전기장치=electrical, 안전=safety, 정비=maintenance, 운전=driving.
        section_keywords = {
            '엔진': ['엔진', '시동', '출력', '연료'],
            '브레이크': ['브레이크', '제동', '페달'],
            '타이어': ['타이어', '휠', '공기압'],
            '전기장치': ['배터리', '퓨즈', '램프', '조명'],
            '안전': ['에어백', '안전벨트', '경고등'],
            '정비': ['점검', '교체', '정비', '오일'],
            '운전': ['주행', '운전', '기어', '변속'],
            'ADAS': ['크루즈', '차선', '충돌', '자동']
        }
        for section, keywords in section_keywords.items():
            for keyword in keywords:
                if keyword in text_lower:
                    return section
        return '일반'  # "General" fallback section

    def _get_avg_chunk_size(self) -> float:
        """Compute the average chunk size in characters."""
        if not self.chunks:
            return 0.0
        return sum(len(c.page_content) for c in self.chunks) / len(self.chunks)

    def get_chunk_statistics(self) -> dict:
        if not self.chunks:
            return {}
        lengths = [len(c.page_content) for c in self.chunks]
        return {
            'total_chunks': len(self.chunks),
            'avg_size': sum(lengths) / len(lengths),
            'min_size': min(lengths),
            'max_size': max(lengths),
            'total_chars': sum(lengths),
            'sections': self._count_sections()
        }
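
    # The returned dict has this shape (values are illustrative only):
    #   {'total_chunks': 1234, 'avg_size': 452.1, 'min_size': 58,
    #    'max_size': 500, 'total_chars': 559893,
    #    'sections': {'엔진': 210, '안전': 95, ...}}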

    def _count_sections(self) -> dict:
        sections = {}
        for chunk in self.chunks:
            section = chunk.metadata.get('section', '일반')  # default: "General"
            sections[section] = sections.get(section, 0) + 1
        return sections

    def search_chunks(self, keyword: str, limit: int = 5) -> List[Document]:
        results = []
        for chunk in self.chunks:
            if keyword.lower() in chunk.page_content.lower():
                results.append(chunk)
                if len(results) >= limit:
                    break
        return results
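
# A minimal sketch of how the chunks produced above might be indexed for
# retrieval. This helper is an assumption, not part of the original module:
# it presumes the langchain_community and langchain_huggingface packages are
# installed, and the embedding model name is a placeholder for any
# Korean-capable model.
def build_faiss_index(chunks: List[Document]):
    """Embed chunks and build an in-memory FAISS index (illustrative only)."""
    from langchain_community.vectorstores import FAISS  # assumed dependency
    from langchain_huggingface import HuggingFaceEmbeddings  # assumed dependency

    # Hypothetical model choice; swap in whatever embedding model the project uses.
    embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")
    return FAISS.from_documents(chunks, embeddings)

# Example use: build_faiss_index(splitter.split_documents(docs)) followed by
# index.similarity_search("엔진 오일", k=3) for semantic retrieval.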
# ํ…Œ์ŠคํŠธ ์ฝ”๋“œ
if __name__ == "__main__":
from document_loader import VehicleManualLoader
import os
# ํ˜„์žฌ ๋””๋ ‰ํ† ๋ฆฌ ์„ค์ •
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(current_dir)
pdf_path = os.path.join(project_root, "data", "LX3_2026_ko_KR.pdf")
print("=" * 60)
print("์ฐจ๋Ÿ‰ ๋งค๋‰ด์–ผ ํ…์ŠคํŠธ ๋ถ„ํ•  ํ…Œ์ŠคํŠธ")
print("=" * 60)
try:
# 1. PDF ๋กœ๋“œ
print("\n1. PDF ๋กœ๋”ฉ...")
loader = VehicleManualLoader(pdf_path)
documents = loader.load_pdf()
# 2. ํ…์ŠคํŠธ ๋ถ„ํ• 
print("\n2. ํ…์ŠคํŠธ ๋ถ„ํ•  ์ค‘...")
splitter = VehicleManualTextSplitter(
chunk_size=500, # ํ•œ๊ตญ์–ด ๊ธฐ์ค€ ์•ฝ 2-3๋ฌธ๋‹จ
chunk_overlap=100 # ๋ฌธ๋งฅ ์œ ์ง€๋ฅผ ์œ„ํ•œ ์ค‘๋ณต
)
chunks = splitter.split_documents(documents)
# 3. ํ†ต๊ณ„ ์ถœ๋ ฅ
print("\n3. ์ฒญํฌ ํ†ต๊ณ„:")
stats = splitter.get_chunk_statistics()
print(f" - ์ด ์ฒญํฌ ์ˆ˜: {stats['total_chunks']:,}๊ฐœ")
print(f" - ํ‰๊ท  ํฌ๊ธฐ: {stats['avg_size']:.0f}์ž")
print(f" - ์ตœ์†Œ/์ตœ๋Œ€: {stats['min_size']}์ž / {stats['max_size']}์ž")
print(f" - ์ด ๋ฌธ์ž ์ˆ˜: {stats['total_chars']:,}์ž")
print("\n ์„น์…˜๋ณ„ ๋ถ„ํฌ:")
for section, count in stats['sections'].items():
print(f" - {section}: {count}๊ฐœ")
# 4. ์ƒ˜ํ”Œ ์ฒญํฌ ํ™•์ธ
print("\n4๏ธ. ์ƒ˜ํ”Œ ์ฒญํฌ (์ฒ˜์Œ 3๊ฐœ):")
print("-" * 50)
for i, chunk in enumerate(chunks[:3]):
print(f"\n[์ฒญํฌ {i + 1}]")
print(f"ID: {chunk.metadata.get('chunk_id')}")
print(f"์„น์…˜: {chunk.metadata.get('section')}")
print(f"์›๋ณธ ํŽ˜์ด์ง€: {chunk.metadata.get('page', 'N/A')}")
print(f"๋‚ด์šฉ: {chunk.page_content[:150]}...")
print("-" * 50)
# 5. ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰ ํ…Œ์ŠคํŠธ
print("\n5๏ธ. '์—”์ง„ ์˜ค์ผ' ๊ฒ€์ƒ‰ ํ…Œ์ŠคํŠธ:")
results = splitter.search_chunks("์—”์ง„ ์˜ค์ผ", limit=3)
print(f" ์ฐพ์€ ์ฒญํฌ: {len(results)}๊ฐœ")
for i, chunk in enumerate(results):
print(f" - ์ฒญํฌ {chunk.metadata['chunk_id']}: {chunk.page_content[:100]}...")
except Exception as e:
print(f" ์—๋Ÿฌ ๋ฐœ์ƒ: {e}")
import traceback
traceback.print_exc()