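"""Text splitting for the vehicle-manual RAG pipeline.

Wraps RecursiveCharacterTextSplitter to split loaded manual pages into
overlapping chunks, strips page-number artifacts, tags each chunk with an
inferred section, and exposes chunk statistics and simple keyword search.
"""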
import re
from typing import List

from langchain_core.documents import Document
# RecursiveCharacterTextSplitter now ships in the langchain-text-splitters
# package; the legacy "from langchain.text_splitter import ..." path is deprecated.
from langchain_text_splitters import RecursiveCharacterTextSplitter


class VehicleManualTextSplitter:
    """Splits vehicle-manual pages into overlapping, section-tagged chunks."""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 100):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Try paragraph, line, and sentence boundaries before falling back to
        # words and single characters; "。" covers CJK-style full stops that
        # can appear in the source manual.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", ".", "。", "!", "?", ",", " ", ""],
            is_separator_regex=False,
        )

        self.chunks: List[Document] = []

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Preprocess pages, split them into chunks, and annotate the chunks."""
        print(f"Splitting {len(documents)} pages into chunks...")

        processed_docs = self._preprocess_documents(documents)
        self.chunks = self.text_splitter.split_documents(processed_docs)
        self._add_chunk_metadata()

        print(f"Created {len(self.chunks)} chunks in total")
        print(f"Average chunk size: {self._get_avg_chunk_size():.0f} chars")

        return self.chunks

    def _preprocess_documents(self, documents: List[Document]) -> List[Document]:
        """Normalize whitespace, strip page-number artifacts, drop near-empty pages."""
        processed = []

        for doc in documents:
            # Collapse runs of whitespace into single spaces.
            content = re.sub(r'\s+', ' ', doc.page_content)

            # Remove page-number artifacts such as "- 12 -" or "Page 12".
            content = re.sub(r'-\s*\d+\s*-', '', content)
            content = re.sub(r'Page\s*\d+', '', content, flags=re.IGNORECASE)

            # Skip pages with too little text to be useful.
            if len(content.strip()) < 50:
                continue

            processed.append(Document(
                page_content=content,
                metadata=doc.metadata
            ))

        return processed

    def _add_chunk_metadata(self):
        """Attach a stable ID, index, and inferred section to every chunk."""
        for i, chunk in enumerate(self.chunks):
            chunk.metadata['chunk_id'] = f"chunk_{i:04d}"
            chunk.metadata['chunk_index'] = i

            section = self._infer_section(chunk.page_content)
            if section:
                chunk.metadata['section'] = section

    def _infer_section(self, text: str) -> str:
        """Infer a manual section from keyword matches; the first match wins."""
        text_lower = text.lower()

        # Keywords are matched as substrings of the lowercased chunk text.
        # The demo manual is Korean (LX3_2026_ko_KR.pdf), so in practice these
        # lists should hold the corresponding Korean terms; English
        # equivalents are shown here.
        section_keywords = {
            'engine': ['engine', 'ignition', 'output', 'fuel'],
            'brakes': ['brake', 'braking', 'pedal'],
            'tires': ['tire', 'wheel', 'air pressure'],
            'electrical': ['battery', 'fuse', 'lamp', 'lighting'],
            'safety': ['airbag', 'seat belt', 'warning light'],
            'maintenance': ['inspection', 'replacement', 'maintenance', 'oil'],
            'driving': ['driving', 'operation', 'gear', 'shift'],
            'ADAS': ['cruise', 'lane', 'collision', 'automatic']
        }

        for section, keywords in section_keywords.items():
            for keyword in keywords:
                if keyword in text_lower:
                    return section

        return 'general'

    def _get_avg_chunk_size(self) -> float:
        """Compute the average chunk size in characters."""
        if not self.chunks:
            return 0.0
        return sum(len(c.page_content) for c in self.chunks) / len(self.chunks)

    def get_chunk_statistics(self) -> dict:
        """Return size and per-section statistics for the current chunks."""
        if not self.chunks:
            return {}

        lengths = [len(c.page_content) for c in self.chunks]

        return {
            'total_chunks': len(self.chunks),
            'avg_size': sum(lengths) / len(lengths),
            'min_size': min(lengths),
            'max_size': max(lengths),
            'total_chars': sum(lengths),
            'sections': self._count_sections()
        }

    def _count_sections(self) -> dict:
        """Count chunks per inferred section."""
        sections = {}
        for chunk in self.chunks:
            section = chunk.metadata.get('section', 'general')
            sections[section] = sections.get(section, 0) + 1
        return sections

    def search_chunks(self, keyword: str, limit: int = 5) -> List[Document]:
        """Return up to `limit` chunks containing `keyword` (case-insensitive)."""
        results = []
        for chunk in self.chunks:
            if keyword.lower() in chunk.page_content.lower():
                results.append(chunk)
                if len(results) >= limit:
                    break
        return results


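# Downstream, these chunks would typically be embedded into a vector store.
# A minimal sketch, assuming the langchain-chroma and langchain-huggingface
# packages; the embedding model and persist directory are illustrative
# choices, not part of this project.
def build_vector_store(chunks: List[Document], persist_dir: str = "./chroma_db"):
    from langchain_chroma import Chroma
    from langchain_huggingface import HuggingFaceEmbeddings

    # Any embedding model works here; a multilingual one suits a Korean manual.
    embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")
    return Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=persist_dir,
    )

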
if __name__ == "__main__":
    import os

    from document_loader import VehicleManualLoader

    current_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(current_dir)
    pdf_path = os.path.join(project_root, "data", "LX3_2026_ko_KR.pdf")

    print("=" * 60)
    print("Vehicle manual text splitting test")
    print("=" * 60)

    try:
        print("\n1. Loading PDF...")
        loader = VehicleManualLoader(pdf_path)
        documents = loader.load_pdf()

        print("\n2. Splitting text...")
        splitter = VehicleManualTextSplitter(
            chunk_size=500,
            chunk_overlap=100
        )
        chunks = splitter.split_documents(documents)

        print("\n3. Chunk statistics:")
        stats = splitter.get_chunk_statistics()
        print(f"  - Total chunks: {stats['total_chunks']:,}")
        print(f"  - Average size: {stats['avg_size']:.0f} chars")
        print(f"  - Min/max: {stats['min_size']} / {stats['max_size']} chars")
        print(f"  - Total characters: {stats['total_chars']:,}")

        print("\n  Distribution by section:")
        for section, count in stats['sections'].items():
            print(f"  - {section}: {count}")

        print("\n4. Sample chunks (first 3):")
        print("-" * 50)
        for i, chunk in enumerate(chunks[:3]):
            print(f"\n[Chunk {i + 1}]")
            print(f"ID: {chunk.metadata.get('chunk_id')}")
            print(f"Section: {chunk.metadata.get('section')}")
            print(f"Source page: {chunk.metadata.get('page', 'N/A')}")
            print(f"Content: {chunk.page_content[:150]}...")
            print("-" * 50)

        print("\n5. Search test for 'engine oil':")
        results = splitter.search_chunks("engine oil", limit=3)
        print(f"  Found chunks: {len(results)}")
        for chunk in results:
            print(f"  - {chunk.metadata['chunk_id']}: {chunk.page_content[:100]}...")

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()