# meeting-minutes-ai/utils/text_processor.py
from transformers import pipeline
from keybert import KeyBERT
import re
import nltk

# The extractive fallback relies on NLTK's sentence tokenizer; make sure the
# 'punkt' data is available so nltk.sent_tokenize does not fail at runtime.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)
class TextProcessor:
def __init__(self):
print("Initializing Text Processor...")
        # Abstractive summarization via the transformers pipeline; a simple
        # extractive method is used as a fallback if the model cannot load
try:
self.summarizer = pipeline(
"summarization",
model="sshleifer/distilbart-cnn-12-6",
device=-1 # CPU
)
        except Exception:
# Fallback to simple extractive summarization
self.summarizer = None
print("Warning: Summarization model not loaded, using fallback")
# Initialize KeyBERT for keyword extraction
try:
self.kw_model = KeyBERT('paraphrase-multilingual-MiniLM-L12-v2')
        except Exception:
self.kw_model = None
print("Warning: KeyBERT not loaded")
# Action item patterns
self.action_patterns = [
r"akan\s+(\w+)", r"harus\s+(\w+)", r"perlu\s+(\w+)",
r"mohon\s+(\w+)", r"tolong\s+(\w+)", r"segera\s+(\w+)",
r"follow\s*up", r"action\s*item", r"to\s*do", r"deadline"
]
# Decision patterns
self.decision_patterns = [
r"(diputuskan|memutuskan)\s+(.+)",
r"(disepakati|menyepakati)\s+(.+)",
r"(setuju|persetujuan)\s+(.+)",
r"keputusan(?:nya)?\s+(.+)",
r"final(?:isasi)?\s+(.+)"
]
print("Text Processor ready!")
def summarize_transcript(self, transcript_segments, ratio=0.3):
"""Summarization with fallback methods"""
# Combine text from all segments
full_text = ' '.join([seg['text'] for seg in transcript_segments])
if not full_text.strip():
return "No content to summarize."
# Try using the summarization pipeline
if self.summarizer:
try:
                # Split into chunks if the text is too long for the model's input window
                max_chunk_length = 1024  # rough character budget per chunk
if len(full_text) > max_chunk_length:
chunks = self._split_into_chunks(full_text, max_chunk_length)
summaries = []
                    for chunk in chunks[:3]:  # Limit to the first 3 chunks to keep inference time bounded
summary = self.summarizer(
chunk,
max_length=130,
min_length=30,
do_sample=False
)[0]['summary_text']
summaries.append(summary)
return ' '.join(summaries)
else:
return self.summarizer(
full_text,
max_length=150,
min_length=30,
do_sample=False
)[0]['summary_text']
            except Exception:
                # Fall back to the simple extractive summary below
                pass
# Fallback: Simple extractive summarization
return self._simple_extractive_summary(full_text, ratio)
def extract_key_information(self, transcript_segments):
"""Extract action items, decisions, and key topics"""
full_text = ' '.join([seg['text'] for seg in transcript_segments])
# Extract keywords/topics
keywords = []
if self.kw_model:
try:
keywords = self.kw_model.extract_keywords(
full_text,
keyphrase_ngram_range=(1, 3),
                    stop_words=None,  # CountVectorizer has no built-in Indonesian stop word list
top_n=10,
use_mmr=True,
diversity=0.5
)
            except Exception:
                # Fall back to frequency-based keyword extraction below
                pass
# If KeyBERT fails, use simple frequency-based extraction
if not keywords:
keywords = self._extract_keywords_simple(full_text)
# Extract action items and decisions
action_items = []
decisions = []
for segment in transcript_segments:
# Check for action items
if self._is_action_item(segment['text']):
action_items.append({
'text': segment['text'],
'speaker': segment['speaker'],
'timestamp': f"{segment['start']:.1f}s"
})
# Check for decisions
if self._is_decision(segment['text']):
decisions.append({
'text': segment['text'],
'speaker': segment['speaker'],
'timestamp': f"{segment['start']:.1f}s"
})
return {
'keywords': keywords,
'action_items': action_items,
'decisions': decisions
}
def _split_into_chunks(self, text, max_length):
"""Split text into chunks"""
words = text.split()
chunks = []
current_chunk = []
current_length = 0
for word in words:
current_chunk.append(word)
current_length += len(word) + 1
if current_length >= max_length:
chunks.append(' '.join(current_chunk))
current_chunk = []
current_length = 0
if current_chunk:
chunks.append(' '.join(current_chunk))
return chunks
def _simple_extractive_summary(self, text, ratio=0.3):
"""Simple extractive summarization fallback"""
sentences = nltk.sent_tokenize(text)
if len(sentences) <= 3:
return text
# Calculate number of sentences to include
num_sentences = max(3, int(len(sentences) * ratio))
        # Simple scoring: longer sentences score higher, with a boost for
        # sentences near the start and end of the transcript
scored_sentences = []
for i, sent in enumerate(sentences):
# Score based on length and position
score = len(sent.split())
if i < 3: # Boost first sentences
score *= 1.5
if i >= len(sentences) - 2: # Boost last sentences
score *= 1.2
scored_sentences.append((score, sent))
        # Sort by score, select the top sentences, then restore original order
        scored_sentences.sort(key=lambda x: x[0], reverse=True)
        selected = {sent for _, sent in scored_sentences[:num_sentences]}
        return ' '.join(s for s in sentences if s in selected)
def _extract_keywords_simple(self, text):
"""Simple keyword extraction fallback"""
# Remove common words
stopwords = {
'yang', 'dan', 'di', 'ke', 'dari', 'untuk', 'pada', 'adalah',
'ini', 'itu', 'dengan', 'tersebut', 'dalam', 'dapat', 'akan',
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to',
'for', 'of', 'with', 'as', 'is', 'was', 'are', 'were'
}
        # Count word frequencies, skipping stop words and very short tokens
        words = re.findall(r'\b\w+\b', text.lower())
        if not words:
            return []
        word_freq = {}
        for word in words:
            if len(word) > 3 and word not in stopwords:
                word_freq[word] = word_freq.get(word, 0) + 1
        # Take the ten most frequent words
        keywords = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
        # Normalize counts so the output resembles KeyBERT's (keyword, score) tuples
        return [(word, freq / len(words)) for word, freq in keywords]
def _is_action_item(self, text):
"""Detect if text contains action item"""
text_lower = text.lower()
# Check patterns
for pattern in self.action_patterns:
if re.search(pattern, text_lower):
return True
        # Check for imperative sentences (first word is a directive verb)
        tokens = text_lower.split()
        first_word = tokens[0].strip('.,!?:;') if tokens else ""
imperative_verbs = [
'lakukan', 'buat', 'siapkan', 'kirim', 'hubungi',
'follow', 'prepare', 'send', 'contact', 'create'
]
return first_word in imperative_verbs
def _is_decision(self, text):
"""Detect if text contains decision"""
text_lower = text.lower()
for pattern in self.decision_patterns:
if re.search(pattern, text_lower):
return True
return False
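

if __name__ == "__main__":
    # Minimal usage sketch. The sample segments below are illustrative only;
    # in the app the segments come from the transcription/diarization step and
    # carry the same keys this class reads: 'text', 'speaker', and 'start'.
    sample_segments = [
        {'text': 'Kita akan kirim laporan penjualan minggu depan.',
         'speaker': 'SPEAKER_00', 'start': 12.4},
        {'text': 'Diputuskan bahwa rapat lanjutan diadakan hari Senin.',
         'speaker': 'SPEAKER_01', 'start': 45.0},
    ]

    processor = TextProcessor()
    print(processor.summarize_transcript(sample_segments))
    info = processor.extract_key_information(sample_segments)
    print(info['keywords'])
    print(info['action_items'])
    print(info['decisions'])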