# jfk_assassination_records_app/summarization.py
import os
from transformers import pipeline, BartTokenizer
from pydantic import BaseModel
# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn") # device=-1 for CPU
# Initialize the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
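# The tokenizer above is only used to count tokens so that chunks stay within the
# model's input window (BART accepts at most 1024 tokens per input).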
class TextRequest(BaseModel):
    text: str
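# TextRequest is not referenced in this script; presumably it is the request body
# schema for an API endpoint defined elsewhere in the app (see the sketch after
# split_text_into_chunks below).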
# Paths
folder_path = "jfk_text" # Folder containing the documents
output_dir = "summaryoutput" # Folder to save summaries
os.makedirs(output_dir, exist_ok=True)
# Get a list of already summarized files
existing_summaries = set(os.listdir(output_dir))
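# Naming convention: "RECORD.md" in jfk_text/ -> "summary_RECORD.md" in summaryoutput/,
# so reruns can resume without redoing finished documents.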
# Split text into meaningful chunks (paragraphs first, falling back to word-level
# splits for oversized paragraphs) so each chunk fits the model's input window
def split_text_into_chunks(text, max_tokens=500):
    paragraphs = text.split("\n\n")  # Split by double newlines (paragraphs)
    chunks = []
    current_chunk = ""
    for paragraph in paragraphs:
        tokens = tokenizer.tokenize(paragraph)
        if len(tokens) > max_tokens:
            # Flush whatever has accumulated first so chunks stay in document order
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # Paragraph alone exceeds the budget: split it by words
            # (word count is used as a rough proxy for token count)
            words = paragraph.split()
            sub_chunks = [" ".join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]
            chunks.extend(sub_chunks)
        elif len(tokenizer.tokenize(current_chunk + " " + paragraph)) > max_tokens:
            # Adding this paragraph would overflow the current chunk: start a new one
            chunks.append(current_chunk.strip())
            current_chunk = paragraph
        else:
            current_chunk += " " + paragraph
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
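# A minimal sketch of how TextRequest could plug into the same chunk-and-summarize
# flow for a single ad-hoc text (e.g. from an API route defined elsewhere). The
# function name and its use here are assumptions for illustration, not part of the
# original app.
def summarize_text(request: TextRequest) -> str:
    chunks = split_text_into_chunks(request.text)
    summaries = []
    for chunk in chunks:
        chunk_tokens = len(tokenizer.tokenize(chunk))
        result = summarizer(
            chunk,
            max_length=min(130, chunk_tokens),
            min_length=min(30, chunk_tokens // 2),
            do_sample=False,
        )
        summaries.append(result[0]["summary_text"])
    return "\n\n".join(summaries)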
# Process each document separately and write one summary file per document
def summarize_all_files():
    for filename in os.listdir(folder_path):
        if filename.endswith(".md"):  # Process Markdown files only
            summary_filename = f"summary_{filename}"
            summary_filepath = os.path.join(output_dir, summary_filename)
            if summary_filename in existing_summaries:
                print(f"⏭ Skipping {filename}, already summarized.")
                continue  # Skip this file
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
            # Split text into model-sized chunks
            chunks = split_text_into_chunks(text)
            # Summarize each chunk
            summaries = []
            for chunk in chunks:
                try:
                    chunk_tokens = len(tokenizer.tokenize(chunk))
                    summary = summarizer(
                        chunk,
                        max_length=min(130, chunk_tokens),      # Do not ask for more tokens than the input has
                        min_length=min(30, chunk_tokens // 2),  # Keep min_length reasonable for short chunks
                        do_sample=False
                    )
                    summaries.append(summary[0]["summary_text"])
                except Exception as e:
                    print(f"Error summarizing chunk in {filename}: {e}")
            # Save the per-document summary
            summary_text = "\n\n".join(summaries)
            with open(summary_filepath, "w", encoding="utf-8") as f:
                f.write(summary_text)
            print(f"✅ Summary saved for {filename} -> {summary_filepath}")
    print("🎉 All files summarized successfully!")
if __name__ == "__main__":
    summarize_all_files()
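# Usage: drop the Markdown documents into jfk_text/ and run `python summarization.py`;
# summaries are written to summaryoutput/, and already-summarized files are skipped on rerun.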