import os

from transformers import pipeline, BartTokenizer
from pydantic import BaseModel

# Initialize the summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # device=-1 for CPU

# Initialize the tokenizer used for chunk-size checks
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
# Request schema (pydantic model); not referenced by the batch job below
class TextRequest(BaseModel):
    text: str
# Paths
folder_path = "jfk_text"      # Folder containing the documents
output_dir = "summaryoutput"  # Folder to save summaries
os.makedirs(output_dir, exist_ok=True)

# Names of files that have already been summarized
existing_summaries = set(os.listdir(output_dir))
# Function to split text into meaningful chunks (paragraphs merged up to max_tokens)
def split_text_into_chunks(text, max_tokens=500):
    paragraphs = text.split("\n\n")  # Split by double newlines (paragraphs)
    chunks = []
    current_chunk = ""
    for paragraph in paragraphs:
        tokens = tokenizer.tokenize(paragraph)
        if len(tokens) > max_tokens:
            # Flush any accumulated text first so chunk order is preserved
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # Split an oversized paragraph on word boundaries
            # (word counts only approximate token counts)
            words = paragraph.split()
            sub_chunks = [" ".join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]
            chunks.extend(sub_chunks)
        else:
            if len(tokenizer.tokenize(current_chunk + " " + paragraph)) > max_tokens:
                chunks.append(current_chunk.strip())
                current_chunk = paragraph
            else:
                current_chunk += " " + paragraph
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
# Process each document separately
def summarize_all_files():
    for filename in os.listdir(folder_path):
        if filename.endswith(".md"):  # Process Markdown files
            summary_filename = f"summary_{filename}"
            summary_filepath = os.path.join(output_dir, summary_filename)

            if summary_filename in existing_summaries:
                print(f"Skipping {filename}, already summarized.")
                continue  # Skip this file

            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()

            # Split text into chunks
            chunks = split_text_into_chunks(text)
            # Summarize each chunk
            summaries = []
            for chunk in chunks:
                try:
                    chunk_tokens = len(tokenizer.tokenize(chunk))
                    summary = summarizer(
                        chunk,
                        max_length=min(130, chunk_tokens),      # Cap max_length at the input length
                        min_length=min(30, chunk_tokens // 2),  # Keep min_length reasonable for short inputs
                        do_sample=False,
                    )
                    summaries.append(summary[0]["summary_text"])
                except Exception as e:
                    print(f"Error summarizing chunk in {filename}: {e}")
            # Save individual document summary
            summary_text = "\n\n".join(summaries)
            with open(summary_filepath, "w", encoding="utf-8") as f:
                f.write(summary_text)

            print(f"Summary saved for {filename} -> {summary_filepath}")

    print("All files summarized successfully!")


if __name__ == "__main__":
    summarize_all_files()
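
# Note: TextRequest is defined but never used by the batch job above, which suggests the
# script was adapted from (or intended as) an HTTP service. Below is a minimal sketch of
# how it could be wired up, assuming FastAPI (not imported in the original) and a
# hypothetical /summarize route; treat the route name and wrapper as illustrative only.

from fastapi import FastAPI  # assumption: FastAPI is available in the Space

app = FastAPI()


@app.post("/summarize")
def summarize_endpoint(request: TextRequest):
    # Reuse the same chunking and summarization logic for a single request body
    chunks = split_text_into_chunks(request.text)
    summaries = []
    for chunk in chunks:
        chunk_tokens = len(tokenizer.tokenize(chunk))
        result = summarizer(
            chunk,
            max_length=min(130, chunk_tokens),
            min_length=min(30, chunk_tokens // 2),
            do_sample=False,
        )
        summaries.append(result[0]["summary_text"])
    return {"summary": "\n\n".join(summaries)}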