import os

from transformers import pipeline, BartTokenizer
from pydantic import BaseModel
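
# Batch job: summarize every Markdown file in `folder_path` with
# facebook/bart-large-cnn, writing one summary file per input to `output_dir`.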

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# The same tokenizer measures chunk sizes below; BART's context window is
# 1024 tokens, so chunks must stay safely under that limit.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")


# Pydantic schema for a text payload; defined here but not used by the
# batch job below.
class TextRequest(BaseModel):
    text: str


folder_path = "jfk_text"
output_dir = "summaryoutput"
os.makedirs(output_dir, exist_ok=True)

# Summary files produced by earlier runs; used to skip finished inputs.
existing_summaries = set(os.listdir(output_dir))


def split_text_into_chunks(text, max_tokens=500):
    """Split text on blank lines into chunks of at most ~max_tokens tokens."""
    paragraphs = text.split("\n\n")
    chunks = []
    current_chunk = ""

    for paragraph in paragraphs:
        tokens = tokenizer.tokenize(paragraph)
        if len(tokens) > max_tokens:
            # Flush the accumulated chunk first so chunks keep document order.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # Oversized paragraph: fall back to a word-based split. A word may
            # become several subword tokens, so these sub-chunks are only
            # approximately max_tokens long.
            words = paragraph.split()
            sub_chunks = [' '.join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]
            chunks.extend(sub_chunks)
        else:
            # Greedily merge paragraphs while the combined chunk fits the budget.
            if len(tokenizer.tokenize(current_chunk + " " + paragraph)) > max_tokens:
                chunks.append(current_chunk.strip())
                current_chunk = paragraph
            else:
                current_chunk += " " + paragraph

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks
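
# Illustration (hypothetical input, not from the corpus):
#   split_text_into_chunks("First paragraph.\n\nSecond paragraph.")
# returns ["First paragraph. Second paragraph."], since both paragraphs
# together fit well inside the 500-token budget.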


def summarize_all_files():
    for filename in os.listdir(folder_path):
        if filename.endswith(".md"):
            summary_filename = f"summary_{filename}"
            summary_filepath = os.path.join(output_dir, summary_filename)

            # Skip inputs that already have a summary on disk.
            if summary_filename in existing_summaries:
                print(f"Skipping {filename}, already summarized.")
                continue

            file_path = os.path.join(folder_path, filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            chunks = split_text_into_chunks(text)

            summaries = []
            for chunk in chunks:
                try:
                    # Cap both length bounds by the chunk's own token count so
                    # the model is never asked for a summary longer than its input.
                    summary = summarizer(
                        chunk,
                        max_length=min(130, len(tokenizer.tokenize(chunk))),
                        min_length=min(30, len(tokenizer.tokenize(chunk)) // 2),
                        do_sample=False
                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    # Log and skip a failed chunk rather than aborting the file.
                    print(f"Error summarizing chunk in {filename}: {e}")

            summary_text = "\n\n".join(summaries)
            with open(summary_filepath, "w", encoding="utf-8") as f:
                f.write(summary_text)

            print(f"Summary saved for {filename} -> {summary_filepath}")

    print("All files summarized successfully!")


if __name__ == "__main__":
    summarize_all_files()
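
# To run (assuming this script is saved as, say, summarize_files.py, with the
# jfk_text/ folder alongside it):
#   python summarize_files.py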