Spaces:
Build error
Build error
| # Core Pkgs | |
| import streamlit as st | |
| from transformers import pipeline | |
| from PyPDF2 import PdfFileReader | |
| import docx2txt | |
| import base64 | |
| import re | |
| import sqlite3 | |
| import time | |
| from io import StringIO | |
| import warnings | |
# Silence every warning category for the whole process.
warnings.filterwarnings(action="ignore")
# Timestamp captured once when the module is executed, as YYYYMMDD-HHMMSS.
time_str = time.strftime("%Y%m%d-%H%M%S")
| # Functions that load summarization pipelines from Hugging Face models | |
def bart():
    """Build a summarization pipeline backed by ``facebook/bart-large-cnn``.

    Returns:
        The object returned by ``pipeline`` for the ``summarization`` task.
    """
    return pipeline("summarization", model="facebook/bart-large-cnn")
def t5():
    """Build a summarization pipeline backed by ``t5-base``.

    The same checkpoint id is used for both the model and the tokenizer.

    Returns:
        The object returned by ``pipeline`` for the ``summarization`` task.
    """
    checkpoint = "t5-base"
    return pipeline("summarization", model=checkpoint, tokenizer=checkpoint)
def bart_t5():
    """Build a summarization pipeline backed by ``tuner007/pegasus_summarizer``.

    Note that, despite the function name, the checkpoint id loaded here names
    a Pegasus summarizer rather than a BART or T5 model.

    Returns:
        The object returned by ``pipeline`` for the ``summarization`` task.
    """
    return pipeline("summarization", model="tuner007/pegasus_summarizer")
| # def pegasus(): | |
| # ''' Loading pegasus model using pipeline api ''' | |
| # summarizer = pipeline('summarization',model='google/pegasus-xsum') | |
| # return summarizer | |
def preprocess_plain_text(x):
    """Clean raw text before it is passed on for summarization.

    The steps run in a fixed order: non-ASCII characters are dropped, then
    each pattern below is replaced by a single space.

    Args:
        x: Input string.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    substitutions = (
        r"https*\S+",          # tokens starting with "http" (URLs)
        r"@\S+",               # mentions
        r"#\S+",               # hashtags
        r"\s{2,}",             # runs of two or more whitespace characters
        "[^.,!?A-Za-z0-9]+",   # anything except letters, digits and .,!?
    )
    cleaned = x.encode("ascii", "ignore").decode()
    for pattern in substitutions:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned
def extract_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Passed unchanged to ``PdfFileReader``.

    Returns:
        The page texts joined with no separator between pages.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names and were removed in PyPDF2 3.0 -- confirm the
    # version this app is pinned to.
    reader = PdfFileReader(file)
    return "".join(
        reader.getPage(page_no).extractText()
        for page_no in range(reader.numPages)
    )
def extract_text_from_file(file):
    """Extract text from an uploaded file, dispatching on its MIME type.

    Args:
        file: Uploaded-file object exposing a ``type`` attribute (MIME type
            string). Plain-text uploads must also provide ``getvalue()``
            returning the raw bytes; PDF and DOCX uploads are handed to
            ``extract_pdf`` and ``docx2txt.process`` respectively.

    Returns:
        The text content of the file.

    Raises:
        ValueError: If ``file.type`` is not plain text, PDF or DOCX.
        UnicodeDecodeError: If a plain-text upload is not valid UTF-8.
    """
    mime_type = file.type

    if mime_type == "text/plain":
        # Decode the bytes directly; routing them through StringIO only to
        # read the same string back adds nothing.
        return file.getvalue().decode("utf-8")

    if mime_type == "application/pdf":
        return extract_pdf(file)

    if (
        mime_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        return docx2txt.process(file)

    # No branch matched: fail with a clear message instead of the
    # UnboundLocalError that returning an unassigned local would raise.
    raise ValueError(f"Unsupported file type: {mime_type!r}")
def summary_downloader(raw_text):
    """Render a download link for the summary in the Streamlit page.

    The text is base64-encoded into a ``data:`` URI inside an ``<a>`` tag, and
    the suggested filename embeds the module-level ``time_str``.

    Args:
        raw_text: Summary text offered for download.
    """
    encoded_bytes = base64.b64encode(raw_text.encode())
    b64 = encoded_bytes.decode()
    new_filename = "new_text_file_{}_.txt".format(time_str)
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'

    st.markdown("#### Download Summary as a File ###")
    # The link is raw HTML, so it is passed with unsafe_allow_html enabled.
    st.markdown(href, unsafe_allow_html=True)
# Storage in a database: a single module-level SQLite connection and cursor,
# used by the table helper functions in this file.
conn = sqlite3.connect(
    "summarizer_database.db",
    check_same_thread=False,  # allow use from threads other than the creator
)
c = conn.cursor()
| # SQL helper functions | |
def create_table():
    """Create ``TextTable`` if it does not already exist.

    The table has three columns: ``text_to_summarize`` (TEXT),
    ``summarized_text`` (TEXT) and ``postdate`` (DATE). The statement runs on
    the module-level cursor ``c``.
    """
    ddl = 'CREATE TABLE IF NOT EXISTS TextTable(text_to_summarize TEXT,summarized_text TEXT,postdate DATE)'
    c.execute(ddl)
def add_data(text_to_summarize, summarized_text, postdate):
    """Insert one row into ``TextTable`` and commit it.

    Args:
        text_to_summarize: Value stored in the ``text_to_summarize`` column.
        summarized_text: Value stored in the ``summarized_text`` column.
        postdate: Value stored in the ``postdate`` column.
    """
    row = (text_to_summarize, summarized_text, postdate)
    insert_sql = 'INSERT INTO TextTable(text_to_summarize,summarized_text,postdate) VALUES (?,?,?)'
    # Values are bound through placeholders rather than formatted into the SQL.
    c.execute(insert_sql, row)
    conn.commit()
def view_all_data():
    """Return every row currently stored in ``TextTable``.

    Returns:
        The result of ``fetchall()`` on the module-level cursor after
        selecting all columns of all rows.
    """
    # Cursor.execute returns the cursor itself, so the calls can be chained.
    return c.execute("SELECT * FROM TextTable").fetchall()