import gradio as gr
import PyPDF2
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tokenizer model and stop-word list used by generate_answers().
nltk.download('punkt')
nltk.download('stopwords')

# Fallback reply returned whenever no sentence matches the question.
_NO_ANSWER = "I'm sorry, I couldn't find an answer to that question."


def extract_text(file):
    """Return the concatenated text of every page of a PDF.

    Args:
        file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string. ``None`` (nothing uploaded)
            yields an empty string.

    Returns:
        The text of all pages joined in page order; ``""`` when there is
        no file or no page yields any text.
    """
    if file is None:
        return ""

    # Accept both a file wrapper with ``.name`` and a bare path.
    path = getattr(file, "name", file)

    # The context manager guarantees the handle is closed even if parsing
    # fails part-way through the document.
    with open(path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # ``or ""`` guards against pages for which no text is returned.
        return "".join(page.extract_text() or "" for page in reader.pages)


def generate_answers(text, question):
    """Return the sentence of ``text`` most similar to ``question``.

    Sentences are embedded with TF-IDF (English stop words removed), the
    question is projected into the same vector space, and the sentence with
    the highest cosine similarity to the question is returned.

    Args:
        text: Document text to search.
        question: Natural-language question.

    Returns:
        The best-matching sentence, or an apology message when the text or
        question is empty, the text contains only stop words, or no
        sentence shares any term with the question.
    """
    if not text or not question or not question.strip():
        return _NO_ANSWER

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return _NO_ANSWER

    # stop_words must be a list: scikit-learn's parameter validation does
    # not accept a set here.
    vectorizer = TfidfVectorizer(stop_words=list(stopwords.words('english')))
    try:
        sentence_matrix = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised when the vocabulary is empty (e.g. only stop words).
        return _NO_ANSWER

    # Compare the question against each sentence. The fitted vectorizer
    # lower-cases and tokenizes the question the same way as the sentences.
    question_vector = vectorizer.transform([question])
    similarities = cosine_similarity(question_vector, sentence_matrix)[0]

    best_idx = int(similarities.argmax())
    if similarities[best_idx] <= 0:
        # No sentence shares a single weighted term with the question.
        return _NO_ANSWER
    return sentences[best_idx]


# Create the Gradio app interface
def app():
    """Build and launch the "PDF QA Generator" Gradio interface.

    The interface takes a PDF upload and a question, and shows the text
    extracted from the PDF together with the best-matching sentence.
    """

    def predict(file, question):
        """Extract the PDF text and answer ``question`` from it."""
        text = extract_text(file)
        answer = generate_answers(text, question)
        return text, answer

    # Top-level components are used instead of the legacy
    # gr.inputs / gr.outputs namespaces.
    iface = gr.Interface(
        fn=predict,
        inputs=[
            gr.File(label="Upload PDF Document"),
            gr.Textbox(label="Enter a question"),
        ],
        outputs=[
            gr.Textbox(label="Extracted Text"),
            gr.Textbox(label="Answer"),
        ],
        title="PDF QA Generator",
    )
    iface.launch()


if __name__ == '__main__':
    app()