Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| import spacy | |
| import re | |
| nlp = spacy.load("en_core_web_sm") | |
| def extract_text_from_pdf(pdf_file): | |
| reader = PyPDF2.PdfReader(pdf_file) | |
| text = "" | |
| for page in reader.pages: | |
| text += page.extract_text() | |
| return text | |
| def split_into_clauses(text): | |
| # Preprocess the text | |
| text = re.sub(r'\s+', ' ', text) # Remove extra whitespace | |
| text = re.sub(r'\n+', '\n', text) # Remove extra newlines | |
| # Use spaCy to parse the text | |
| doc = nlp(text) | |
| clauses = [] | |
| current_clause = [] | |
| for sent in doc.sents: | |
| current_clause.append(sent.text) | |
| # Check if this sentence ends a clause | |
| if re.search(r'\d+\.|\([a-z]\)|\([iv]+\)', sent.text) or len(' '.join(current_clause)) > 200: | |
| clauses.append(' '.join(current_clause)) | |
| current_clause = [] | |
| # Add any remaining text as the last clause | |
| if current_clause: | |
| clauses.append(' '.join(current_clause)) | |
| # Post-process clauses | |
| cleaned_clauses = [] | |
| for clause in clauses: | |
| # Remove leading/trailing whitespace and numbers | |
| clause = re.sub(r'^\s*\d+\.?\s*', '', clause.strip()) | |
| if clause: | |
| cleaned_clauses.append(clause) | |
| return cleaned_clauses |