| from pypdf import PdfReader |
| import docx |
| from io import BytesIO |
| import logging |
| from fastapi import HTTPException |
|
|
|
|
def parse_docx(file: BytesIO) -> str:
    """Return the text of every paragraph in a .docx document.

    Args:
        file: Binary stream holding the .docx content; it is handed
            directly to ``docx.Document``.

    Returns:
        The text of each entry in ``doc.paragraphs``, in order, with a
        newline appended after every paragraph (including the last one).
        An empty string if the document has no paragraphs.
    """
    doc = docx.Document(file)
    # Build the result in one pass instead of repeated `+=`, which re-copies
    # the accumulated string for each paragraph.
    return "".join(para.text + "\n" for para in doc.paragraphs)
|
|
|
|
def parse_pdf(file: BytesIO) -> str:
    """Return the extracted text of every page in a PDF, concatenated.

    Args:
        file: Binary stream holding the PDF content; it is handed directly
            to ``PdfReader``.

    Returns:
        The result of ``extract_text()`` for each page, joined in page
        order with no separator.

    Raises:
        HTTPException: With status 500 if opening the PDF or extracting
            text raises any exception. The original exception is attached
            as ``__cause__``.
    """
    try:
        reader = PdfReader(file)
        # Join once rather than growing a string page by page.
        return "".join(page.extract_text() for page in reader.pages)
    except Exception as e:
        # logging.exception records the traceback alongside the message, so
        # the root cause is not lost behind the generic 500 response.
        logging.exception("Error while processing PDF: %s", e)
        raise HTTPException(
            status_code=500, detail="Error processing PDF file") from e
|
|
def parse_txt(file: BytesIO) -> str:
    """Read all remaining bytes from *file* and decode them as UTF-8 text.

    Args:
        file: Binary stream holding the text content.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark, if present, is
        removed so it does not show up as a stray ``\\ufeff`` character.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but skips a leading BOM;
    # input without a BOM is returned unchanged.
    return file.read().decode("utf-8-sig")
|
|
|
|