File size: 816 Bytes
4ad299d
9cb30e2
 
 
 
 
4ad299d
9cb30e2
 
 
4ad299d
9cb30e2
 
 
 
 
4ad299d
9cb30e2
 
 
 
 
 
4ad299d
9cb30e2
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import fitz  # PyMuPDF
import pandas as pd
from pptx import Presentation

def extract_text_from_file(v_file_path) -> str:
    """
    Extract the text content of a PDF, PPTX, or CSV file.

    The file type is chosen from the path's extension, compared
    case-insensitively:

    * ``.pdf``  - the text of every page, concatenated in page order.
    * ``.pptx`` - the text of every shape that has a text frame, each
      followed by a newline, in slide/shape order.
    * ``.csv``  - the file parsed by pandas and rendered with
      ``DataFrame.to_string()``.

    Args:
        v_file_path: Path to the file, as a ``str`` or any ``os.PathLike``
            object (e.g. ``pathlib.Path``).

    Returns:
        The extracted text, or an empty string when the extension is not
        one of the supported types.

    Raises:
        Whatever the underlying reader (PyMuPDF, python-pptx, pandas) raises
        for a missing or malformed file; those errors are not caught here.
    """
    # Function-scope import so the block does not depend on the module's
    # import list for this one stdlib helper.
    import os

    # Normalise once so PathLike objects are accepted and the extension
    # checks below do not repeat the lower-casing work.
    v_path = os.fspath(v_file_path)
    v_lowered = v_path.lower()

    if v_lowered.endswith('.pdf'):
        # The context manager closes the document even if a page fails to
        # render its text, which a trailing close() call would not.
        with fitz.open(v_path) as obj_pdf:
            return "".join(obj_page.get_text() for obj_page in obj_pdf)

    if v_lowered.endswith('.pptx'):
        obj_ppt = Presentation(v_path)
        return "".join(
            obj_shape.text_frame.text + "\n"
            for obj_slide in obj_ppt.slides
            for obj_shape in obj_slide.shapes
            if obj_shape.has_text_frame
        )

    if v_lowered.endswith('.csv'):
        return pd.read_csv(v_path).to_string()

    # Unsupported extension: keep the historical "empty text" result.
    return ""