Spaces:
Running
Running
from docx import Document | |
from typing import List, Tuple | |
import os | |
def extract_questions_from_docx(docx_path: str) -> List[str]:
    """Extract question-like paragraphs from a .docx file, in document order.

    A paragraph is kept when its stripped text is non-empty and at least one
    of the following holds:

    * the text ends with '?'
    * the text starts with 'Q'
    * the paragraph's style name starts with 'List' (numbered/bulleted items)

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The stripped text of every matching paragraph, de-duplicated (the
        first occurrence wins) and ordered as the paragraphs appear in the
        document.
    """
    doc = Document(docx_path)
    questions = []
    # One pass over the paragraphs so that list items and '?'/'Q' paragraphs
    # stay interleaved in their original document order.
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        # A paragraph without a style, or with an unnamed style, is simply
        # treated as "not a list item" instead of raising.
        style = para.style
        style_name = (style.name if style is not None else None) or ''
        if (
            text.endswith('?')
            or text.startswith('Q')
            or style_name.startswith('List')
        ):
            questions.append(text)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(questions))
def extract_instructions_from_docx(docx_path: str) -> str:
    """Return the instruction text of a .docx file as a single string.

    Each paragraph's text is stripped of surrounding whitespace. Paragraphs
    that are empty after stripping, or that end with '?', are dropped. The
    remaining paragraphs are joined with newline characters in document
    order.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The kept paragraphs joined by newlines (an empty string when no
        paragraph qualifies).
    """
    stripped_texts = (
        paragraph.text.strip() for paragraph in Document(docx_path).paragraphs
    )
    return '\n'.join(
        line for line in stripped_texts if line and not line.endswith('?')
    )
def find_docx_file_in_folder(folder_path: str) -> str:
    """Find a .docx file in a folder and return its path.

    Directory entries are examined in sorted name order so the result is
    deterministic (os.listdir itself returns entries in arbitrary order).
    The extension match is case-insensitive.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        ``folder_path`` joined with the name of the first matching file.

    Raises:
        FileNotFoundError: If the folder contains no usable .docx file.
    """
    for fname in sorted(os.listdir(folder_path)):
        if not fname.lower().endswith('.docx'):
            continue
        # Word creates "~$name.docx" owner/lock files next to an open
        # document; they are not real documents and cannot be parsed.
        if fname.startswith('~$'):
            continue
        candidate = os.path.join(folder_path, fname)
        # Skip directories that merely happen to be named "*.docx".
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError(f"No .docx file found in {folder_path}")