assitantchatbot / utils /docx_prompt_loader.py
aghaai's picture
Initial deployment of Unified Assistant with OpenAI and Hugging Face integration
bd161ec
from docx import Document
from typing import List, Tuple
import os
def _has_list_style(para) -> bool:
    """Return True when the paragraph's style name starts with 'List'."""
    # A paragraph may resolve to no style, and a style may have no name;
    # treat both as "not a list" instead of raising AttributeError.
    style = getattr(para, 'style', None)
    style_name = getattr(style, 'name', None)
    return bool(style_name) and style_name.startswith('List')


def extract_questions_from_docx(docx_path: str) -> List[str]:
    """Extract question-like paragraphs from a .docx file.

    A paragraph is collected when its stripped text is non-empty and either
    ends with '?' or starts with 'Q'. Non-empty paragraphs whose style name
    starts with 'List' (numbered/bulleted items) are collected as well.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The stripped paragraph texts without duplicates: first the paragraphs
        matched by the '?'/'Q' rule in document order, then any list-style
        paragraphs not already included.
    """
    doc = Document(docx_path)
    # Strip each paragraph once and reuse the result for both passes.
    stripped = [(para, para.text.strip()) for para in doc.paragraphs]

    marked = [
        text
        for _, text in stripped
        if text and (text.endswith('?') or text.startswith('Q'))
    ]
    # Text is checked before the style so empty paragraphs never touch it.
    listed = [
        text
        for para, text in stripped
        if text and _has_list_style(para)
    ]

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(marked + listed))
def extract_instructions_from_docx(docx_path: str) -> str:
    """Return the instruction text of a .docx file as one string.

    Every paragraph is stripped of surrounding whitespace; empty paragraphs
    and paragraphs ending with '?' are left out. The remaining paragraphs are
    joined with newlines in document order.
    """
    stripped_lines = (para.text.strip() for para in Document(docx_path).paragraphs)
    kept = [line for line in stripped_lines if line and not line.endswith('?')]
    return '\n'.join(kept)
def find_docx_file_in_folder(folder_path: str) -> str:
    """Find a .docx file in a folder and return its path.

    Entries are examined in sorted name order, so the same file is returned
    on every call when the folder holds several documents. The extension
    match is case-insensitive.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        ``folder_path`` joined with the name of the first matching file.

    Raises:
        FileNotFoundError: If the folder contains no usable .docx file.
    """
    # os.listdir returns entries in arbitrary order; sort for determinism.
    for fname in sorted(os.listdir(folder_path)):
        if not fname.lower().endswith('.docx'):
            continue
        # Word writes "~$name.docx" owner/lock files next to open documents;
        # they are not real documents and cannot be parsed.
        if fname.startswith('~$'):
            continue
        candidate = os.path.join(folder_path, fname)
        # Skip directories that merely happen to be named "*.docx".
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError(f"No .docx file found in {folder_path}")