# content.py
# Simple content registry for two modes: "career" and "personal".
# Auto-loads PDF/TXT/MD files from folders and concatenates text for prompts.

import glob
import logging
import os
from dataclasses import dataclass
from typing import Dict, Iterable, List, Optional

try:
    from pypdf import PdfReader
except ImportError:
    # pypdf is only needed for .pdf files; without it the module still loads
    # and handles text/markdown, while PDFs are skipped with a warning.
    PdfReader = None

logger = logging.getLogger(__name__)

# File suffixes (lower-case) that are read directly as UTF-8 text.
_TEXT_SUFFIXES = (".txt", ".md", ".markdown")


@dataclass
class Doc:
    """One loaded document together with the domain it was registered under."""

    domain: str  # "career" or "personal"
    title: str  # file name (basename) when loaded through load_folder
    text: str  # extracted plain text
    source_path: str  # path the text was read from


class ContentStore:
    """In-memory registry of documents, indexed by domain."""

    def __init__(self):
        # Every document in insertion order.
        self.docs: List[Doc] = []
        # The same documents grouped by their ``domain`` value.
        self.by_domain: Dict[str, List[Doc]] = {}

    # ---------- Loading ----------

    def add_doc(self, doc: Doc):
        """Register ``doc`` in the flat list and in its domain bucket."""
        self.docs.append(doc)
        self.by_domain.setdefault(doc.domain, []).append(doc)

    def load_folder(self, folder: str, domain: str):
        """
        Load all files in a folder into a domain.

        Supported: .pdf, .txt, .md, .markdown

        The folder is created if it does not exist. Sub-directories,
        unsupported files, unreadable files and files whose extracted text is
        empty or whitespace-only are skipped.
        """
        os.makedirs(folder, exist_ok=True)
        # glob returns entries in file-system order; sort so the documents
        # (and therefore the prompt text) come out the same on every run.
        for path in sorted(glob.glob(os.path.join(folder, "*"))):
            if os.path.isdir(path):
                continue
            text = self._extract_text(path)
            # A whitespace-only extraction would only add an empty section.
            if not text.strip():
                continue
            title = os.path.basename(path)
            self.add_doc(
                Doc(domain=domain, title=title, text=text, source_path=path)
            )

    def _extract_text(self, path: str) -> str:
        """
        Return the text content of ``path``.

        Best effort: returns "" for unsupported suffixes and for files that
        cannot be read or parsed (a warning is logged in the latter case).
        """
        lower = path.lower()
        if lower.endswith(".pdf"):
            return self._read_pdf(path)
        if lower.endswith(_TEXT_SUFFIXES):
            return self._read_text(path)
        return ""

    @staticmethod
    def _read_pdf(path: str) -> str:
        """Join the text of every PDF page with newlines; "" on any failure."""
        if PdfReader is None:
            logger.warning("Skipping %s: pypdf is not installed", path)
            return ""
        try:
            reader = PdfReader(path)
            pages = [page.extract_text() for page in reader.pages]
        except Exception:
            # Parsing can fail in many library-specific ways; one bad PDF
            # must not abort loading the rest of the folder.
            logger.warning(
                "Could not extract text from PDF %s", path, exc_info=True
            )
            return ""
        # Pages without extractable text are dropped.
        return "\n".join(page_text for page_text in pages if page_text)

    @staticmethod
    def _read_text(path: str) -> str:
        """Read a UTF-8 text file (a leading BOM is dropped); "" on failure."""
        try:
            # utf-8-sig decodes plain UTF-8 identically but strips a BOM,
            # which would otherwise leak into the prompt as "\ufeff".
            with open(path, "r", encoding="utf-8-sig") as f:
                return f.read()
        except (OSError, ValueError):
            # ValueError covers UnicodeDecodeError for non-UTF-8 content.
            logger.warning("Could not read text file %s", path, exc_info=True)
            return ""

    # ---------- Retrieval ----------

    def join_domain_text(self, domains: Optional[Iterable[str]]) -> str:
        """
        Concatenate documents for the selected domains.

        If domains is None/empty, defaults to ["career"]. A single domain
        name passed as a plain string is treated as a one-element selection.
        Each document is rendered as "### <title>\\n<text>\\n" and the
        sections are joined with a newline. Unknown domains contribute
        nothing.
        """
        if isinstance(domains, str):
            # A bare string is itself an iterable of characters; iterating it
            # would look up one-letter domains and silently return "".
            selected = [domains] if domains else []
        else:
            # Materialise so an exhausted iterator counts as empty too.
            selected = list(domains) if domains is not None else []
        if not selected:
            selected = ["career"]

        chunks: List[str] = [
            f"### {doc.title}\n{doc.text}\n"
            for name in selected
            for doc in self.by_domain.get(name, [])
        ]
        return "\n".join(chunks)