# content.py
# Simple content registry for two modes: "career" and "personal".
# Auto-loads PDF/TXT/MD files from folders and concatenates text for prompts.

import os
import glob
from dataclasses import dataclass
from typing import List, Dict, Iterable, Optional

from pypdf import PdfReader


@dataclass
class Doc:
    domain: str  # "career" or "personal"
    title: str
    text: str
    source_path: str


class ContentStore:
    def __init__(self):
        self.docs: List[Doc] = []
        self.by_domain: Dict[str, List[Doc]] = {}

    # ---------- Loading ----------

    def add_doc(self, doc: Doc):
        self.docs.append(doc)
        self.by_domain.setdefault(doc.domain, []).append(doc)

    def load_folder(self, folder: str, domain: str):
        """
        Load all files in a folder into a domain.
        Supported: .pdf, .txt, .md, .markdown
        """
        os.makedirs(folder, exist_ok=True)
        for path in glob.glob(os.path.join(folder, "*")):
            if os.path.isdir(path):
                continue
            text = self._extract_text(path)
            if not text:
                continue
            title = os.path.basename(path)
            self.add_doc(Doc(domain=domain, title=title, text=text, source_path=path))

    def _extract_text(self, path: str) -> str:
        lower = path.lower()
        if lower.endswith(".pdf"):
            out = []
            try:
                reader = PdfReader(path)
                for p in reader.pages:
                    t = p.extract_text()
                    if t:
                        out.append(t)
            except Exception:
                return ""
            return "\n".join(out)
        if lower.endswith((".txt", ".md", ".markdown")):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    return f.read()
            except Exception:
                return ""
        return ""

    # ---------- Retrieval ----------

    def join_domain_text(self, domains: Optional[Iterable[str]]) -> str:
        """
        Concatenate documents for the selected domains.
        If domains is None/empty, defaults to ["career"].
        """
        if not domains:
            domains = ["career"]
        chunks: List[str] = []
        for d in domains:
            for doc in self.by_domain.get(d, []):
                chunks.append(f"### {doc.title}\n{doc.text}\n")
        return "\n".join(chunks)