Spaces:
Running
Running
File size: 2,451 Bytes
9064719 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# content.py
# Simple content registry for two modes: "career" and "personal".
# Auto-loads PDF/TXT/MD files from folders and concatenates text for prompts.
import os
import glob
from dataclasses import dataclass
from typing import List, Dict, Iterable, Optional
from pypdf import PdfReader
@dataclass
class Doc:
    """One loaded document together with the domain it is filed under.

    Instances are plain mutable records; equality and ``repr`` come from
    ``@dataclass`` and compare/show all four fields.
    """

    # Grouping key used by ContentStore.by_domain: "career" or "personal".
    domain: str
    # Heading shown for the document; load_folder uses the file's base name.
    title: str
    # Full extracted text of the document.
    text: str
    # Path the text was read from.
    source_path: str
class ContentStore:
    """In-memory registry of documents, grouped by domain.

    Every document is kept twice: in ``docs`` (overall insertion order) and
    in ``by_domain`` (domain name -> documents of that domain, in insertion
    order). ``join_domain_text`` turns selected domains into one text blob.
    """

    def __init__(self) -> None:
        self.docs: List["Doc"] = []
        self.by_domain: Dict[str, List["Doc"]] = {}

    # ---------- Loading ----------
    def add_doc(self, doc: "Doc") -> None:
        """Register *doc* in the flat list and under its ``domain``."""
        self.docs.append(doc)
        self.by_domain.setdefault(doc.domain, []).append(doc)

    def load_folder(self, folder: str, domain: str) -> None:
        """Load every supported file directly inside *folder* into *domain*.

        Supported: .pdf, .txt, .md, .markdown (matched case-insensitively).
        The folder is created if it does not exist. Sub-directories are not
        descended into, and files that yield no text (unsupported type,
        unreadable, or blank) are skipped. Each document's title is the
        file's base name.
        """
        os.makedirs(folder, exist_ok=True)
        # Escape the folder so characters like "[" or "?" in its name are not
        # interpreted as glob wildcards.
        pattern = os.path.join(glob.escape(folder), "*")
        # glob returns entries in arbitrary filesystem order; sort so the
        # concatenated text is the same on every run and every machine.
        for path in sorted(glob.glob(pattern)):
            if os.path.isdir(path):
                continue
            text = self._extract_text(path)
            # Whitespace-only extractions would add an empty section.
            if not text.strip():
                continue
            self.add_doc(
                Doc(
                    domain=domain,
                    title=os.path.basename(path),
                    text=text,
                    source_path=path,
                )
            )

    def _extract_text(self, path: str) -> str:
        """Return the text content of *path*, or ``""`` if none can be read.

        PDFs are read page by page with ``PdfReader`` and the non-empty page
        texts are joined with newlines. Text/Markdown files are decoded as
        UTF-8. Any other extension, and any read/parse failure, gives ``""``.
        """
        lower = path.lower()
        if lower.endswith(".pdf"):
            try:
                reader = PdfReader(path)
                page_texts = [page.extract_text() for page in reader.pages]
            except Exception:
                # Deliberately broad: loading is best-effort, and a damaged
                # or encrypted PDF must not abort loading of the other files.
                return ""
            return "\n".join(t for t in page_texts if t)
        if lower.endswith((".txt", ".md", ".markdown")):
            try:
                # "utf-8-sig" reads plain UTF-8 identically but drops a
                # leading byte-order mark instead of leaking "\ufeff" into
                # the text.
                with open(path, "r", encoding="utf-8-sig") as f:
                    return f.read()
            except (OSError, ValueError):
                # OSError: missing/unreadable file. ValueError covers
                # UnicodeDecodeError for files that are not valid UTF-8.
                return ""
        return ""

    # ---------- Retrieval ----------
    def join_domain_text(self, domains: Optional[Iterable[str]]) -> str:
        """Concatenate the documents of the selected domains.

        *domains* may be an iterable of domain names or a single name given
        as a plain string. If it is None/empty, it defaults to ["career"].
        Each document is rendered as ``"### <title>\\n<text>\\n"`` and the
        sections are joined with a newline; unknown domains contribute
        nothing.
        """
        if not domains:
            domains = ["career"]
        elif isinstance(domains, str):
            # A bare string would otherwise be iterated character by
            # character and silently match no domain at all.
            domains = [domains]
        chunks: List[str] = []
        for name in domains:
            chunks.extend(
                f"### {doc.title}\n{doc.text}\n"
                for doc in self.by_domain.get(name, [])
            )
        return "\n".join(chunks)
|