# NOTE: the "Spaces: / Sleeping" lines here were Hugging Face Space status
# residue from a web scrape, not part of the program; removed.
import requests
import os
import re
import warnings
import fitz  # PyMuPDF

# Silence warnings globally — notably urllib3's InsecureRequestWarning, since
# the HTTP calls below are made with verify=False.
warnings.filterwarnings("ignore")

from dotenv import load_dotenv
from datasets import load_dataset
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# Load secrets (HF_TOKEN) from a local .env file into the environment.
load_dotenv()

app = FastAPI(
    title="ETSI Specification Splitter API",
    description="API to split and display specifications by their chapters & sub-chapters",
    docs_url="/",  # serve the Swagger UI at the root path
)

# Wide-open CORS: the API is meant to be callable from any front-end.
origins = [
    "*",
]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Pre-indexed specification sections, loaded once at startup.
# Rows carry at least: "doc_id", "section", "content" (see accessors below).
# KeyError here if HF_TOKEN is unset — fail fast rather than run without data.
spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF_TOKEN"])
spec_contents = spec_contents["train"].to_list()
def is_doc_indexed(spec_id: str) -> bool:
    """Return True if *spec_id* has at least one row in the pre-loaded dataset."""
    # Generator form short-circuits on the first match (the original built a
    # full list of True/False values before calling any()).
    return any(spec_id == s["doc_id"] for s in spec_contents)
def get_full_doc(spec_id: str) -> str:
    """Concatenate every indexed section of *spec_id* into one plain-text string.

    Each section is rendered as "<section title>\n<content>"; sections are
    separated by a blank line. Returns "" for an unknown spec_id.
    """
    return "\n\n".join(
        f"{spec['section']}\n{spec['content']}"
        for spec in spec_contents
        if spec["doc_id"] == spec_id
    )
def get_structured_doc(spec_id: str) -> dict:
    """Return {section title: section content} for every indexed row of *spec_id*.

    Returns an empty dict for an unknown spec_id. If the dataset contains
    duplicate section titles for a document, the last one wins (dict semantics,
    same as the original loop).
    """
    return {
        spec["section"]: spec["content"]
        for spec in spec_contents
        if spec["doc_id"] == spec_id
    }
class SpecRequest(BaseModel):
    """Request payload: the identifier of the ETSI specification to process."""

    # Specification identifier, e.g. a document number resolvable by the finder service.
    spec_id: str
def get_pdf_data(request: SpecRequest):
    """Fetch the raw material for a specification.

    Returns one of two shapes (callers must check ``is_doc_indexed`` first):
      - ``str``: the full pre-indexed text, when the spec is already in the dataset;
      - ``(fitz.Document, toc)``: the opened PDF and its table of contents otherwise.

    Raises:
        HTTPException(404): when the finder service cannot resolve the spec id.
    """
    specification = request.spec_id
    if is_doc_indexed(specification):
        return get_full_doc(specification)

    # Resolve the spec id to a download URL via the companion finder service.
    # NOTE(review): verify=False disables TLS certificate checks on both calls
    # below — confirm this is intentional for these hosts.
    finder_response = requests.post(
        "https://organizedprogrammers-etsidocfinder.hf.space/find",
        verify=False,
        headers={"Content-Type": "application/json"},
        json={"doc_id": specification},
        timeout=60,  # don't hang the request forever on a stalled service
    )
    if finder_response.status_code != 200:
        raise HTTPException(404, detail="Not found")
    pdf_url = finder_response.json()['url']

    # Browser-like User-Agent: some document servers reject generic clients.
    response = requests.get(
        pdf_url,
        verify=False,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"},
        timeout=120,
    )
    pdf = fitz.open(stream=response.content, filetype="pdf")
    return pdf, pdf.get_toc()
def get_spec_content(request: SpecRequest):
    """Split a specification into ``{section title: section content}``.

    Pre-indexed documents are served straight from the dataset. Otherwise the
    PDF is downloaded, its text extracted, and the text sliced along the
    numbered entries of the PDF's table of contents. Returns {} when no text
    or no table of contents is available.
    """
    def extract_sections(text, titles):
        """Slice *text* into sections delimited by consecutive *titles*."""
        sections = {}
        # Sort titles by their position of first occurrence in the text.
        sorted_titles = sorted(titles, key=lambda t: text.find(t))
        for i, title in enumerate(sorted_titles):
            start = text.find(title)
            # A section ends where the next title begins, or at end of text.
            end = text.find(sorted_titles[i + 1]) if i + 1 < len(sorted_titles) else len(text)
            body = text[start:end].replace(title, "").strip()
            sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", body)
        return sections

    # BUGFIX: for an already-indexed spec, get_pdf_data returns a plain string,
    # which the (pdf, toc) unpacking below cannot handle — serve the stored
    # structured sections directly instead.
    if is_doc_indexed(request.spec_id):
        return get_structured_doc(request.spec_id)

    print("\n[INFO] Tentative de récupération du texte", flush=True)
    pdf, doc_toc = get_pdf_data(request)

    # Skip front matter: start extraction at the page of the first ToC entry
    # whose title begins with a digit (i.e. the first numbered chapter).
    first = 0
    for level, title, page in doc_toc:
        if title[0].isnumeric():
            first = page - 1
            break

    text = "\n".join(
        "\n".join(line.strip() for line in page.get_text().splitlines())
        for page in pdf[first:]
    )
    if not text or not doc_toc:
        print("\n[ERREUR] Pas de texte/table of contents trouvé !")
        return {}
    print(f"\n[INFO] Texte {request.spec_id} récupéré", flush=True)

    # Keep only numbered ToC titles that literally occur in the extracted text.
    # In the PDF layout the section number and its heading sit on separate
    # lines, hence the first space is replaced with a newline before matching.
    titles = []
    for level, title, page in doc_toc:
        candidate = '\n'.join(title.strip().split(" ", 1))
        if title[0].isnumeric() and candidate in text:
            titles.append(candidate)

    return extract_sections(text, titles)