# om4r932's picture
# First version
# 219f767
import requests, os, re, warnings, fitz
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
from datasets import load_dataset
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
load_dotenv()
# Application object. The interactive API documentation is served at the root
# path ("/") through ``docs_url``.
app = FastAPI(
    title="ETSI Specification Splitter API",
    description="API to split and display specifications by their chapters & sub-chapters",
    docs_url="/",
)

# Origins allowed to call the API from a browser; "*" means any origin.
origins = [
    "*",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    # Credentialed requests (cookies, HTTP auth) must not be combined with
    # wildcard origins/methods/headers: the CORS rules require explicit values
    # in that case. Nothing in this service relies on cookies or
    # authentication, so credentials are disabled and the wildcards are kept.
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Rows of the "train" split of the specification-content dataset, loaded once
# at import time as a list of dicts. HF_TOKEN must be set in the environment
# (a missing variable raises KeyError here).
spec_contents = load_dataset(
    "OrganizedProgrammers/ETSISpecContent",
    token=os.environ["HF_TOKEN"],
)["train"].to_list()
def is_doc_indexed(spec_id: str) -> bool:
    """Return whether at least one loaded dataset row belongs to *spec_id*.

    Args:
        spec_id: Document identifier compared against each row's ``doc_id``.

    Returns:
        ``True`` if any row of ``spec_contents`` has a matching ``doc_id``,
        ``False`` otherwise (including when ``spec_contents`` is empty).
    """
    # A generator lets any() stop at the first match instead of first
    # materialising a list with one boolean per dataset row.
    return any(spec["doc_id"] == spec_id for spec in spec_contents)
def get_full_doc(spec_id: str):
    """Build the plain-text form of one specification from the dataset rows.

    Each row of ``spec_contents`` whose ``doc_id`` equals *spec_id* contributes
    its ``section`` value, a newline, then its ``content`` value. The pieces
    are joined with a blank line, in dataset order.

    Args:
        spec_id: Document identifier to select rows by.

    Returns:
        The concatenated text, or an empty string when no row matches.
    """
    chunks = [
        f"{row['section']}\n{row['content']}"
        for row in spec_contents
        if row["doc_id"] == spec_id
    ]
    return "\n\n".join(chunks)
def get_structured_doc(spec_id: str):
    """Return one specification as a mapping of section to content.

    Args:
        spec_id: Document identifier to select rows of ``spec_contents`` by.

    Returns:
        A dict keyed by each matching row's ``section`` with its ``content``
        as value; empty when no row matches. If a section occurs more than
        once, the last matching row wins.
    """
    return {
        row["section"]: row["content"]
        for row in spec_contents
        if row["doc_id"] == spec_id
    }
class SpecRequest(BaseModel):
    """Request body identifying the specification to process."""

    # Identifier of the specification; it is compared against the dataset's
    # ``doc_id`` values and sent as ``doc_id`` to the document-finder service.
    spec_id: str
def get_pdf_data(request: SpecRequest, timeout: float = 60.0):
    """Fetch a specification, either from the dataset or as a downloaded PDF.

    Args:
        request: Request whose ``spec_id`` identifies the specification.
        timeout: Timeout in seconds applied to each outgoing HTTP request.

    Returns:
        One of two shapes, which callers must tell apart:

        * a ``str`` (the result of ``get_full_doc``) when the specification is
          already present in ``spec_contents``;
        * a ``(pdf, toc)`` tuple otherwise, where ``pdf`` is the opened PyMuPDF
          document and ``toc`` is ``pdf.get_toc()``. The caller is responsible
          for closing ``pdf``.

    Raises:
        HTTPException: 404 when the finder service does not return a usable
            URL or the PDF download does not answer with status 200; 502 when
            either remote request fails at the network level or times out.
    """
    specification = request.spec_id
    if is_doc_indexed(specification):
        return get_full_doc(specification)

    # NOTE(review): verify=False disables TLS certificate verification for both
    # requests below. It is kept to preserve existing behaviour; confirm
    # whether it is still required before relying on the downloaded content.
    try:
        finder_response = requests.post(
            "https://organizedprogrammers-etsidocfinder.hf.space/find",
            verify=False,
            headers={"Content-Type": "application/json"},
            json={"doc_id": specification},
            timeout=timeout,
        )
    except requests.RequestException as err:
        raise HTTPException(502, detail="Document finder unreachable") from err
    if finder_response.status_code != 200:
        raise HTTPException(404, detail="Not found")

    # A 200 answer without a JSON object carrying "url" is treated like a miss
    # instead of surfacing as an unhandled KeyError/ValueError.
    try:
        pdf_url = finder_response.json()["url"]
    except (ValueError, KeyError, TypeError) as err:
        raise HTTPException(404, detail="Not found") from err

    try:
        response = requests.get(
            pdf_url,
            verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"},
            timeout=timeout,
        )
    except requests.RequestException as err:
        raise HTTPException(502, detail="PDF download failed") from err
    # Do not hand an error page to the PDF parser.
    if response.status_code != 200:
        raise HTTPException(404, detail="Not found")

    pdf = fitz.open(stream=response.content, filetype="pdf")
    return pdf, pdf.get_toc()
def _extract_sections(text, titles):
    """Split *text* into sections delimited by the headings in *titles*.

    Args:
        text: Full document text.
        titles: Heading strings expected to occur verbatim in *text*.

    Returns:
        A dict mapping each heading (whitespace runs collapsed to one space)
        to the text between that heading and the next one (or the end of the
        text for the last heading), also whitespace-collapsed.
    """
    # Locate every heading once, then order the headings by their position in
    # the text. The sort is stable, so ties keep the order of *titles*.
    located = sorted(
        ((text.find(title), title) for title in titles),
        key=lambda pair: pair[0],
    )
    # Headings that do not occur in the text cannot delimit anything.
    located = [pair for pair in located if pair[0] >= 0]

    sections = {}
    for index, (start, title) in enumerate(located):
        if index + 1 < len(located):
            end = located[index + 1][0]
        else:
            end = len(text)
        # Drop only the leading heading; later mentions of the same string
        # inside the section body are part of its content.
        body = text[start + len(title):end]
        sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", body.strip())
    return sections


@app.post("/get_spec_content")
def get_spec_content(request: SpecRequest):
    """Return the content of a specification split by numbered section.

    Specifications already present in the loaded dataset are answered from it
    through ``get_structured_doc``. Otherwise the PDF is fetched through
    ``get_pdf_data``, its text is read starting at the page of the first
    table-of-contents entry whose title begins with a numeric character, and
    the text is cut at every such entry that can be found in it.

    Args:
        request: Request whose ``spec_id`` identifies the specification.

    Returns:
        A dict mapping section headings to section text; an empty dict when
        the PDF yields no text or has no table of contents.
    """
    print("\n[INFO] Tentative de récupération du texte", flush=True)

    # get_pdf_data returns a plain string for indexed documents, which cannot
    # be unpacked into (pdf, toc). Serve those from the dataset instead.
    # NOTE(review): presumably the dataset's section keys are equivalent to
    # the headings produced from the PDF path - confirm.
    if is_doc_indexed(request.spec_id):
        return get_structured_doc(request.spec_id)

    pdf, doc_toc = get_pdf_data(request)
    try:
        # Start reading at the page of the first numbered heading so that
        # everything before it is left out.
        first = 0
        for _level, title, page in doc_toc:
            # title[:1] tolerates empty titles.
            if title[:1].isnumeric():
                # Clamp: a page number below 1 would otherwise become a
                # negative index and silently select pages from the end.
                first = max(page - 1, 0)
                break
        text = "\n".join(
            "\n".join(line.strip() for line in page.get_text().splitlines())
            for page in pdf[first:]
        )
    finally:
        # The document is only needed for text extraction; release it.
        pdf.close()

    if not text or not doc_toc:
        print("\n[ERREUR] Pas de texte/table of contents trouvé !", flush=True)
        return {}
    print(f"\n[INFO] Texte {request.spec_id} récupéré", flush=True)

    titles = []
    for _level, title, _page in doc_toc:
        if not title[:1].isnumeric():
            continue
        # The heading is searched with a newline in place of the first space
        # of the TOC title (e.g. "4.1 Scope" -> "4.1\nScope").
        heading = "\n".join(title.strip().split(" ", 1))
        if heading in text:
            titles.append(heading)
    return _extract_sections(text, titles)