import os
import re
import warnings

warnings.filterwarnings("ignore")  # silence library warnings (e.g. urllib3 InsecureRequestWarning from verify=False)

import fitz  # PyMuPDF
import requests
from datasets import load_dataset
from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

load_dotenv()

app = FastAPI(title="ETSI Specification Splitter API", 
              description="API to split and display specifications by their chapters & sub-chapters",
              docs_url="/")

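# CORS: the API accepts requests from any origin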
origins = [
    "*",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

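# Load the pre-indexed ETSI specification content from the Hugging Face Hub
# (requires an HF_TOKEN environment variable, e.g. provided via the .env file).
# Each row is expected to carry "doc_id", "section" and "content" fields.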
spec_contents = load_dataset("OrganizedProgrammers/ETSISpecContent", token=os.environ["HF_TOKEN"])
spec_contents = spec_contents["train"].to_list()

def is_doc_indexed(spec_id: str):
    """Return True if the specification is already present in the indexed dataset."""
    return any(spec_id == s["doc_id"] for s in spec_contents)

def get_full_doc(spec_id: str):
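    """Return the full text of an indexed specification (all sections concatenated)."""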
    doc = []
    for spec in spec_contents:
        if spec["doc_id"] == spec_id:
            doc.append(f"{spec['section']}\n{spec['content']}")
    return "\n\n".join(doc)

def get_structured_doc(spec_id: str):
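    """Return an indexed specification as a {section title: content} dictionary."""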
    doc = {}
    for spec in spec_contents:
        if spec["doc_id"] == spec_id:
            doc[spec["section"]] = spec["content"]
    return doc


class SpecRequest(BaseModel):
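    """Request body: the identifier of the ETSI specification to retrieve and split."""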
    spec_id: str

def get_pdf_data(request: SpecRequest):
    """Locate the specification PDF, download it and return (PyMuPDF document, table of contents).

    Already-indexed documents are served directly by the endpoint, so this
    function always goes through the download path.
    """
    specification = request.spec_id
    # Resolve the download URL of the specification via the doc-finder service
    find_response = requests.post(
        "https://organizedprogrammers-etsidocfinder.hf.space/find",
        verify=False,
        headers={"Content-Type": "application/json"},
        json={"doc_id": specification}
    )

    if find_response.status_code != 200:
        raise HTTPException(404, detail="Not found")

    url = find_response.json()["url"]
    response = requests.get(
        url,
        verify=False,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"}
    )

    pdf = fitz.open(stream=response.content, filetype="pdf")
    return pdf, pdf.get_toc()

@app.post("/get_spec_content")
def get_spec_content(request: SpecRequest):
    def extract_sections(text, titles):
        sections = {}
        # On trie les titres selon leur position dans le texte
        sorted_titles = sorted(titles, key=lambda t: text.find(t))
        for i, title in enumerate(sorted_titles):
            start = text.find(title)
            if i + 1 < len(sorted_titles):
                end = text.find(sorted_titles[i + 1])
                sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:end].replace(title, "").strip().rstrip())
            else:
                sections[re.sub(r"\s+", " ", title)] = re.sub(r"\s+", " ", text[start:].replace(title, "").strip().rstrip())
        return sections
    print("\n[INFO] Tentative de récupération du texte", flush=True)
    pdf, doc_toc = get_pdf_data(request)
    text = []
    first = 0
    # Start extraction at the first numbered chapter listed in the table of contents
    for _level, title, page in doc_toc:
        if title and title[0].isnumeric():
            first = page - 1
            break
    for page_number in range(first, pdf.page_count):
        text.append("\n".join(line.strip() for line in pdf[page_number].get_text().splitlines()))
    text = "\n".join(text)

    if not text or not doc_toc:
        print("\n[ERROR] No text or table of contents found!")
        return {}
    print(f"\n[INFO] Text of {request.spec_id} retrieved", flush=True)
    titles = []
    for _level, title, page in doc_toc:
        # In the extracted text the clause number and its heading sit on separate lines,
        # so rebuild that form ("<number>\n<heading>") before searching for it
        normalized = "\n".join(title.strip().split(" ", 1))
        if title and title[0].isnumeric() and normalized in text:
            titles.append(normalized)

    return extract_sections(text, titles)
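
# Example usage (assumed local run; the module name and port below are illustrative):
#   uvicorn app:app --host 0.0.0.0 --port 7860
#   curl -X POST http://localhost:7860/get_spec_content \
#        -H "Content-Type: application/json" \
#        -d '{"spec_id": "<specification id>"}'
# The response is a JSON object mapping section titles to their text content.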