import xml.etree.ElementTree as ET

from elasticsearch import Elasticsearch, helpers
import torch
from transformers import CLIPProcessor, CLIPModel
import numpy as np

from server.utils.database import get_db
from server.utils.model import get_clip_model
from server.models.database import DocumentModel
from server.crud.patent_data import insert_data, search_data
from server.controllers.document import document_to_dict

# Load CLIP model globally for reuse
clip_model, clip_processor = get_clip_model()

# Vector size declared for the `embedding` field in the index mapping.
_EMBEDDING_DIMS = 512

# Index definition used when the target index does not exist yet.
_PATENT_INDEX_BODY = {
    "mappings": {
        "properties": {
            "reel_no": {"type": "keyword"},
            "frame_no": {"type": "keyword"},
            "assignors": {"type": "text"},
            "assignees": {"type": "text"},
            "invention_title": {"type": "text"},
            "conveyance_text": {"type": "text"},
            "doc_numbers": {"type": "keyword"},
            "raw_text": {"type": "text"},
            "embedding": {
                "type": "dense_vector",
                "dims": _EMBEDDING_DIMS,
                "index": True,
                "similarity": "cosine",
            },
        }
    }
}


def _get_text(el):
    """Return the stripped text of *el*, or "" if the element or its text is missing."""
    return el.text.strip() if el is not None and el.text else ""


def _joined_names(assignment, path):
    """Return the non-empty ``name`` texts of all elements at *path*, joined by ", "."""
    names = (_get_text(party.find('name')) for party in assignment.findall(path))
    return ", ".join(name for name in names if name)


def _title_and_doc_numbers(assignment):
    """Collect the invention title and every non-empty doc-number of an assignment.

    When several ``patent-property`` elements carry an ``invention-title``,
    the last one encountered wins.
    """
    invention_title = ""
    doc_numbers = []
    for prop in assignment.findall('.//patent-property'):
        title = prop.find('invention-title')
        if title is not None:
            invention_title = _get_text(title)
        for doc in prop.findall('document-id'):
            doc_num = _get_text(doc.find('doc-number'))
            if doc_num:
                doc_numbers.append(doc_num)
    return invention_title, doc_numbers


def _embed_title(title):
    """Return the CLIP text features of *title* as a list of float32 values."""
    inputs = clip_processor(
        text=[title], return_tensors="pt", padding=True, truncation=True
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)[0]
        return features.cpu().numpy().astype(np.float32).tolist()


def insert_data_from_xml(xml_file: str, db=None, index_name="patents"):
    """Parse a patent-assignment XML file and bulk-index its records.

    Every ``patent-assignment`` element that has an ``assignment-record``
    child becomes one document. The invention title, when present, is
    embedded with the module-level CLIP model and stored in ``embedding``.

    Args:
        xml_file: Path of the XML file to parse.
        db: Elasticsearch client to write to. When ``None`` (the default) the
            client is obtained from ``get_db()`` at call time, so importing
            this module no longer creates a client as a side effect.
        index_name: Target index; it is created with the patent mapping if it
            does not exist yet.

    Returns:
        ``None`` after a successful insert (a summary line is printed), or the
        string ``"No patent records found to insert."`` when the file yields
        no records.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_file* is not well-formed XML.
        Errors raised by the Elasticsearch client or ``helpers.bulk``
        propagate unchanged.
    """
    if db is None:
        db = get_db()

    root = ET.parse(xml_file).getroot()

    # Create index if not exists
    if not db.indices.exists(index=index_name):
        db.indices.create(index=index_name, body=_PATENT_INDEX_BODY)

    bulk_data = []
    for pa in root.findall('.//patent-assignment'):
        record = pa.find('assignment-record')
        if record is None:
            continue

        invention_title, doc_numbers = _title_and_doc_numbers(pa)
        embedding = _embed_title(invention_title) if invention_title else None

        document = DocumentModel()
        document.reel_no = _get_text(record.find('reel-no'))
        document.frame_no = _get_text(record.find('frame-no'))
        document.assignors = _joined_names(pa, './/patent-assignor')
        document.assignees = _joined_names(pa, './/patent-assignee')
        document.invention_title = invention_title
        document.conveyance_text = _get_text(record.find('conveyance-text'))
        document.doc_numbers = doc_numbers
        document.raw_text = invention_title
        document.embedding = embedding

        doc = document_to_dict(document)
        if embedding is None:
            # A cosine-similarity dense_vector field rejects zero-magnitude
            # vectors, so an all-zero placeholder would make the bulk request
            # fail. Leave the field out for title-less records instead.
            # NOTE(review): assumes document_to_dict returns a plain dict.
            doc.pop("embedding", None)

        bulk_data.append({"_index": index_name, "_source": doc})

    if not bulk_data:
        return "No patent records found to insert."

    helpers.bulk(db, bulk_data)
    print(f"Inserted {len(bulk_data)} records into index '{index_name}'")