File size: 3,745 Bytes
d17ca98 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
import xml.etree.ElementTree as ET
from elasticsearch import Elasticsearch, helpers
import torch
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from server.utils.database import get_db
from server.utils.model import get_clip_model
from server.models.database import DocumentModel
from server.crud.patent_data import insert_data,search_data
from server.controllers.document import document_to_dict
# Load the CLIP model and its processor once, at import time, so that every
# call in this module reuses the same pair instead of fetching it again.
clip_model, clip_processor = get_clip_model()
# Length of the title embedding; must agree with "dims" in the index mapping.
_EMBEDDING_DIMS = 512


def _element_text(element):
    """Return the stripped text of *element*, or "" if it is missing or empty."""
    if element is None or not element.text:
        return ""
    return element.text.strip()


def _joined_names(parent, path):
    """Comma-join the non-empty <name> texts of every *path* match under *parent*."""
    names = (_element_text(node.find('name')) for node in parent.findall(path))
    return ", ".join(name for name in names if name)


def _ensure_index(db, index_name):
    """Create *index_name* with the patent mapping unless it already exists."""
    if db.indices.exists(index=index_name):
        return
    db.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "reel_no": {"type": "keyword"},
                "frame_no": {"type": "keyword"},
                "assignors": {"type": "text"},
                "assignees": {"type": "text"},
                "invention_title": {"type": "text"},
                "conveyance_text": {"type": "text"},
                "doc_numbers": {"type": "keyword"},
                "raw_text": {"type": "text"},
                "embedding": {
                    "type": "dense_vector",
                    "dims": _EMBEDDING_DIMS,
                    "index": True,
                    "similarity": "cosine",
                },
            }
        }
    })


def _embed_title(title):
    """Return the CLIP text features of *title* as a plain list of float32 values."""
    inputs = clip_processor(text=[title], return_tensors="pt", padding=True, truncation=True)
    # Inference only: no gradients are needed for the text features.
    with torch.no_grad():
        return clip_model.get_text_features(**inputs)[0].cpu().numpy().astype(np.float32).tolist()


def insert_data_from_xml(xml_file: str, db=None, index_name="patents"):
    """Parse a patent-assignment XML file and bulk-index its records.

    Every ``patent-assignment`` element that has an ``assignment-record``
    child becomes one document holding the reel/frame numbers, conveyance
    text, comma-joined assignor and assignee names, the invention title, all
    document numbers and a CLIP text embedding of the title. The target
    index is created with the expected mapping if it does not exist yet.

    Args:
        xml_file: Path of the XML file (anything ``ET.parse`` accepts).
        db: Client used for the index and bulk calls. When omitted, the
            client is obtained from ``get_db()`` at call time rather than
            once at import time.
        index_name: Name of the index to write to.

    Returns:
        ``None`` after a successful bulk insert (a summary line is printed),
        or the string ``"No patent records found to insert."`` when the file
        yields no records.

    Raises:
        Whatever ``ET.parse`` raises for an unreadable or malformed file;
        errors from the index/bulk calls propagate unchanged.
    """
    if db is None:
        db = get_db()

    root = ET.parse(xml_file).getroot()
    _ensure_index(db, index_name)

    bulk_data = []
    for assignment in root.findall('.//patent-assignment'):
        record = assignment.find('assignment-record')
        if record is None:
            continue

        # When several properties carry a title, the last one wins.
        invention_title = ""
        doc_numbers = []
        for prop in assignment.findall('.//patent-property'):
            title = prop.find('invention-title')
            if title is not None:
                invention_title = _element_text(title)
            for doc_id in prop.findall('document-id'):
                doc_num = _element_text(doc_id.find('doc-number'))
                if doc_num:
                    doc_numbers.append(doc_num)

        has_title = bool(invention_title)

        document = DocumentModel()
        document.reel_no = _element_text(record.find('reel-no'))
        document.frame_no = _element_text(record.find('frame-no'))
        document.assignors = _joined_names(assignment, './/patent-assignor')
        document.assignees = _joined_names(assignment, './/patent-assignee')
        document.invention_title = invention_title
        document.conveyance_text = _element_text(record.find('conveyance-text'))
        document.doc_numbers = doc_numbers
        document.raw_text = invention_title
        # The model still receives a placeholder vector when there is no
        # title, so its attributes are populated exactly as before.
        document.embedding = (
            _embed_title(invention_title) if has_title else [0.0] * _EMBEDDING_DIMS
        )

        doc = document_to_dict(document)
        if not has_title and isinstance(doc, dict):
            # A cosine-similarity dense_vector field cannot hold a
            # zero-magnitude vector, so indexing the placeholder would make
            # the bulk request fail. Leave the field out instead.
            # NOTE(review): assumes document_to_dict keeps the "embedding"
            # key name - confirm.
            doc.pop("embedding", None)

        bulk_data.append({"_index": index_name, "_source": doc})

    if not bulk_data:
        return "No patent records found to insert."

    helpers.bulk(db, bulk_data)
    print(f"Inserted {len(bulk_data)} records into index '{index_name}'")
|