Spaces:

darpanaswal
/

Patent_Retrieval

Configuration error

App Files Files Community

darpanaswal committed on Apr 10

Commit

b4bffe5

verified ·

1 Parent(s): 29de29c

Update cross_encoder_reranking_train.py

Browse files

Files changed (1) hide show

cross_encoder_reranking_train.py +52 -140

cross_encoder_reranking_train.py CHANGED Viewed

@@ -13,8 +13,8 @@ from sklearn.metrics.pairwise import cosine_similarity
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 # Load embedder once
-embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2").to(device)
 def embed_text_list(texts):
     return embedder.encode(texts, convert_to_tensor=False, device=device)
@@ -61,28 +61,6 @@ def process_single_patent(patent_dict):
         "features": rank_by_centrality(top_features),
     }
-def refined_process_single_patent(patent_dict, top_n=10):
-    abstract = patent_dict.get("pa01", "")
-    title = patent_dict.get("title", "")
-    context = f"{title} {abstract}"
-    context_emb = embed_text_list([context])[0]
-    claims = [v for k, v in patent_dict.items() if k.startswith("c-en")]
-    paragraphs = [v for k, v in patent_dict.items() if k.startswith("p")]
-    features = [v for k, v in patent_dict.get("features", {}).items()]
-    def semantic_rank(items, context_emb):
-        embeddings = embed_text_list(items)
-        scores = cosine_similarity([context_emb], embeddings)[0]
-        ranked_items = [item for item, _ in sorted(zip(items, scores), key=lambda x: x[1], reverse=True)]
-        return ranked_items
-    return {
-        "claims": semantic_rank(claims, context_emb)[:top_n],
-        "paragraphs": semantic_rank(paragraphs, context_emb)[:top_n],
-        "features": semantic_rank(features, context_emb)[:top_n],
-    }
 def load_json_file(file_path):
     """Load JSON data from a file"""
     with open(file_path, 'r') as f:
@@ -174,22 +152,6 @@ def extract_text(content_dict, text_type="full"):
         return " ".join(all_text)
-    elif text_type == "smart2":
-        filtered_dict = refined_process_single_patent(content_dict)
-        all_text = []
-        # Context with title and abstract
-        if "title" in content_dict:
-            all_text.append(content_dict["title"])
-        if "pa01" in content_dict:
-            all_text.append(content_dict["pa01"])
-        # Add claims, paragraphs, and features
-        all_text.extend(filtered_dict["claims"])
-        all_text.extend(filtered_dict["paragraphs"])
-        all_text.extend(filtered_dict["features"])
-        return " ".join(all_text)
     return ""
@@ -203,118 +165,67 @@ def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tenso
         batch_size = last_hidden_states.shape[0]
         return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
-# def get_detailed_instruct(task_description: str, query: str) -> str:
-#     """Create an instruction-formatted query"""
-#     return f'Instruct: {task_description}\nQuery: {query}'
 def get_detailed_instruct(task_description: str, query: str) -> str:
-    return (
-        f"Instruct: Evaluate the semantic and technical similarity between two patent documents."
-        f" Prioritize highly similar claims, technical implementations, and shared functionalities."
-        f"\nQuery: {query}"
-    )
-def hybrid_score(cross_encoder_score, semantic_score, weight_cross=0.8, weight_semantic=0.2):
-    return (weight_cross * cross_encoder_score) + (weight_semantic * semantic_score)
-# def cross_encoder_reranking(query_text, doc_texts, model, tokenizer, batch_size=8, max_length=2048):
-#     """
-#     Rerank document texts based on query text using cross-encoder model
-#     Parameters:
-#     query_text (str): The query text
-#     doc_texts (list): List of document texts
-#     model: The cross-encoder model
-#     tokenizer: The tokenizer for the model
-#     batch_size (int): Batch size for processing
-#     max_length (int): Maximum sequence length
-#     Returns:
-#     list: Indices of documents sorted by relevance score (descending)
-#     """
-#     device = next(model.parameters()).device
-#     scores = []
-#     # Format query with instruction
-#     task_description = 'Re-rank a set of retrieved patents based on their relevance to a given query patent. The task aims to refine the order of patents by evaluating their semantic similarity to the query patent, ensuring that the most relevant patents appear at the top of the list.'
-#     instructed_query = get_detailed_instruct(task_description, query_text)
-#     # Process in batches to avoid OOM
-#     for i in tqdm(range(0, len(doc_texts), batch_size), desc="Scoring documents", leave=False):
-#         batch_docs = doc_texts[i:i+batch_size]
-#         # Prepare input pairs for the batch
-#         input_texts = [instructed_query] + batch_docs
-#         # Tokenize
-#         with torch.no_grad():
-#             batch_dict = tokenizer(input_texts, max_length=max_length, padding=True,
-#                                   truncation=True, return_tensors='pt').to(device)
-#             # Get embeddings
-#             outputs = model(**batch_dict)
-#             embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
-#             # Normalize embeddings
-#             embeddings = F.normalize(embeddings, p=2, dim=1)
-#             # Calculate similarity scores between query and documents
-#             batch_scores = (embeddings[0].unsqueeze(0) @ embeddings[1:].T).squeeze(0) * 100
-#             scores.extend(batch_scores.cpu().tolist())
-#     # Create list of (index, score) tuples for sorting
-#     indexed_scores = list(enumerate(scores))
-#     # Sort by score in descending order
-#     indexed_scores.sort(key=lambda x: x[1], reverse=True)
-#     # Return sorted indices
-#     return [idx for idx, _ in indexed_scores]
-def cross_encoder_reranking(query_text, doc_texts, model, tokenizer, batch_size=64, max_length=2048):
-    device = next(model.parameters()).device
-    cross_scores = []
-    query_emb = embed_text_list([query_text])[0]  # Move embedder to CPU
-    instructed_query = get_detailed_instruct("", query_text)
-    # Pre-create all input pairs (concatenation-based cross-encoder setup)
-    input_texts = [f"{instructed_query} {doc}" for doc in doc_texts]
-    for i in tqdm(range(0, len(input_texts), batch_size), desc="Scoring documents", leave=False):
-        batch_input_texts = input_texts[i:i+batch_size]
-        with torch.no_grad():
-            batch_dict = tokenizer(batch_input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt').to(device)
-            # Mixed precision for faster inference and lower memory
-            with torch.cuda.amp.autocast():
-                outputs = model(**batch_dict)
-                embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
-                embeddings = F.normalize(embeddings, p=2, dim=1)
-                # Since queries are repeated in each pair, compare to instructed query embedding (first one)
-                query_vector = embeddings[0].unsqueeze(0)  # Use first as query
-                batch_cross_scores = (query_vector @ embeddings.T).squeeze(0).cpu().numpy()[1:]  # Exclude self-comparison
-                cross_scores.extend(batch_cross_scores)
-    # Semantic scores
-    doc_embeddings = embed_text_list(doc_texts)
-    semantic_scores = cosine_similarity([query_emb], doc_embeddings)[0]
-    # Hybrid scores
-    hybrid_scores = [hybrid_score(c, s) for c, s in zip(cross_scores, semantic_scores)]
-    indexed_scores = list(enumerate(hybrid_scores))
     indexed_scores.sort(key=lambda x: x[1], reverse=True)
     return [idx for idx, _ in indexed_scores]
 def main():
     base_directory = os.getcwd()
-    base_directory += "/Patent_Retrieval"
     parser = argparse.ArgumentParser(description='Re-rank patents using cross-encoder scoring (training queries only)')
     parser.add_argument('--pre_ranking', type=str, default='shuffled_pre_ranking.json',
                         help='Path to pre-ranking JSON file')
@@ -326,11 +237,12 @@ def main():
     parser.add_argument('--documents_content', type=str,
                         default='./documents_content_with_features.json',
                         help='Path to documents content JSON file')
-    # Change here from train to test
     parser.add_argument('--queries_list', type=str, default='test_queries.json',
                         help='Path to training queries JSON file')
     parser.add_argument('--text_type', type=str, default='TA',
-                        choices=['TA', 'claims', 'description', 'full', 'tac1', 'smart', 'smart2'],
                         help='Type of text to use for scoring')
     parser.add_argument('--model_name', type=str, default='intfloat/e5-large-v2',
                         help='Name of the cross-encoder model')
@@ -341,7 +253,7 @@ def main():
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                         help='Device to use (cuda/cpu)')
     parser.add_argument('--base_dir', type=str,
-                        default=f'{base_directory}/datasets',
                         help='Base directory for data files')
     args = parser.parse_args()
@@ -460,4 +372,4 @@ def main():
         print(f"Information about missing FANs saved to {missing_info_path}")
 if __name__ == "__main__":
-    main()

 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 # Load embedder once
+embedder = SentenceTransformer("all-MiniLM-L6-v2")
+embedder = embedder.to(device)
 def embed_text_list(texts):
     return embedder.encode(texts, convert_to_tensor=False, device=device)
         "features": rank_by_centrality(top_features),
     }
 def load_json_file(file_path):
     """Load JSON data from a file"""
     with open(file_path, 'r') as f:
         return " ".join(all_text)
     return ""
         batch_size = last_hidden_states.shape[0]
         return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
 def get_detailed_instruct(task_description: str, query: str) -> str:
+    """Return 'Instruct: {task_description}', a newline, then 'Query: {query}' as a single string"""
+    return f'Instruct: {task_description}\nQuery: {query}'
+def cross_encoder_reranking(query_text, doc_texts, model, tokenizer, batch_size=8, max_length=2048):
+    """
+    Rank doc_texts by embedding similarity to an instruction-prefixed query_text
+    Parameters:
+    query_text (str): The query text; wrapped by get_detailed_instruct before encoding
+    doc_texts (list): List of document texts; returned indices are positions in this list
+    model: Model called as model(**batch_dict); its output must expose last_hidden_state
+    tokenizer: The tokenizer for the model
+    batch_size (int): Number of documents per forward pass (the query is added to each batch)
+    max_length (int): Maximum sequence length passed to the tokenizer (truncation is enabled)
+    Returns:
+    list: Indices of documents sorted by relevance score (descending)
+    """
+    device = next(model.parameters()).device  # inputs are moved to the device of the model's first parameter
+    scores = []
+    # Prefix the query with the task instruction; documents are passed as raw text
+    task_description = 'Re-rank a set of retrieved patents based on their relevance to a given query patent. The task aims to refine the order of patents by evaluating their semantic similarity to the query patent, ensuring that the most relevant patents appear at the top of the list.'
+    instructed_query = get_detailed_instruct(task_description, query_text)
+    # Process batch_size documents per forward pass to avoid OOM
+    for i in tqdm(range(0, len(doc_texts), batch_size), desc="Scoring documents", leave=False):
+        batch_docs = doc_texts[i:i+batch_size]
+        # Query goes first (index 0), so it is re-encoded once per batch alongside the documents
+        input_texts = [instructed_query] + batch_docs
+        # Tokenize and embed without tracking gradients (inference only)
+        with torch.no_grad():
+            batch_dict = tokenizer(input_texts, max_length=max_length, padding=True,
+                                  truncation=True, return_tensors='pt').to(device)
+            # One pooled vector per input text via last_token_pool
+            outputs = model(**batch_dict)
+            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
+            # L2-normalize rows so the dot product below is a cosine similarity
+            embeddings = F.normalize(embeddings, p=2, dim=1)
+            # NOTE(review): query row 0 vs. document rows 1..N, scaled by 100; texts are embedded separately, not as joint query-document pairs, despite the function name
+            batch_scores = (embeddings[0].unsqueeze(0) @ embeddings[1:].T).squeeze(0) * 100
+            scores.extend(batch_scores.cpu().tolist())
+    # Pair each score with its position in doc_texts
+    indexed_scores = list(enumerate(scores))
+    # Highest score first; list.sort is stable, so ties keep their doc_texts order
     indexed_scores.sort(key=lambda x: x[1], reverse=True)
+    # Return only the document indices, best match first
     return [idx for idx, _ in indexed_scores]
 def main():
     base_directory = os.getcwd()
     parser = argparse.ArgumentParser(description='Re-rank patents using cross-encoder scoring (training queries only)')
     parser.add_argument('--pre_ranking', type=str, default='shuffled_pre_ranking.json',
                         help='Path to pre-ranking JSON file')
     parser.add_argument('--documents_content', type=str,
                         default='./documents_content_with_features.json',
                         help='Path to documents content JSON file')
+    # Change here for test or train
     parser.add_argument('--queries_list', type=str, default='test_queries.json',
                         help='Path to training queries JSON file')
     parser.add_argument('--text_type', type=str, default='TA',
+                        choices=['TA', 'claims', 'description', 'full', 'tac1', 'smart'],
                         help='Type of text to use for scoring')
     parser.add_argument('--model_name', type=str, default='intfloat/e5-large-v2',
                         help='Name of the cross-encoder model')
     parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu',
                         help='Device to use (cuda/cpu)')
     parser.add_argument('--base_dir', type=str,
+                        default=f'{base_directory}/Patent_Retrieval/datasets',
                         help='Base directory for data files')
     args = parser.parse_args()
         print(f"Information about missing FANs saved to {missing_info_path}")
 if __name__ == "__main__":
+    main()