---
library_name: transformers
license: cc-by-nc-sa-4.0
pipeline_tag: text-ranking
---
# Contextual AI Reranker v2 1B
## Highlights
Our reranker is on the cost/performance Pareto frontier across 5 key areas:
- Instruction following (including the ability to rank more recent information higher)
- Question answering
- Multilinguality
- Product search / recommendation systems
- Real-world use cases
<p align="center">
  <img src="main_benchmark.png" width="1200"/>
</p>
For more details on these and other benchmarks, please refer to our [blogpost](https://contextual.ai/blog/rerank-v2).
## Overview
- Model Type: Text Reranking
- Supported Languages: 100+
- Number of Parameters: 1B
- Context Length: up to 32K
- Blogpost: https://contextual.ai/blog/rerank-v2
## Quickstart
### vLLM usage
Requires vllm==0.10.0 for NVFP4 or vllm>=0.8.5 for BF16.
```python
import os
os.environ['VLLM_USE_V1'] = '0'  # v1 engine doesn't support logits processors yet
import torch
from vllm import LLM, SamplingParams


def logits_processor(_, scores):
    """Custom logits processor for vLLM reranking.

    The relevance score is the logit of token id 0. Its bfloat16 bit pattern is
    reinterpreted as a uint16 token id, and that token is forced to be the only
    possible output so the score survives single-token generation.
    """
    index = scores[0].view(torch.uint16)
    scores = torch.full_like(scores, float("-inf"))
    scores[index] = 1
    return scores


def format_prompts(query: str, instruction: str, documents: list[str]) -> list[str]:
    """Format query and documents into prompts for reranking."""
    if instruction:
        instruction = f" {instruction}"
    prompts = []
    for doc in documents:
        prompt = f"Check whether a given document contains information helpful to answer the query.\n<Document> {doc}\n<Query> {query}{instruction} ??"
        prompts.append(prompt)
    return prompts


def infer_w_vllm(model_path: str, query: str, instruction: str, documents: list[str]):
    model = LLM(
        model=model_path,
        gpu_memory_utilization=0.85,
        max_model_len=8192,
        dtype="bfloat16",
        max_logprobs=2,
        max_num_batched_tokens=262144,
    )
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=1,
        logits_processors=[logits_processor],
    )

    prompts = format_prompts(query, instruction, documents)
    outputs = model.generate(prompts, sampling_params, use_tqdm=False)

    # Recover each score by reinterpreting the generated token id as bfloat16 bits
    results = []
    for i, output in enumerate(outputs):
        score = (
            torch.tensor([output.outputs[0].token_ids[0]], dtype=torch.uint16)
            .view(torch.bfloat16)
            .item()
        )
        results.append((score, i, documents[i]))

    # Sort by score (descending)
    results = sorted(results, key=lambda x: x[0], reverse=True)

    print(f"Query: {query}")
    print(f"Instruction: {instruction}")
    for score, doc_id, doc in results:
        print(f"Score: {score:.4f} | Doc: {doc}")
```
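A minimal usage sketch for the function above; the model path, query, instruction, and documents below are illustrative placeholders, not values from this repository:
```python
# Illustrative only: replace the placeholder path with your local checkpoint or Hub repo id.
if __name__ == "__main__":
    query = "What is the capital of France?"
    instruction = "Prefer the most recent and authoritative information."
    documents = [
        "Paris is the capital and largest city of France.",
        "Berlin is the capital of Germany.",
        "France is well known for its wine and cheese.",
    ]
    infer_w_vllm("<path-or-hub-id-of-this-model>", query, instruction, documents)
```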
### Transformers Usage
Requires transformers>=4.51.0 for BF16. The NVFP4 variant is not supported with Transformers.
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
def format_prompts(query: str, instruction: str, documents: list[str]) -> list[str]:
    """Format query and documents into prompts for reranking."""
    if instruction:
        instruction = f" {instruction}"
    prompts = []
    for doc in documents:
        prompt = f"Check whether a given document contains information helpful to answer the query.\n<Document> {doc}\n<Query> {query}{instruction} ??"
        prompts.append(prompt)
    return prompts


def infer_w_hf(model_path: str, query: str, instruction: str, documents: list[str]):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # so -1 is the real last token for all prompts

    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype).to(device)
    model.eval()

    prompts = format_prompts(query, instruction, documents)
    enc = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    with torch.no_grad():
        out = model(input_ids=input_ids, attention_mask=attention_mask)
        next_logits = out.logits[:, -1, :]  # [batch, vocab] logits at the last position
        # The relevance score is the logit assigned to token id 0
        scores_bf16 = next_logits[:, 0].to(torch.bfloat16)
        scores = scores_bf16.float().tolist()

    # Sort by score (descending)
    results = sorted(
        [(s, i, documents[i]) for i, s in enumerate(scores)],
        key=lambda x: x[0],
        reverse=True,
    )

    print(f"Query: {query}")
    print(f"Instruction: {instruction}")
    for score, doc_id, doc in results:
        print(f"Score: {score:.4f} | Doc: {doc}")
```
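The same placeholder-style call works for the Transformers path:
```python
# Illustrative only: replace the placeholder path with your local checkpoint or Hub repo id.
if __name__ == "__main__":
    query = "What is the capital of France?"
    instruction = "Prefer the most recent and authoritative information."
    documents = [
        "Paris is the capital and largest city of France.",
        "Berlin is the capital of Germany.",
        "France is well known for its wine and cheese.",
    ]
    infer_w_hf("<path-or-hub-id-of-this-model>", query, instruction, documents)
```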
## Citation
If you use this model, please cite:
```bibtex
@misc{ctxl_rerank_v2_instruct_multilingual,
title={Contextual AI Reranker v2},
author={George Halal and Sheshansh Agrawal},
year={2025},
url={https://contextual.ai/blog/rerank-v2},
}
```
## License
Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (cc-by-nc-sa-4.0)
## Contact
For questions or issues, please open an issue on the model repository or contact george@contextual.ai.