---
base_model: BXresearch/DeBERTa2-0.9B-ST-v2
datasets:
  - sentence-transformers/stsb
language:
  - en
library_name: sentence-transformers
metrics:
  - pearson_cosine
  - spearman_cosine
  - pearson_manhattan
  - spearman_manhattan
  - pearson_euclidean
  - spearman_euclidean
  - pearson_dot
  - spearman_dot
  - pearson_max
  - spearman_max
  - cosine_accuracy
  - cosine_accuracy_threshold
  - cosine_f1
  - cosine_f1_threshold
  - cosine_precision
  - cosine_recall
  - cosine_ap
  - dot_accuracy
  - dot_accuracy_threshold
  - dot_f1
  - dot_f1_threshold
  - dot_precision
  - dot_recall
  - dot_ap
  - manhattan_accuracy
  - manhattan_accuracy_threshold
  - manhattan_f1
  - manhattan_f1_threshold
  - manhattan_precision
  - manhattan_recall
  - manhattan_ap
  - euclidean_accuracy
  - euclidean_accuracy_threshold
  - euclidean_f1
  - euclidean_f1_threshold
  - euclidean_precision
  - euclidean_recall
  - euclidean_ap
  - max_accuracy
  - max_accuracy_threshold
  - max_f1
  - max_f1_threshold
  - max_precision
  - max_recall
  - max_ap
pipeline_tag: sentence-similarity
tags:
  - sentence-transformers
  - sentence-similarity
  - feature-extraction
  - generated_from_trainer
  - dataset_size:5749
  - loss:AnglELoss
widget:
  - source_sentence: Left side of a silver train engine.
    sentences:
      - A close-up of a black train engine.
      - Two boys are in midair jumping into an inground pool.
      - An older Asian couple poses with a newborn baby at the dinner table.
  - source_sentence: Four girls in swimsuits are playing volleyball at the beach.
    sentences:
      - A little girl is walking down a hallway.
      - The man is erasing the chalk board.
      - Four women in bikinis are playing volleyball on the beach.
  - source_sentence: A woman is cooking meat.
    sentences:
      - The dogs are alone in the forest.
      - A man is speaking.
      - A dog jumps through a hoop.
  - source_sentence: A person is folding a square paper piece.
    sentences:
      - A woman is carrying her baby.
      - A person folds a piece of paper.
      - A dog is trying to get through his dog door.
  - source_sentence: The boy is playing the piano.
    sentences:
      - The woman is pouring oil into the pan.
      - A small black and white dog is swimming in water.
      - Two brown dogs are playing with each other in the snow.
model-index:
  - name: SentenceTransformer based on BXresearch/DeBERTa2-0.9B-ST-v2
    results:
      - task:
          type: semantic-similarity
          name: Semantic Similarity
        dataset:
          name: sts test
          type: sts-test
        metrics:
          - type: pearson_cosine
            value: 0.9223420070013995
            name: Pearson Cosine
          - type: spearman_cosine
            value: 0.9291243257027669
            name: Spearman Cosine
          - type: pearson_manhattan
            value: 0.9346373512805987
            name: Pearson Manhattan
          - type: spearman_manhattan
            value: 0.9291489836472425
            name: Spearman Manhattan
          - type: pearson_euclidean
            value: 0.9354223786017909
            name: Pearson Euclidean
          - type: spearman_euclidean
            value: 0.9300019874215577
            name: Spearman Euclidean
          - type: pearson_dot
            value: 0.9106971189943253
            name: Pearson Dot
          - type: spearman_dot
            value: 0.9082045435102475
            name: Spearman Dot
          - type: pearson_max
            value: 0.9354223786017909
            name: Pearson Max
          - type: spearman_max
            value: 0.9300019874215577
            name: Spearman Max
      - task:
          type: binary-classification
          name: Binary Classification
        dataset:
          name: allNLI dev
          type: allNLI-dev
        metrics:
          - type: cosine_accuracy
            value: 0.75
            name: Cosine Accuracy
          - type: cosine_accuracy_threshold
            value: 0.7616457939147949
            name: Cosine Accuracy Threshold
          - type: cosine_f1
            value: 0.66
            name: Cosine F1
          - type: cosine_f1_threshold
            value: 0.636581540107727
            name: Cosine F1 Threshold
          - type: cosine_precision
            value: 0.5546218487394958
            name: Cosine Precision
          - type: cosine_recall
            value: 0.8148148148148148
            name: Cosine Recall
          - type: cosine_ap
            value: 0.6243778842583771
            name: Cosine Ap
          - type: dot_accuracy
            value: 0.75390625
            name: Dot Accuracy
          - type: dot_accuracy_threshold
            value: 679.4810791015625
            name: Dot Accuracy Threshold
          - type: dot_f1
            value: 0.6534653465346534
            name: Dot F1
          - type: dot_f1_threshold
            value: 598.3258056640625
            name: Dot F1 Threshold
          - type: dot_precision
            value: 0.5454545454545454
            name: Dot Precision
          - type: dot_recall
            value: 0.8148148148148148
            name: Dot Recall
          - type: dot_ap
            value: 0.6187309376038581
            name: Dot Ap
          - type: manhattan_accuracy
            value: 0.75390625
            name: Manhattan Accuracy
          - type: manhattan_accuracy_threshold
            value: 729.032470703125
            name: Manhattan Accuracy Threshold
          - type: manhattan_f1
            value: 0.6470588235294118
            name: Manhattan F1
          - type: manhattan_f1_threshold
            value: 838.39892578125
            name: Manhattan F1 Threshold
          - type: manhattan_precision
            value: 0.5365853658536586
            name: Manhattan Precision
          - type: manhattan_recall
            value: 0.8148148148148148
            name: Manhattan Recall
          - type: manhattan_ap
            value: 0.6217733494040824
            name: Manhattan Ap
          - type: euclidean_accuracy
            value: 0.75390625
            name: Euclidean Accuracy
          - type: euclidean_accuracy_threshold
            value: 23.002826690673828
            name: Euclidean Accuracy Threshold
          - type: euclidean_f1
            value: 0.6567164179104479
            name: Euclidean F1
          - type: euclidean_f1_threshold
            value: 26.765533447265625
            name: Euclidean F1 Threshold
          - type: euclidean_precision
            value: 0.55
            name: Euclidean Precision
          - type: euclidean_recall
            value: 0.8148148148148148
            name: Euclidean Recall
          - type: euclidean_ap
            value: 0.6216881687074047
            name: Euclidean Ap
          - type: max_accuracy
            value: 0.75390625
            name: Max Accuracy
          - type: max_accuracy_threshold
            value: 729.032470703125
            name: Max Accuracy Threshold
          - type: max_f1
            value: 0.66
            name: Max F1
          - type: max_f1_threshold
            value: 838.39892578125
            name: Max F1 Threshold
          - type: max_precision
            value: 0.5546218487394958
            name: Max Precision
          - type: max_recall
            value: 0.8148148148148148
            name: Max Recall
          - type: max_ap
            value: 0.6243778842583771
            name: Max Ap
      - task:
          type: binary-classification
          name: Binary Classification
        dataset:
          name: Qnli dev
          type: Qnli-dev
        metrics:
          - type: cosine_accuracy
            value: 0.73828125
            name: Cosine Accuracy
          - type: cosine_accuracy_threshold
            value: 0.6307685375213623
            name: Cosine Accuracy Threshold
          - type: cosine_f1
            value: 0.7357142857142857
            name: Cosine F1
          - type: cosine_f1_threshold
            value: 0.5677690505981445
            name: Cosine F1 Threshold
          - type: cosine_precision
            value: 0.6560509554140127
            name: Cosine Precision
          - type: cosine_recall
            value: 0.8373983739837398
            name: Cosine Recall
          - type: cosine_ap
            value: 0.7842286974902155
            name: Cosine Ap
          - type: dot_accuracy
            value: 0.7109375
            name: Dot Accuracy
          - type: dot_accuracy_threshold
            value: 541.8418579101562
            name: Dot Accuracy Threshold
          - type: dot_f1
            value: 0.7153846153846153
            name: Dot F1
          - type: dot_f1_threshold
            value: 538.5023193359375
            name: Dot F1 Threshold
          - type: dot_precision
            value: 0.6788321167883211
            name: Dot Precision
          - type: dot_recall
            value: 0.7560975609756098
            name: Dot Recall
          - type: dot_ap
            value: 0.749860948872692
            name: Dot Ap
          - type: manhattan_accuracy
            value: 0.74609375
            name: Manhattan Accuracy
          - type: manhattan_accuracy_threshold
            value: 787.5203247070312
            name: Manhattan Accuracy Threshold
          - type: manhattan_f1
            value: 0.728
            name: Manhattan F1
          - type: manhattan_f1_threshold
            value: 831.8275146484375
            name: Manhattan F1 Threshold
          - type: manhattan_precision
            value: 0.7165354330708661
            name: Manhattan Precision
          - type: manhattan_recall
            value: 0.7398373983739838
            name: Manhattan Recall
          - type: manhattan_ap
            value: 0.7942379057804293
            name: Manhattan Ap
          - type: euclidean_accuracy
            value: 0.75
            name: Euclidean Accuracy
          - type: euclidean_accuracy_threshold
            value: 25.221097946166992
            name: Euclidean Accuracy Threshold
          - type: euclidean_f1
            value: 0.7292418772563176
            name: Euclidean F1
          - type: euclidean_f1_threshold
            value: 28.07604217529297
            name: Euclidean F1 Threshold
          - type: euclidean_precision
            value: 0.6558441558441559
            name: Euclidean Precision
          - type: euclidean_recall
            value: 0.8211382113821138
            name: Euclidean Recall
          - type: euclidean_ap
            value: 0.7942309913520247
            name: Euclidean Ap
          - type: max_accuracy
            value: 0.75
            name: Max Accuracy
          - type: max_accuracy_threshold
            value: 787.5203247070312
            name: Max Accuracy Threshold
          - type: max_f1
            value: 0.7357142857142857
            name: Max F1
          - type: max_f1_threshold
            value: 831.8275146484375
            name: Max F1 Threshold
          - type: max_precision
            value: 0.7165354330708661
            name: Max Precision
          - type: max_recall
            value: 0.8373983739837398
            name: Max Recall
          - type: max_ap
            value: 0.7942379057804293
            name: Max Ap
---

SentenceTransformer based on BXresearch/DeBERTa2-0.9B-ST-v2

This is a sentence-transformers model finetuned from BXresearch/DeBERTa2-0.9B-ST-v2 on the sentence-transformers/stsb dataset. It maps sentences & paragraphs to a 1536-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.

Model Details

Model Description

  • Model Type: Sentence Transformer
  • Base model: BXresearch/DeBERTa2-0.9B-ST-v2
  • Maximum Sequence Length: 512 tokens
  • Output Dimensionality: 1536 dimensions
  • Training Dataset: sentence-transformers/stsb
  • Language: en

Model Sources

  • Documentation: https://www.sbert.net
  • Repository: https://github.com/UKPLab/sentence-transformers
  • Hugging Face: https://huggingface.co/models?library=sentence-transformers

Full Model Architecture

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DebertaV2Model 
  (1): Pooling({'word_embedding_dimension': 1536, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
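
The Pooling module above applies mean pooling over the transformer's token embeddings (pooling_mode_mean_tokens: True). As a rough illustration, not part of the original card, the same embedding can be computed with plain transformers, assuming the checkpoint loads directly through AutoTokenizer and AutoModel:

import torch
from transformers import AutoTokenizer, AutoModel

def mean_pooling(token_embeddings, attention_mask):
    # Average the token embeddings, ignoring padding positions.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

tokenizer = AutoTokenizer.from_pretrained("bobox/DeBERTa2-0.9B-ST-stsb-checkpoints-tmp")
model = AutoModel.from_pretrained("bobox/DeBERTa2-0.9B-ST-stsb-checkpoints-tmp")

sentences = ["A man is playing a flute.", "A man is playing a large flute."]
encoded = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    token_embeddings = model(**encoded).last_hidden_state
embeddings = mean_pooling(token_embeddings, encoded["attention_mask"])
print(embeddings.shape)  # expected: torch.Size([2, 1536])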

Usage

Direct Usage (Sentence Transformers)

First install the Sentence Transformers library:

pip install -U sentence-transformers

Then you can load this model and run inference.

from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("bobox/DeBERTa2-0.9B-ST-stsb-checkpoints-tmp")
# Run inference
sentences = [
    'The boy is playing the piano.',
    'The woman is pouring oil into the pan.',
    'A small black and white dog is swimming in water.',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 1536]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
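
Beyond pairwise similarity, the same embeddings can be used for semantic search. A small illustrative sketch; the query and corpus below are made-up examples, not from the card:

from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("bobox/DeBERTa2-0.9B-ST-stsb-checkpoints-tmp")

query = "A child is playing an instrument."
corpus = [
    "The boy is playing the piano.",
    "The woman is pouring oil into the pan.",
    "A small black and white dog is swimming in water.",
]

# Encode, then rank the corpus against the query by cosine similarity.
query_embedding = model.encode(query, convert_to_tensor=True)
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=3)[0]
for hit in hits:
    print(f"{hit['score']:.3f}  {corpus[hit['corpus_id']]}")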

Evaluation

Metrics

Semantic Similarity (sts-test)

Metric Value
pearson_cosine 0.9223
spearman_cosine 0.9291
pearson_manhattan 0.9346
spearman_manhattan 0.9291
pearson_euclidean 0.9354
spearman_euclidean 0.93
pearson_dot 0.9107
spearman_dot 0.9082
pearson_max 0.9354
spearman_max 0.93
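
These sts-test figures match the metric set produced by the EmbeddingSimilarityEvaluator in Sentence Transformers. A hedged sketch of how they can be recomputed, assuming the standard test split of sentence-transformers/stsb:

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer("bobox/DeBERTa2-0.9B-ST-stsb-checkpoints-tmp")
sts_test = load_dataset("sentence-transformers/stsb", split="test")

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=sts_test["sentence1"],
    sentences2=sts_test["sentence2"],
    scores=sts_test["score"],
    name="sts-test",
)
print(evaluator(model))  # Pearson/Spearman correlations for cosine, dot, Manhattan and Euclidean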

Binary Classification (allNLI-dev)

Metric Value
cosine_accuracy 0.75
cosine_accuracy_threshold 0.7616
cosine_f1 0.66
cosine_f1_threshold 0.6366
cosine_precision 0.5546
cosine_recall 0.8148
cosine_ap 0.6244
dot_accuracy 0.7539
dot_accuracy_threshold 679.4811
dot_f1 0.6535
dot_f1_threshold 598.3258
dot_precision 0.5455
dot_recall 0.8148
dot_ap 0.6187
manhattan_accuracy 0.7539
manhattan_accuracy_threshold 729.0325
manhattan_f1 0.6471
manhattan_f1_threshold 838.3989
manhattan_precision 0.5366
manhattan_recall 0.8148
manhattan_ap 0.6218
euclidean_accuracy 0.7539
euclidean_accuracy_threshold 23.0028
euclidean_f1 0.6567
euclidean_f1_threshold 26.7655
euclidean_precision 0.55
euclidean_recall 0.8148
euclidean_ap 0.6217
max_accuracy 0.7539
max_accuracy_threshold 729.0325
max_f1 0.66
max_f1_threshold 838.3989
max_precision 0.5546
max_recall 0.8148
max_ap 0.6244

Binary Classification (Qnli-dev)

Metric Value
cosine_accuracy 0.7383
cosine_accuracy_threshold 0.6308
cosine_f1 0.7357
cosine_f1_threshold 0.5678
cosine_precision 0.6561
cosine_recall 0.8374
cosine_ap 0.7842
dot_accuracy 0.7109
dot_accuracy_threshold 541.8419
dot_f1 0.7154
dot_f1_threshold 538.5023
dot_precision 0.6788
dot_recall 0.7561
dot_ap 0.7499
manhattan_accuracy 0.7461
manhattan_accuracy_threshold 787.5203
manhattan_f1 0.728
manhattan_f1_threshold 831.8275
manhattan_precision 0.7165
manhattan_recall 0.7398
manhattan_ap 0.7942
euclidean_accuracy 0.75
euclidean_accuracy_threshold 25.2211
euclidean_f1 0.7292
euclidean_f1_threshold 28.076
euclidean_precision 0.6558
euclidean_recall 0.8211
euclidean_ap 0.7942
max_accuracy 0.75
max_accuracy_threshold 787.5203
max_f1 0.7357
max_f1_threshold 831.8275
max_precision 0.7165
max_recall 0.8374
max_ap 0.7942
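
Both binary-classification blocks above report the metric set produced by the BinaryClassificationEvaluator. A minimal sketch of that evaluator follows; the sentence pairs and labels are placeholders, since the allNLI-dev and Qnli-dev pair sets are not included in this card:

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator

model = SentenceTransformer("bobox/DeBERTa2-0.9B-ST-stsb-checkpoints-tmp")

# Placeholder pairs: label 1 marks a matching pair, 0 a non-matching pair.
sentences1 = ["A man is playing a flute.", "A dog jumps through a hoop."]
sentences2 = ["A man is playing a large flute.", "The dogs are alone in the forest."]
labels = [1, 0]

evaluator = BinaryClassificationEvaluator(sentences1, sentences2, labels, name="allNLI-dev")
print(evaluator(model))  # accuracy, F1, precision, recall and AP for each distance function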

Training Details

Training Dataset

sentence-transformers/stsb

  • Dataset: sentence-transformers/stsb at ab7a5ac
  • Size: 5,749 training samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    • sentence1: string, min 6 / mean 9.81 / max 27 tokens
    • sentence2: string, min 5 / mean 9.74 / max 25 tokens
    • score: float, min 0.0 / mean 0.54 / max 1.0
  • Samples:
    • "A plane is taking off." / "An air plane is taking off." (score: 1.0)
    • "A man is playing a large flute." / "A man is playing a flute." (score: 0.76)
    • "A man is spreading shreded cheese on a pizza." / "A man is spreading shredded cheese on an uncooked pizza." (score: 0.76)
  • Loss: AnglELoss with these parameters:
    {
        "scale": 20.0,
        "similarity_fct": "pairwise_angle_sim"
    }
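
The loss above trains on each (sentence1, sentence2, score) triple with the angle-optimized objective from the AnglE paper. A short construction sketch using the base model named earlier in this card:

from sentence_transformers import SentenceTransformer
from sentence_transformers.losses import AnglELoss

model = SentenceTransformer("BXresearch/DeBERTa2-0.9B-ST-v2")
# scale=20.0 and the pairwise_angle_sim similarity match the parameters listed above.
loss = AnglELoss(model, scale=20.0)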
    

Evaluation Dataset

sentence-transformers/stsb

  • Dataset: sentence-transformers/stsb at ab7a5ac
  • Size: 512 evaluation samples
  • Columns: sentence1, sentence2, and score
  • Approximate statistics based on the first 1000 samples:
    • sentence1: string, min 6 / mean 11.16 / max 26 tokens
    • sentence2: string, min 6 / mean 11.17 / max 23 tokens
    • score: float, min 0.0 / mean 0.47 / max 1.0
  • Samples:
    • "A man with a hard hat is dancing." / "A man wearing a hard hat is dancing." (score: 1.0)
    • "A young child is riding a horse." / "A child is riding a horse." (score: 0.95)
    • "A man is feeding a mouse to a snake." / "The man is feeding a mouse to the snake." (score: 1.0)
  • Loss: AnglELoss with these parameters:
    {
        "scale": 20.0,
        "similarity_fct": "pairwise_angle_sim"
    }
    

Training Hyperparameters

Non-Default Hyperparameters

  • eval_strategy: steps
  • per_device_train_batch_size: 4
  • per_device_eval_batch_size: 256
  • gradient_accumulation_steps: 4
  • learning_rate: 1e-05
  • weight_decay: 0.001
  • num_train_epochs: 2
  • lr_scheduler_type: cosine_with_min_lr
  • lr_scheduler_kwargs: {'num_cycles': 0.5, 'min_lr': 1.0000000000000002e-06}
  • warmup_ratio: 0.2
  • save_safetensors: False
  • fp16: True
  • push_to_hub: True
  • hub_model_id: bobox/DeBERTa2-0.9B-ST-stsb-checkpoints-tmp
  • hub_strategy: all_checkpoints
  • batch_sampler: no_duplicates
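
A hedged sketch of a training setup matching the non-default values above; argument and class names follow the Sentence Transformers 3.x trainer API, and the 512-sample evaluation subset is approximated by slicing the stsb validation split, since the exact subset is not specified:

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
from sentence_transformers.losses import AnglELoss
from sentence_transformers.training_args import SentenceTransformerTrainingArguments, BatchSamplers

model = SentenceTransformer("BXresearch/DeBERTa2-0.9B-ST-v2")
train_dataset = load_dataset("sentence-transformers/stsb", split="train")
# The card reports 512 evaluation samples; which samples were used is not specified.
eval_dataset = load_dataset("sentence-transformers/stsb", split="validation").select(range(512))
loss = AnglELoss(model, scale=20.0)

args = SentenceTransformerTrainingArguments(
    output_dir="DeBERTa2-0.9B-ST-stsb",
    eval_strategy="steps",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=256,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    weight_decay=0.001,
    num_train_epochs=2,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"num_cycles": 0.5, "min_lr": 1e-6},
    warmup_ratio=0.2,
    fp16=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
)
trainer.train()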

All Hyperparameters

  • overwrite_output_dir: False
  • do_predict: False
  • eval_strategy: steps
  • prediction_loss_only: True
  • per_device_train_batch_size: 4
  • per_device_eval_batch_size: 256
  • per_gpu_train_batch_size: None
  • per_gpu_eval_batch_size: None
  • gradient_accumulation_steps: 4
  • eval_accumulation_steps: None
  • learning_rate: 1e-05
  • weight_decay: 0.001
  • adam_beta1: 0.9
  • adam_beta2: 0.999
  • adam_epsilon: 1e-08
  • max_grad_norm: 1.0
  • num_train_epochs: 2
  • max_steps: -1
  • lr_scheduler_type: cosine_with_min_lr
  • lr_scheduler_kwargs: {'num_cycles': 0.5, 'min_lr': 1.0000000000000002e-06}
  • warmup_ratio: 0.2
  • warmup_steps: 0
  • log_level: passive
  • log_level_replica: warning
  • log_on_each_node: True
  • logging_nan_inf_filter: True
  • save_safetensors: False
  • save_on_each_node: False
  • save_only_model: False
  • restore_callback_states_from_checkpoint: False
  • no_cuda: False
  • use_cpu: False
  • use_mps_device: False
  • seed: 42
  • data_seed: None
  • jit_mode_eval: False
  • use_ipex: False
  • bf16: False
  • fp16: True
  • fp16_opt_level: O1
  • half_precision_backend: auto
  • bf16_full_eval: False
  • fp16_full_eval: False
  • tf32: None
  • local_rank: 0
  • ddp_backend: None
  • tpu_num_cores: None
  • tpu_metrics_debug: False
  • debug: []
  • dataloader_drop_last: False
  • dataloader_num_workers: 0
  • dataloader_prefetch_factor: None
  • past_index: -1
  • disable_tqdm: False
  • remove_unused_columns: True
  • label_names: None
  • load_best_model_at_end: False
  • ignore_data_skip: False
  • fsdp: []
  • fsdp_min_num_params: 0
  • fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
  • fsdp_transformer_layer_cls_to_wrap: None
  • accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
  • deepspeed: None
  • label_smoothing_factor: 0.0
  • optim: adamw_torch
  • optim_args: None
  • adafactor: False
  • group_by_length: False
  • length_column_name: length
  • ddp_find_unused_parameters: None
  • ddp_bucket_cap_mb: None
  • ddp_broadcast_buffers: False
  • dataloader_pin_memory: True
  • dataloader_persistent_workers: False
  • skip_memory_metrics: True
  • use_legacy_prediction_loop: False
  • push_to_hub: True
  • resume_from_checkpoint: None
  • hub_model_id: bobox/DeBERTa2-0.9B-ST-stsb-checkpoints-tmp
  • hub_strategy: all_checkpoints
  • hub_private_repo: False
  • hub_always_push: False
  • gradient_checkpointing: False
  • gradient_checkpointing_kwargs: None
  • include_inputs_for_metrics: False
  • eval_do_concat_batches: True
  • fp16_backend: auto
  • push_to_hub_model_id: None
  • push_to_hub_organization: None
  • mp_parameters:
  • auto_find_batch_size: False
  • full_determinism: False
  • torchdynamo: None
  • ray_scope: last
  • ddp_timeout: 1800
  • torch_compile: False
  • torch_compile_backend: None
  • torch_compile_mode: None
  • dispatch_batches: None
  • split_batches: None
  • include_tokens_per_second: False
  • include_num_input_tokens_seen: False
  • neftune_noise_alpha: None
  • optim_target_modules: None
  • batch_eval_metrics: False
  • eval_on_start: False
  • batch_sampler: no_duplicates
  • multi_dataset_batch_sampler: proportional

Training Logs

Epoch Step Training Loss Validation Loss Qnli-dev_max_ap allNLI-dev_max_ap sts-test_spearman_cosine
0.0056 2 1.1634 - - - -
0.0111 4 1.1431 - - - -
0.0167 6 2.2064 - - - -
0.0223 8 1.4548 - - - -
0.0278 10 1.4417 - - - -
0.0334 12 0.7039 - - - -
0.0389 14 0.8871 - - - -
0.0445 16 1.3651 - - - -
0.0501 18 1.211 - - - -
0.0556 20 1.2555 - - - -
0.0612 22 1.272 - - - -
0.0668 24 1.0434 - - - -
0.0723 26 0.8263 - - - -
0.0779 28 1.1717 - - - -
0.0834 30 0.9858 - - - -
0.0890 32 0.8084 - - - -
0.0946 34 1.6431 - - - -
0.1001 36 1.6234 1.1413 0.7942 0.6244 0.9291

Framework Versions

  • Python: 3.10.12
  • Sentence Transformers: 3.0.1
  • Transformers: 4.42.4
  • PyTorch: 2.4.0+cu121
  • Accelerate: 0.32.1
  • Datasets: 2.21.0
  • Tokenizers: 0.19.1
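
To reproduce this environment, the listed versions can be pinned; a suggested command, not part of the original card:

pip install sentence-transformers==3.0.1 transformers==4.42.4 torch==2.4.0 accelerate==0.32.1 datasets==2.21.0 tokenizers==0.19.1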

Citation

BibTeX

Sentence Transformers

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

AnglELoss

@misc{li2023angleoptimized,
    title={AnglE-optimized Text Embeddings}, 
    author={Xianming Li and Jing Li},
    year={2023},
    eprint={2309.12871},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}