BERT-Bytecode / tokenizer.json
lucapernice's picture
Add tokenizer
47ad9c0 verified
{
"version": "1.0",
"truncation": {
"direction": "Right",
"max_length": 256,
"strategy": "LongestFirst",
"stride": 0
},
"padding": {
"strategy": {
"Fixed": 256
},
"direction": "Right",
"pad_to_multiple_of": null,
"pad_id": 0,
"pad_type_id": 0,
"pad_token": "[PAD]"
},
"added_tokens": [
{
"id": 0,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": true
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
2
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
3
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"[PAD]": 0,
"[UNK]": 1,
"[CLS]": 2,
"[SEP]": 3,
"[MASK]": 4,
"0": 5,
"1": 6,
"2": 7,
"3": 8,
"4": 9,
"5": 10,
"6": 11,
"7": 12,
"8": 13,
"9": 14,
"10": 15,
"11": 16,
"12": 17,
"13": 18,
"14": 19,
"15": 20,
"16": 21,
"17": 22,
"18": 23,
"19": 24,
"20": 25,
"21": 26,
"22": 27,
"23": 28,
"24": 29,
"25": 30,
"26": 31,
"27": 32,
"28": 33,
"29": 34,
"30": 35,
"31": 36,
"32": 37,
"33": 38,
"34": 39,
"35": 40,
"36": 41,
"37": 42,
"38": 43,
"39": 44,
"40": 45,
"41": 46,
"42": 47,
"43": 48,
"44": 49,
"45": 50,
"46": 51,
"47": 52,
"48": 53,
"49": 54,
"50": 55,
"51": 56,
"52": 57,
"53": 58,
"54": 59,
"55": 60,
"56": 61,
"57": 62,
"58": 63,
"59": 64,
"60": 65,
"61": 66,
"62": 67,
"63": 68,
"64": 69,
"65": 70,
"66": 71,
"67": 72,
"68": 73,
"69": 74,
"70": 75,
"71": 76,
"72": 77,
"73": 78,
"74": 79,
"75": 80,
"76": 81,
"77": 82,
"78": 83,
"79": 84,
"80": 85,
"81": 86,
"82": 87,
"83": 88,
"84": 89,
"85": 90,
"86": 91,
"87": 92,
"88": 93,
"89": 94,
"90": 95,
"91": 96,
"92": 97,
"93": 98,
"94": 99,
"95": 100,
"96": 101,
"97": 102,
"98": 103,
"99": 104,
"100": 105,
"101": 106,
"102": 107,
"103": 108,
"104": 109,
"105": 110,
"106": 111,
"107": 112,
"108": 113,
"109": 114,
"110": 115,
"111": 116,
"112": 117,
"113": 118,
"114": 119,
"115": 120,
"116": 121,
"117": 122,
"118": 123,
"119": 124,
"120": 125,
"121": 126,
"122": 127,
"123": 128,
"124": 129,
"125": 130,
"126": 131,
"127": 132,
"128": 133,
"129": 134,
"130": 135,
"131": 136,
"132": 137,
"133": 138,
"134": 139,
"135": 140,
"136": 141,
"137": 142,
"138": 143,
"139": 144,
"140": 145,
"141": 146,
"142": 147,
"143": 148,
"144": 149,
"145": 150,
"146": 151,
"147": 152,
"148": 153,
"149": 154,
"150": 155,
"151": 156,
"152": 157,
"153": 158,
"154": 159,
"155": 160,
"156": 161,
"157": 162,
"158": 163,
"159": 164,
"160": 165,
"161": 166,
"162": 167,
"163": 168,
"164": 169,
"165": 170,
"166": 171,
"167": 172,
"168": 173,
"169": 174,
"170": 175,
"171": 176,
"172": 177,
"173": 178,
"174": 179,
"175": 180,
"176": 181,
"177": 182,
"178": 183,
"179": 184,
"180": 185,
"181": 186,
"182": 187,
"183": 188,
"184": 189,
"185": 190,
"186": 191,
"187": 192,
"188": 193,
"189": 194,
"190": 195,
"191": 196,
"192": 197,
"193": 198,
"194": 199,
"195": 200,
"196": 201,
"197": 202,
"198": 203,
"199": 204,
"200": 205,
"201": 206,
"202": 207,
"203": 208,
"204": 209,
"205": 210,
"206": 211,
"207": 212,
"208": 213,
"209": 214,
"210": 215,
"211": 216,
"212": 217,
"213": 218,
"214": 219,
"215": 220,
"216": 221,
"217": 222,
"218": 223,
"219": 224,
"220": 225,
"221": 226,
"222": 227,
"223": 228,
"224": 229,
"225": 230,
"226": 231,
"227": 232,
"228": 233,
"229": 234,
"230": 235,
"231": 236,
"232": 237,
"233": 238,
"234": 239,
"235": 240,
"236": 241,
"237": 242,
"238": 243,
"239": 244,
"240": 245,
"241": 246,
"242": 247,
"243": 248,
"244": 249,
"245": 250,
"246": 251,
"247": 252,
"248": 253,
"249": 254,
"250": 255,
"251": 256,
"252": 257,
"253": 258,
"254": 259,
"255": 260
}
}
}