Initial Commit for Namo Turn Detector v1 (#1)
Browse files- Initial Commit for Namo Turn Detector v1 (10a203de329613240aa98bb9533a788f2ff12998)
- .gitattributes +2 -0
- README.md +225 -0
- config.json +29 -0
- confusion_matrices.png +3 -0
- model.onnx +3 -0
- model_quant.onnx +3 -0
- performance_analysis.png +3 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +63 -0
- vocab.txt +0 -0
    	
        .gitattributes
    CHANGED
    
    | @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text | |
| 33 | 
             
            *.zip filter=lfs diff=lfs merge=lfs -text
         | 
| 34 | 
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         | 
| 35 | 
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
|  | |
|  | 
|  | |
| 33 | 
             
            *.zip filter=lfs diff=lfs merge=lfs -text
         | 
| 34 | 
             
            *.zst filter=lfs diff=lfs merge=lfs -text
         | 
| 35 | 
             
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
| 36 | 
            +
            confusion_matrices.png filter=lfs diff=lfs merge=lfs -text
         | 
| 37 | 
            +
            performance_analysis.png filter=lfs diff=lfs merge=lfs -text
         | 
    	
        README.md
    ADDED
    
    | @@ -0,0 +1,225 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            language: no
         | 
| 3 | 
            +
            license: apache-2.0
         | 
| 4 | 
            +
            library_name: onnxruntime
         | 
| 5 | 
            +
            pipeline_tag: text-classification
         | 
| 6 | 
            +
            tags:
         | 
| 7 | 
            +
            - turn-detection
         | 
| 8 | 
            +
            - end-of-utterance
         | 
| 9 | 
            +
            - distilbert
         | 
| 10 | 
            +
            - onnx
         | 
| 11 | 
            +
            - quantized
         | 
| 12 | 
            +
            - conversational-ai
         | 
| 13 | 
            +
            - voice-assistant
         | 
| 14 | 
            +
            - real-time
         | 
| 15 | 
            +
            base_model: distilbert-base-multilingual-cased
         | 
| 16 | 
            +
            datasets:
         | 
| 17 | 
            +
            - videosdk-live/Namo-Turn-Detector-v1-Train
         | 
| 18 | 
            +
            model-index:
         | 
| 19 | 
            +
            - name: Namo Turn Detector v1 - Norwegian
         | 
| 20 | 
            +
              results:
         | 
| 21 | 
            +
              - task:
         | 
| 22 | 
            +
                  type: text-classification
         | 
| 23 | 
            +
                  name: Turn Detection
         | 
| 24 | 
            +
                dataset:
         | 
| 25 | 
            +
                  name: Namo Turn Detector v1 Test - Norwegian
         | 
| 26 | 
            +
                  type: videosdk-live/Namo-Turn-Detector-v1-Test
         | 
| 27 | 
            +
                  split: train
         | 
| 28 | 
            +
                metrics:
         | 
| 29 | 
            +
                - type: accuracy
         | 
| 30 | 
            +
                  value: 0.873482
         | 
| 31 | 
            +
                  name: Accuracy
         | 
| 32 | 
            +
                - type: f1
         | 
| 33 | 
            +
                  value: 0.882739
         | 
| 34 | 
            +
                  name: F1 Score
         | 
| 35 | 
            +
                - type: precision
         | 
| 36 | 
            +
                  value: 0.834960
         | 
| 37 | 
            +
                  name: Precision
         | 
| 38 | 
            +
                - type: recall
         | 
| 39 | 
            +
                  value: 0.936318
         | 
| 40 | 
            +
                  name: Recall
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            ---
         | 
| 43 | 
            +
             | 
| 44 | 
            +
            # 🎯 Namo Turn Detector v1 - Norwegian
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            <div align="center">
         | 
| 47 | 
            +
             | 
| 48 | 
            +
            [License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
         | 
| 49 | 
            +
            [ONNX](https://onnx.ai/)
         | 
| 50 | 
            +
            [Model on Hugging Face](https://huggingface.co/videosdk-live/Namo-Turn-Detector-v1-Norwegian)
         | 
| 51 | 
            +
            []()
         | 
| 52 | 
            +
             | 
| 53 | 
            +
            **🚀 Namo Turn Detection Model for Norwegian**
         | 
| 54 | 
            +
             | 
| 55 | 
            +
            </div>
         | 
| 56 | 
            +
             | 
| 57 | 
            +
            ---
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            ## 📋 Overview
         | 
| 60 | 
            +
             | 
| 61 | 
            +
            The **Namo Turn Detector** is a specialized AI model designed to solve one of the most challenging problems in conversational AI: **knowing when a user has finished speaking**. 
         | 
| 62 | 
            +
             | 
| 63 | 
            +
            This Norwegian-specialist model uses advanced natural language understanding to distinguish between:
         | 
| 64 | 
            +
            - ✅ **Complete utterances** (user is done speaking)
         | 
| 65 | 
            +
            - 🔄 **Incomplete utterances** (user will continue speaking)
         | 
| 66 | 
            +
             | 
| 67 | 
            +
            Built on the DistilBERT architecture and optimized with a quantized ONNX format, it delivers enterprise-grade performance with minimal latency.
         | 
| 68 | 
            +
             | 
| 69 | 
            +
            ## 🔑 Key Features
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            - **Turn Detection Specialist**: Detects end-of-turn vs. continuation in Norwegian speech transcripts.  
         | 
| 72 | 
            +
            - **Low Latency**: Optimized with **quantized ONNX** for <12ms inference.  
         | 
| 73 | 
            +
            - **Robust Performance**: 87.3% accuracy on diverse Norwegian utterances.  
         | 
| 74 | 
            +
            - **Easy Integration**: Compatible with Python, ONNX Runtime, and VideoSDK Agents SDK.  
         | 
| 75 | 
            +
            - **Enterprise Ready**: Supports real-time conversational AI and voice assistants.  
         | 
| 76 | 
            +
             | 
| 77 | 
            +
            ## 📊 Performance Metrics
         | 
| 78 | 
            +
            <div>
         | 
| 79 | 
            +
             | 
| 80 | 
            +
            | Metric | Score |
         | 
| 81 | 
            +
            |--------|-------|
         | 
| 82 | 
            +
            | **🎯 Accuracy** | **87.34%** | 
         | 
| 83 | 
            +
            | **📈 F1-Score** | **88.27%** |
         | 
| 84 | 
            +
            | **🎪 Precision** | **83.49%** |
         | 
| 85 | 
            +
            | **🎭 Recall** | **93.63%** |
         | 
| 86 | 
            +
            | **⚡ Latency** | **<12ms** |
         | 
| 87 | 
            +
            | **💾 Model Size** | **~135MB** |
         | 
| 88 | 
            +
             | 
| 89 | 
            +
            </div>
         | 
| 90 | 
            +
            <img src="./confusion_matrices.png" alt="Confusion matrices for the Norwegian turn detector" width="600" height="400"/>
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            > 📊 *Evaluated on 1500+ Norwegian utterances from diverse conversational contexts*
         | 
| 93 | 
            +
             | 
| 94 | 
            +
            ## ⚡️ Speed Analysis
         | 
| 95 | 
            +
             | 
| 96 | 
            +
            <img src="./performance_analysis.png" alt="Inference speed analysis for the Norwegian turn detector" width="600" height="400"/>
         | 
| 97 | 
            +
             | 
| 98 | 
            +
            ## 🔧 Train & Test Scripts
         | 
| 99 | 
            +
             | 
| 100 | 
            +
            <div align="center">
         | 
| 101 | 
            +
             | 
| 102 | 
            +
            [](https://colab.research.google.com/drive/1DqSUYfcya0r2iAEZB9fS4mfrennubduV) [](https://colab.research.google.com/drive/19ZOlNoHS2WLX2V4r5r492tsCUnYLXnQR)
         | 
| 103 | 
            +
             | 
| 104 | 
            +
            </div>
         | 
| 105 | 
            +
             | 
| 106 | 
            +
            ## 🛠️ Installation
         | 
| 107 | 
            +
             | 
| 108 | 
            +
            To use this model, you will need to install the following libraries.
         | 
| 109 | 
            +
             | 
| 110 | 
            +
            ```bash
         | 
| 111 | 
            +
            pip install onnxruntime transformers huggingface_hub
         | 
| 112 | 
            +
            ```
         | 
| 113 | 
            +
             | 
| 114 | 
            +
            ## 🚀 Quick Start
         | 
| 115 | 
            +
             | 
| 116 | 
            +
            You can run inference directly from the Hugging Face repository.
         | 
| 117 | 
            +
             | 
| 118 | 
            +
            ```python
         | 
| 119 | 
            +
            import numpy as np
         | 
| 120 | 
            +
            import onnxruntime as ort
         | 
| 121 | 
            +
            from transformers import AutoTokenizer
         | 
| 122 | 
            +
            from huggingface_hub import hf_hub_download
         | 
| 123 | 
            +
             | 
| 124 | 
            +
            class TurnDetector:
         | 
| 125 | 
            +
                def __init__(self, repo_id="videosdk-live/Namo-Turn-Detector-v1-Norwegian"):
         | 
| 126 | 
            +
                    """
         | 
| 127 | 
            +
                    Initializes the detector by downloading the model and tokenizer
         | 
| 128 | 
            +
                    from the Hugging Face Hub.
         | 
| 129 | 
            +
                    """
         | 
| 130 | 
            +
                    print(f"Loading model from repo: {repo_id}")
         | 
| 131 | 
            +
                    
         | 
| 132 | 
            +
                    # Download the model and tokenizer from the Hub
         | 
| 133 | 
            +
                    # Authentication is handled automatically if you are logged in
         | 
| 134 | 
            +
                    model_path = hf_hub_download(repo_id=repo_id, filename="model_quant.onnx")
         | 
| 135 | 
            +
                    self.tokenizer = AutoTokenizer.from_pretrained(repo_id)
         | 
| 136 | 
            +
                    
         | 
| 137 | 
            +
                    # Set up the ONNX Runtime inference session
         | 
| 138 | 
            +
                    self.session = ort.InferenceSession(model_path)
         | 
| 139 | 
            +
                    self.max_length = 512
         | 
| 140 | 
            +
                    print("✅ Model and tokenizer loaded successfully.")
         | 
| 141 | 
            +
             | 
| 142 | 
            +
                def predict(self, text: str) -> str:
         | 
| 143 | 
            +
                    """
         | 
| 144 | 
            +
                    Predicts if a given text utterance is the end of a turn.
         | 
| 145 | 
            +
                    Returns "End of Turn" or "Not End of Turn".
         | 
| 146 | 
            +
                    """
         | 
| 147 | 
            +
                    # Tokenize the input text
         | 
| 148 | 
            +
                    inputs = self.tokenizer(
         | 
| 149 | 
            +
                        text,
         | 
| 150 | 
            +
                        truncation=True,
         | 
| 151 | 
            +
                        max_length=self.max_length,
         | 
| 152 | 
            +
                        return_tensors="np"
         | 
| 153 | 
            +
                    )
         | 
| 154 | 
            +
                    
         | 
| 155 | 
            +
                    # Prepare the feed dictionary for the ONNX model
         | 
| 156 | 
            +
                    feed_dict = {
         | 
| 157 | 
            +
                        "input_ids": inputs["input_ids"],
         | 
| 158 | 
            +
                        "attention_mask": inputs["attention_mask"]
         | 
| 159 | 
            +
                    }
         | 
| 160 | 
            +
                    
         | 
| 161 | 
            +
                    # Run inference
         | 
| 162 | 
            +
                    outputs = self.session.run(None, feed_dict)
         | 
| 163 | 
            +
                    logits = outputs[0]  # session.run returns a list of output arrays; the logits are the first
         | 
| 164 | 
            +
                    
         | 
| 165 | 
            +
                    # Get the predicted class (0 or 1)
         | 
| 166 | 
            +
                    prediction_index = np.argmax(logits, axis=1)
         | 
| 167 | 
            +
                    
         | 
| 168 | 
            +
                    return "End of Turn" if prediction_index == 1 else "Not End of Turn"
         | 
| 169 | 
            +
             | 
| 170 | 
            +
            # --- Example Usage ---
         | 
| 171 | 
            +
            if __name__ == "__main__":
         | 
| 172 | 
            +
                detector = TurnDetector()
         | 
| 173 | 
            +
                
         | 
| 174 | 
            +
                sentences = [
         | 
| 175 | 
            +
                    "Noen typer korn er sunnere enn andre.",      # Expected: End of Turn
         | 
| 176 | 
            +
                    "Euklidts elementer ble en ofte brukt, vel?" # Expected: Not End of Turn
         | 
| 177 | 
            +
                ]
         | 
| 178 | 
            +
                
         | 
| 179 | 
            +
                for sentence in sentences:
         | 
| 180 | 
            +
                    result = detector.predict(sentence)
         | 
| 181 | 
            +
                    print(f"'{sentence}' -> {result}")
         | 
| 182 | 
            +
             | 
| 183 | 
            +
            ```
         | 
| 184 | 
            +
             | 
| 185 | 
            +
             | 
| 186 | 
            +
            ## 🤖 VideoSDK Agents Integration
         | 
| 187 | 
            +
             | 
| 188 | 
            +
            Integrate this turn detector directly with VideoSDK Agents for production-ready conversational AI applications.
         | 
| 189 | 
            +
             | 
| 190 | 
            +
            ```python
         | 
| 191 | 
            +
            from videosdk_agents import NamoTurnDetectorV1, pre_download_namo_turn_v1_model
         | 
| 192 | 
            +
             | 
| 193 | 
            +
            # Download the model
         | 
| 194 | 
            +
            pre_download_namo_turn_v1_model(language="no")
         | 
| 195 | 
            +
             | 
| 196 | 
            +
            # Initialize Norwegian turn detector for VideoSDK Agents
         | 
| 197 | 
            +
            turn_detector = NamoTurnDetectorV1(language="no")
         | 
| 198 | 
            +
            ```
         | 
| 199 | 
            +
             | 
| 200 | 
            +
            > 📚 [**Complete Integration Guide**](https://docs.videosdk.live/ai_agents/plugins/namo-turn-detector) - Learn how to use `NamoTurnDetectorV1` with VideoSDK Agents
         | 
| 201 | 
            +
             | 
| 202 | 
            +
            ## 📖 Citation
         | 
| 203 | 
            +
             | 
| 204 | 
            +
            ```bibtex
         | 
| 205 | 
            +
            @misc{namo_turn_detector_no_2025,
         | 
| 206 | 
            +
              title={Namo Turn Detector v1: Norwegian},
         | 
| 207 | 
            +
              author={VideoSDK Team},
         | 
| 208 | 
            +
              year={2025},
         | 
| 209 | 
            +
              publisher={Hugging Face},
         | 
| 210 | 
            +
              url={https://huggingface.co/videosdk-live/Namo-Turn-Detector-v1-Norwegian},
         | 
| 211 | 
            +
              note={ONNX-optimized DistilBERT for turn detection in Norwegian}
         | 
| 212 | 
            +
            }
         | 
| 213 | 
            +
            ```
         | 
| 214 | 
            +
             | 
| 215 | 
            +
            ## 📄 License
         | 
| 216 | 
            +
             | 
| 217 | 
            +
            This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details.
         | 
| 218 | 
            +
             | 
| 219 | 
            +
            <div align="center">
         | 
| 220 | 
            +
             | 
| 221 | 
            +
            **Made with ❤️ by the VideoSDK Team**
         | 
| 222 | 
            +
             | 
| 223 | 
            +
            [VideoSDK](https://videosdk.live)
         | 
| 224 | 
            +
             | 
| 225 | 
            +
            </div>
         | 
    	
        config.json
    ADDED
    
    | @@ -0,0 +1,29 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "activation": "gelu",
         | 
| 3 | 
            +
              "architectures": [
         | 
| 4 | 
            +
                "DistilBertForSequenceClassification"
         | 
| 5 | 
            +
              ],
         | 
| 6 | 
            +
              "attention_dropout": 0.1,
         | 
| 7 | 
            +
              "class_weights": [
         | 
| 8 | 
            +
                0.9951778825546507,
         | 
| 9 | 
            +
                1.004869075957585
         | 
| 10 | 
            +
              ],
         | 
| 11 | 
            +
              "dim": 768,
         | 
| 12 | 
            +
              "dropout": 0.1,
         | 
| 13 | 
            +
              "dtype": "float32",
         | 
| 14 | 
            +
              "hidden_dim": 3072,
         | 
| 15 | 
            +
              "initializer_range": 0.02,
         | 
| 16 | 
            +
              "max_position_embeddings": 512,
         | 
| 17 | 
            +
              "model_type": "distilbert",
         | 
| 18 | 
            +
              "n_heads": 12,
         | 
| 19 | 
            +
              "n_layers": 6,
         | 
| 20 | 
            +
              "output_past": true,
         | 
| 21 | 
            +
              "pad_token_id": 0,
         | 
| 22 | 
            +
              "problem_type": "single_label_classification",
         | 
| 23 | 
            +
              "qa_dropout": 0.1,
         | 
| 24 | 
            +
              "seq_classif_dropout": 0.2,
         | 
| 25 | 
            +
              "sinusoidal_pos_embds": false,
         | 
| 26 | 
            +
              "tie_weights_": true,
         | 
| 27 | 
            +
              "transformers_version": "4.53.3",
         | 
| 28 | 
            +
              "vocab_size": 119547
         | 
| 29 | 
            +
            }
         | 
    	
        confusion_matrices.png
    ADDED
    
    |   | 
| Git LFS Details
 | 
    	
        model.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:efe7693373c6721880959b00704d6e668f3d29622b8c4259901e2fb27f87ebb5
         | 
| 3 | 
            +
            size 541442940
         | 
    	
        model_quant.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:101ca887983edc24d1ec2a2056ddcaa1b201fcddd360c85380fdba1336598121
         | 
| 3 | 
            +
            size 135967547
         | 
    	
        performance_analysis.png
    ADDED
    
    |   | 
| Git LFS Details
 | 
    	
        special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1,37 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "cls_token": {
         | 
| 3 | 
            +
                "content": "[CLS]",
         | 
| 4 | 
            +
                "lstrip": false,
         | 
| 5 | 
            +
                "normalized": false,
         | 
| 6 | 
            +
                "rstrip": false,
         | 
| 7 | 
            +
                "single_word": false
         | 
| 8 | 
            +
              },
         | 
| 9 | 
            +
              "mask_token": {
         | 
| 10 | 
            +
                "content": "[MASK]",
         | 
| 11 | 
            +
                "lstrip": false,
         | 
| 12 | 
            +
                "normalized": false,
         | 
| 13 | 
            +
                "rstrip": false,
         | 
| 14 | 
            +
                "single_word": false
         | 
| 15 | 
            +
              },
         | 
| 16 | 
            +
              "pad_token": {
         | 
| 17 | 
            +
                "content": "[PAD]",
         | 
| 18 | 
            +
                "lstrip": false,
         | 
| 19 | 
            +
                "normalized": false,
         | 
| 20 | 
            +
                "rstrip": false,
         | 
| 21 | 
            +
                "single_word": false
         | 
| 22 | 
            +
              },
         | 
| 23 | 
            +
              "sep_token": {
         | 
| 24 | 
            +
                "content": "[SEP]",
         | 
| 25 | 
            +
                "lstrip": false,
         | 
| 26 | 
            +
                "normalized": false,
         | 
| 27 | 
            +
                "rstrip": false,
         | 
| 28 | 
            +
                "single_word": false
         | 
| 29 | 
            +
              },
         | 
| 30 | 
            +
              "unk_token": {
         | 
| 31 | 
            +
                "content": "[UNK]",
         | 
| 32 | 
            +
                "lstrip": false,
         | 
| 33 | 
            +
                "normalized": false,
         | 
| 34 | 
            +
                "rstrip": false,
         | 
| 35 | 
            +
                "single_word": false
         | 
| 36 | 
            +
              }
         | 
| 37 | 
            +
            }
         | 
    	
        tokenizer.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1,63 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "added_tokens_decoder": {
         | 
| 3 | 
            +
                "0": {
         | 
| 4 | 
            +
                  "content": "[PAD]",
         | 
| 5 | 
            +
                  "lstrip": false,
         | 
| 6 | 
            +
                  "normalized": false,
         | 
| 7 | 
            +
                  "rstrip": false,
         | 
| 8 | 
            +
                  "single_word": false,
         | 
| 9 | 
            +
                  "special": true
         | 
| 10 | 
            +
                },
         | 
| 11 | 
            +
                "100": {
         | 
| 12 | 
            +
                  "content": "[UNK]",
         | 
| 13 | 
            +
                  "lstrip": false,
         | 
| 14 | 
            +
                  "normalized": false,
         | 
| 15 | 
            +
                  "rstrip": false,
         | 
| 16 | 
            +
                  "single_word": false,
         | 
| 17 | 
            +
                  "special": true
         | 
| 18 | 
            +
                },
         | 
| 19 | 
            +
                "101": {
         | 
| 20 | 
            +
                  "content": "[CLS]",
         | 
| 21 | 
            +
                  "lstrip": false,
         | 
| 22 | 
            +
                  "normalized": false,
         | 
| 23 | 
            +
                  "rstrip": false,
         | 
| 24 | 
            +
                  "single_word": false,
         | 
| 25 | 
            +
                  "special": true
         | 
| 26 | 
            +
                },
         | 
| 27 | 
            +
                "102": {
         | 
| 28 | 
            +
                  "content": "[SEP]",
         | 
| 29 | 
            +
                  "lstrip": false,
         | 
| 30 | 
            +
                  "normalized": false,
         | 
| 31 | 
            +
                  "rstrip": false,
         | 
| 32 | 
            +
                  "single_word": false,
         | 
| 33 | 
            +
                  "special": true
         | 
| 34 | 
            +
                },
         | 
| 35 | 
            +
                "103": {
         | 
| 36 | 
            +
                  "content": "[MASK]",
         | 
| 37 | 
            +
                  "lstrip": false,
         | 
| 38 | 
            +
                  "normalized": false,
         | 
| 39 | 
            +
                  "rstrip": false,
         | 
| 40 | 
            +
                  "single_word": false,
         | 
| 41 | 
            +
                  "special": true
         | 
| 42 | 
            +
                }
         | 
| 43 | 
            +
              },
         | 
| 44 | 
            +
              "clean_up_tokenization_spaces": false,
         | 
| 45 | 
            +
              "cls_token": "[CLS]",
         | 
| 46 | 
            +
              "do_lower_case": false,
         | 
| 47 | 
            +
              "extra_special_tokens": {},
         | 
| 48 | 
            +
              "mask_token": "[MASK]",
         | 
| 49 | 
            +
              "max_length": 128,
         | 
| 50 | 
            +
              "model_max_length": 512,
         | 
| 51 | 
            +
              "pad_to_multiple_of": null,
         | 
| 52 | 
            +
              "pad_token": "[PAD]",
         | 
| 53 | 
            +
              "pad_token_type_id": 0,
         | 
| 54 | 
            +
              "padding_side": "right",
         | 
| 55 | 
            +
              "sep_token": "[SEP]",
         | 
| 56 | 
            +
              "stride": 0,
         | 
| 57 | 
            +
              "strip_accents": null,
         | 
| 58 | 
            +
              "tokenize_chinese_chars": true,
         | 
| 59 | 
            +
              "tokenizer_class": "DistilBertTokenizer",
         | 
| 60 | 
            +
              "truncation_side": "right",
         | 
| 61 | 
            +
              "truncation_strategy": "longest_first",
         | 
| 62 | 
            +
              "unk_token": "[UNK]"
         | 
| 63 | 
            +
            }
         | 
    	
        vocab.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
