Initial commit
- .gitattributes +1 -0
- README.md +153 -0
- benchmark_results.txt +7 -0
- benchmark_translations.zip +3 -0
- config.json +45 -0
- pytorch_model.bin +3 -0
- source.spm +3 -0
- special_tokens_map.json +1 -0
- target.spm +3 -0
- tokenizer_config.json +1 -0
- vocab.json +0 -0
    	
.gitattributes
CHANGED

@@ -26,3 +26,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.spm filter=lfs diff=lfs merge=lfs -text
    	
        README.md
    ADDED
    
@@ -0,0 +1,153 @@
---
language:
- en
- it

tags:
- translation

license: cc-by-4.0
model-index:
- name: opus-mt-tc-big-it-en
  results:
  - task:
      name: Translation ita-eng
      type: translation
      args: ita-eng
    dataset:
      name: flores101-devtest
      type: flores_101
      args: ita eng devtest
    metrics:
    - name: BLEU
      type: bleu
      value: 32.8
  - task:
      name: Translation ita-eng
      type: translation
      args: ita-eng
    dataset:
      name: tatoeba-test-v2021-08-07
      type: tatoeba_mt
      args: ita-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 72.1
  - task:
      name: Translation ita-eng
      type: translation
      args: ita-eng
    dataset:
      name: newstest2009
      type: wmt-2009-news
      args: ita-eng
    metrics:
    - name: BLEU
      type: bleu
      value: 34.3
---
# opus-mt-tc-big-it-en

Neural machine translation model for translating from Italian (it) to English (en).

This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models were originally trained with [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++, and have been converted to PyTorch using the Hugging Face transformers library. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines use the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).

* Publications: [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) and [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/) (Please cite these if you use this model.)

```
@inproceedings{tiedemann-thottingal-2020-opus,
    title = "{OPUS}-{MT} {--} Building open translation services for the World",
    author = {Tiedemann, J{\"o}rg  and Thottingal, Santhosh},
    booktitle = "Proceedings of the 22nd Annual Conference of the European Association for Machine Translation",
    month = nov,
    year = "2020",
    address = "Lisboa, Portugal",
    publisher = "European Association for Machine Translation",
    url = "https://aclanthology.org/2020.eamt-1.61",
    pages = "479--480",
}

@inproceedings{tiedemann-2020-tatoeba,
    title = "The Tatoeba Translation Challenge {--} Realistic Data Sets for Low Resource and Multilingual {MT}",
    author = {Tiedemann, J{\"o}rg},
    booktitle = "Proceedings of the Fifth Conference on Machine Translation",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.wmt-1.139",
    pages = "1174--1182",
}
```

## Model info

* Release: 2022-02-25
* source language(s): ita
* target language(s): eng
* model: transformer-big
* data: opusTCv20210807+bt ([source](https://github.com/Helsinki-NLP/Tatoeba-Challenge))
* tokenization: SentencePiece (spm32k,spm32k); see the sketch after this list
* original model: [opusTCv20210807+bt_transformer-big_2022-02-25.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/ita-eng/opusTCv20210807+bt_transformer-big_2022-02-25.zip)
* more information on released models: [OPUS-MT ita-eng README](https://github.com/Helsinki-NLP/Tatoeba-Challenge/tree/master/models/ita-eng/README.md)
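
The repository also ships the SentencePiece models themselves (`source.spm`, `target.spm`). As a minimal sketch, assuming the `sentencepiece` package is installed and `source.spm` has been downloaded from this repository, you can inspect how the source tokenizer segments Italian text:

```python
# Minimal sketch, assuming `sentencepiece` is installed and `source.spm`
# has been downloaded from this repository.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="source.spm")

# Segment an Italian sentence into subword pieces (output is illustrative).
print(sp.encode("So chi è il mio nemico.", out_type=str))

# Size of the source-side SentencePiece vocabulary (spm32k).
print(sp.get_piece_size())
```
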
## Usage

A short code example:

```python
from transformers import MarianMTModel, MarianTokenizer

src_text = [
    "So chi è il mio nemico.",
    "Tom è illetterato; non capisce assolutamente nulla."
]

model_name = "pytorch-models/opus-mt-tc-big-it-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))

for t in translated:
    print(tokenizer.decode(t, skip_special_tokens=True))

# expected output:
#     I know who my enemy is.
#     Tom is illiterate; he understands absolutely nothing.
```

You can also use OPUS-MT models with the transformers pipeline, for example:

```python
from transformers import pipeline
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-big-it-en")
print(pipe("So chi è il mio nemico."))

# expected output: I know who my enemy is.
```

## Benchmarks

* test set translations: [opusTCv20210807+bt_transformer-big_2022-02-25.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/ita-eng/opusTCv20210807+bt_transformer-big_2022-02-25.test.txt)
* test set scores: [opusTCv20210807+bt_transformer-big_2022-02-25.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/ita-eng/opusTCv20210807+bt_transformer-big_2022-02-25.eval.txt)
* benchmark results: [benchmark_results.txt](benchmark_results.txt)
* benchmark output: [benchmark_translations.zip](benchmark_translations.zip)

| langpair | testset | chr-F | BLEU | #sent | #words |
|----------|---------|-------|------|-------|--------|
| ita-eng | tatoeba-test-v2021-08-07 | 0.82288 | 72.1 | 17320 | 119214 |
| ita-eng | flores101-devtest | 0.62115 | 32.8 | 1012 | 24721 |
| ita-eng | newssyscomb2009 | 0.59822 | 34.4 | 502 | 11818 |
| ita-eng | newstest2009 | 0.59646 | 34.3 | 2525 | 65399 |
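
Scores of this kind are typically computed with [sacrebleu](https://github.com/mjpost/sacrebleu); the official numbers come from the linked eval files above. A minimal sketch, assuming aligned plain-text hypothesis and reference files (the file names below are placeholders):

```python
# Minimal sketch for scoring translations with sacrebleu; the file names
# are placeholders, not files shipped with this repository.
import sacrebleu

with open("hypotheses.en") as f:   # system output, one sentence per line
    hyps = [line.rstrip("\n") for line in f]
with open("references.en") as f:   # aligned reference translations
    refs = [line.rstrip("\n") for line in f]

print(sacrebleu.corpus_bleu(hyps, [refs]).score)  # BLEU, as in the table above
print(sacrebleu.corpus_chrf(hyps, [refs]).score)  # chr-F, as in the table above
```
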
## Acknowledgements

The work is supported by the [European Language Grid](https://www.european-language-grid.eu/) as [pilot project 2866](https://live.european-language-grid.eu/catalogue/#/resource/projects/2866), by the [FoTran project](https://www.helsinki.fi/en/researchgroups/natural-language-understanding-with-cross-lingual-grounding), funded by the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme (grant agreement No 771113), and the [MeMAD project](https://memad.eu/), funded by the European Union’s Horizon 2020 Research and Innovation Programme under grant agreement No 780069. We are also grateful for the generous computational resources and IT infrastructure provided by [CSC -- IT Center for Science](https://www.csc.fi/), Finland.

## Model conversion info

* transformers version: 4.16.2
* OPUS-MT git hash: 3405783
* port time: Wed Apr 13 19:40:08 EEST 2022
* port machine: LM0-400-22516.local
    	
        benchmark_results.txt
    ADDED
    
@@ -0,0 +1,7 @@
ita-eng	flores101-dev	0.62411	32.8	997	23555
ita-eng	flores101-devtest	0.62115	32.8	1012	24721
ita-eng	newssyscomb2009	0.59822	34.4	502	11818
ita-eng	newstest2009	0.59646	34.3	2525	65399
ita-eng	tatoeba-test-v2020-07-28	0.80870	70.5	10000	67385
ita-eng	tatoeba-test-v2021-03-30	0.81449	71.0	13443	92322
ita-eng	tatoeba-test-v2021-08-07	0.82288	72.1	17320	119214
    	
        benchmark_translations.zip
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee7447e4aaf63f1e98a470cd958f7d40765bdfbcfc6d09aa8d4e7d46802f10a2
size 1841089
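
These three lines are a Git LFS pointer: the archive itself is stored out of band and identified by its SHA-256 digest. A minimal sketch, assuming the artifact has been downloaded locally, for checking it against the pointer's oid:

```python
# Minimal sketch: verify a downloaded LFS artifact against the sha256
# digest recorded in the pointer above.
import hashlib

def sha256_of(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "ee7447e4aaf63f1e98a470cd958f7d40765bdfbcfc6d09aa8d4e7d46802f10a2"
print(sha256_of("benchmark_translations.zip") == expected)
```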
    	
        config.json
    ADDED
    
@@ -0,0 +1,45 @@
{
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      54421
    ]
  ],
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 54421,
  "decoder_vocab_size": 54422,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 43017,
  "forced_eos_token_id": 43017,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "max_length": 512,
  "max_position_embeddings": 1024,
  "model_type": "marian",
  "normalize_embedding": false,
  "num_beams": 4,
  "num_hidden_layers": 6,
  "pad_token_id": 54421,
  "scale_embedding": true,
  "share_encoder_decoder_embeddings": true,
  "static_position_embeddings": true,
  "torch_dtype": "float16",
  "transformers_version": "4.18.0.dev0",
  "use_cache": true,
  "vocab_size": 54422
}
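
For orientation, these hyperparameters describe the transformer-big architecture named in the README. A minimal sketch, assuming `transformers` is installed, for inspecting them programmatically:

```python
# Minimal sketch: load the published configuration and read off the
# architecture hyperparameters listed above.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Helsinki-NLP/opus-mt-tc-big-it-en")
print(config.model_type)                             # marian
print(config.d_model)                                # 1024
print(config.encoder_layers, config.decoder_layers)  # 6 6
print(config.vocab_size)                             # 54422
```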
    	
        pytorch_model.bin
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:08e29beb3ff123414b3aaa516280986e328db9de5e3b7ac8c64955cc50e51ea2
size 575827779
    	
        source.spm
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4e900c0f10324efc6ac67a5246cec9b0faf693ea1ea9fa1786a9f3755a59a73a
size 820134
    	
        special_tokens_map.json
    ADDED
    
@@ -0,0 +1 @@
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
    	
        target.spm
    ADDED
    
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8e6799550a391c7a30ae95869c4692a00352e05fdf1ffcf9b50d96f08b0327bf
size 802852
    	
        tokenizer_config.json
    ADDED
    
@@ -0,0 +1 @@
{"source_lang": "it", "target_lang": "en", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "separate_vocabs": false, "special_tokens_map_file": null, "name_or_path": "marian-models/opusTCv20210807+bt_transformer-big_2022-02-25/it-en", "tokenizer_class": "MarianTokenizer"}
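
A minimal sketch, assuming `transformers` is installed, showing how these settings surface on the loaded tokenizer:

```python
# Minimal sketch: the published tokenizer configuration, as seen from
# the MarianTokenizer loaded off the Hub.
from transformers import MarianTokenizer

tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-tc-big-it-en")
print(tokenizer.source_lang, tokenizer.target_lang)                   # it en
print(tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token)  # </s> <pad> <unk>
print(tokenizer.model_max_length)                                     # 512
```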
    	
        vocab.json
    ADDED
    
The diff for this file is too large to render. See raw diff.