# Plot bar charts and place them in the Gradio interface
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import multiprocessing
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import load_dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline,
)
with open("./code.html") as f:
    lines = f.readlines()

model = WhisperForConditionalGeneration.from_pretrained("hannatoenbreker/whisper-dutch")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Dutch", task="transcribe")
pipe = pipeline(model="openai/whisper-small")

def transcribe(audio):
    text = pipe(audio, generate_kwargs={"language": "<|nl|>", "task": "transcribe"})["text"]
    return text
def get_wer_chart(results, finetuned):
    index = [1, 2, 3, 4, 5]
    # Set the figure size
    wer = plt.figure(figsize=(10, 5))
    # Set the same y-axis scale for all charts
    #plt.ylim(0, 0.7)
    # Set the width of each bar
    bar_width = 0.2
    # Position of the bars on the x-axis
    r1 = np.arange(len(index))
    r2 = r1 + bar_width
    r3 = r2 + bar_width
    r4 = r3 + bar_width
    # Set the bar charts
    plt.bar(r1, finetuned['wer'], color='orange', width=bar_width)
    plt.bar(r2, results['wer'], color='blue', width=bar_width)
    plt.bar(r3, results['wav2vec2'], color='grey', width=bar_width)
    plt.bar(r4, results['baseline'], color='green', width=bar_width)
    # Set the labels
    plt.xlabel('Subgroup')
    plt.ylabel('WER')
    plt.title('WER per subgroup')
    # Set the x-axis ticks
    plt.xticks(r2 + bar_width, ['DC', 'DT', 'NNC', 'NNA', 'DOA'])
    # Set the legend
    plt.legend(['Finetuned model', 'Not finetuned model', 'wav2vec2', 'Baseline'])
    return wer
def get_cer_chart(results, finetuned):
    index = [1, 2, 3, 4, 5]
    # Set the figure size
    cer = plt.figure(figsize=(10, 5))
    # Set the same y-axis scale for all charts
    #plt.ylim(0, 0.7)
    # Set the width of each bar
    bar_width = 0.3
    # Position of the bars on the x-axis
    r1 = np.arange(len(index))
    r2 = r1 + bar_width
    # Set value on top of the bar
    #for index, value in enumerate(results['cer']):
    #    plt.text(index - 0.12, value + 0.01, str(round(value, 3)))
    # Set the bar chart
    plt.bar(r1, finetuned['cer'], color='orange', width=bar_width)
    plt.bar(r2, results['cer'], color='blue', width=bar_width)
    # Set the labels
    plt.xlabel('Subgroup')
    plt.ylabel('CER')
    plt.title('CER per subgroup')
    # Set the x-axis ticks
    plt.xticks(r1 + bar_width/2, ['DC', 'DT', 'NNC', 'NNA', 'DOA'])
    # Set the legend
    plt.legend(['Whisper small finetuned', 'Whisper small v1'])
    return cer
with gr.Blocks() as demo:
    data = pd.read_csv("./eval-whisper-small-results-v1.txt", sep=';', header=0)

    # SUBGROUPS:
    # GROUP 1: native children aged 7-11 (DC)
    # GROUP 2: native children aged 12-16 (DT)
    # GROUP 3: non-native children (NNC)
    # GROUP 4: non-native adults (NNA)
    # GROUP 5: native adults above 65 (DOA)
    baseline_read = [[1, 0.353], [2, 0.184], [3, 0.551], [4, 0.569], [5, 0.242]]
    baseline_hmi = [[1, 0.434], [2, 0.353], [3, 0.616], [4, 0.613], [5, 0.395]]
    # Set wav2vec2 results per subgroup
    wav2vec2_read = [[1, 0.188], [2, 0.120], [3, 0.303], [4, 0.332], [5, 0.123]]
    wav2vec2_hmi = [[1, 0.312], [2, 0.250], [3, 0.475], [4, 0.501], [5, 0.307]]
    # Create dataframes for the bar charts
    wav2vec2_r = pd.DataFrame(wav2vec2_read, columns=['group', 'wav2vec2'])
    wav2vec2_h = pd.DataFrame(wav2vec2_hmi, columns=['group', 'wav2vec2'])
    baseline_r = pd.DataFrame(baseline_read, columns=['group', 'baseline'])
    baseline_h = pd.DataFrame(baseline_hmi, columns=['group', 'baseline'])
    # Create dataframes for the components
    comp_p = data[data['component'] == 'comp-p']
    comp_q = data[data['component'] == 'comp-q']
    comp_p = comp_p.groupby(['group'])[['wer', 'cer']].mean()
    comp_q = comp_q.groupby(['group'])[['wer', 'cer']].mean()
    comp_p = comp_p.merge(baseline_h, how='left', on=['group'])
    comp_q = comp_q.merge(baseline_r, how='left', on=['group'])
    comp_p = comp_p.merge(wav2vec2_h, how='left', on=['group'])
    comp_q = comp_q.merge(wav2vec2_r, how='left', on=['group'])

    finetuned = pd.read_csv("./eval-rtl-whisper-small-results-v3.txt", sep=';', header=0)
    # Create dataframes for the components
    comp_p_finetuned = finetuned[finetuned['component'] == 'comp-p']
    comp_q_finetuned = finetuned[finetuned['component'] == 'comp-q']
    comp_p_finetuned = comp_p_finetuned.groupby(['group'])[['wer', 'cer']].mean()
    comp_q_finetuned = comp_q_finetuned.groupby(['group'])[['wer', 'cer']].mean()

    # Create graphs for the gradio interface
    wer_read = get_wer_chart(comp_q, comp_q_finetuned)
    cer_read = get_cer_chart(comp_q, comp_q_finetuned)
    wer_hmi = get_wer_chart(comp_p, comp_p_finetuned)
    cer_hmi = get_cer_chart(comp_p, comp_p_finetuned)
    gr.Markdown(
        """
# Whisper Dutch - RTL
This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the Common Voice 11.0 dataset.
It achieves the following results on the evaluation set:
- Loss: 0.1790
- Wer: 37.5081

## Model description
### Whisper
OpenAI has recently released a new speech recognition model called Whisper. Unlike DALL-E 2 and GPT-3, Whisper is a free and open-source model.
Whisper is an automatic speech recognition model trained on 680,000 hours of multilingual data collected from the web. According to OpenAI, the model is robust to accents, background noise and technical language.
In addition, it supports transcription in 99 different languages and translation from those languages into English. During this project we were aiming for the large-v2 model, but because of several issues, such as limited computing power, we had to work with the small model.
## Intended uses & limitations
### Server
For the sake of this tutorial, we will assume that you want to connect to the ml.hihva.nl server. There are other servers, like the Jupyter server, but these do not have enough processing power or the necessary access rights to finetune Whisper models.
#### Connecting
To connect to the server, you first need an account (username and password). You also need to be connected to the HvA Wi-Fi; if you are not present at the HvA, you can use the VPN via Ivanti. Then you can connect from your terminal by typing `ssh username@ml.hihva.nl`. After executing that command, you will be prompted for your password.
#### Note
It is important to note that the server is only powerful enough to finetune the small Whisper model.
### Recommendation
In the Hugging Face tutorial we followed, the Common Voice dataset is used, so we decided to experiment with it. The dataset dates back to 2017 and receives updates every year, which keeps the resource trustworthy. For future research, another dataset might be needed so the results of both datasets can be compared.
Remarkably, our finetuned model achieves better WER results after 1000 iterations than after 4000 iterations. This can be a sign that the model overfits at 4000 iterations, which is good to keep in mind.
After finetuning the small model, the transcribed text still contains hallucinations, which we could not resolve during the project. Hallucinations are repeated words in a sentence, and they increase the Word Error Rate (WER). We do not know why our model hallucinates, but we have some suggestions that you could try:
- Experiment with different training arguments (eval_steps, iterations) or pipeline arguments (chunk_length_s, stride_length_s); a hedged example follows below.
- A lack of computing power could be causing the hallucinations.
- The model could be overfitting because of too much data or too many iterations.

We tried to use a small subset of the Common Voice dataset to finetune the model, but that did not help to prevent hallucinations.
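As a minimal, hedged sketch of such an experiment (the audio file name and the argument values below are placeholders, not settings we validated):
```
# Hypothetical experiment with chunking arguments; file name and values are placeholders.
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")
text = pipe(
    "sample.wav",
    chunk_length_s=30,       # length of each audio chunk in seconds
    stride_length_s=(5, 5),  # overlap on both sides of each chunk
    generate_kwargs={"language": "<|nl|>", "task": "transcribe"},
)["text"]
```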
## Training and evaluation data
During our project we used two datasets: Common Voice to train our finetuned model and Jasmin to evaluate and test it.
### Common Voice
The Common Voice dataset is an open-source initiative by Mozilla, aimed at creating a publicly available dataset for training and developing speech recognition technologies. It was created to address the lack of diverse and openly accessible voice data, which can be a significant barrier in the development of speech recognition systems.
The Common Voice project encourages volunteers from around the world to contribute their voice by recording and submitting audio samples. Participants are asked to read aloud a set of prompted sentences, which helps create a diverse collection of voices, accents, languages, and speech patterns.
The dataset is designed to be multilingual and includes contributions in various languages. It aims to cover a wide range of accents, dialects, and speaking styles to improve the accuracy and inclusivity of speech recognition systems. By allowing individuals to freely contribute their voice, the project enables the collection of data from underrepresented populations and regions.
### Jasmin
The baselines for the subgroups in this model card come from the Jasmin research: native children between 7 and 11, native children between 12 and 16, non-native children between 7 and 16, non-native adults, and native adults above 65. Almost 100 hours of spoken text were collected to create the baseline.
## Training procedure
### Finetune process
For the entire finetune process we followed this blog: https://huggingface.co/blog/fine-tune-whisper
The blog contains a step-by-step tutorial for finetuning Whisper. To see our finetuning code, you can visit the following link: https://huggingface.co/hannatoenbreker/whisper-dutch-small/tree/main
#### Finetuning
Finetuning a model involves retraining it on a new dataset, using the existing model parameters as a starting point. By doing this, we can effectively "teach" the model to perform better on our specific task, while still benefiting from the pre-trained "knowledge" of the model.
##### Pipeline:
- Dataset loading and preprocessing: The code starts by loading the Common Voice dataset in Dutch. It then selects a subset of the data for training and testing. Unnecessary columns are removed to simplify the dataset. The dataset is then processed to generate input features using the WhisperFeatureExtractor and labels using the WhisperTokenizer.
- Data collator: The DataCollatorSpeechSeq2SeqWithPadding class pads the input features and the labels during training. This is necessary because batches of sequences in NLP tasks often have variable lengths, and we need to pad them to the same length to enable batch processing.
- Model initialization: The WhisperForConditionalGeneration model is loaded from a pre-trained checkpoint. This model is used for conditional generation, where the task is to generate a sequence of output tokens given a sequence of input tokens.
- Training arguments: The Seq2SeqTrainingArguments object specifies the parameters for training the model, such as the output directory, batch size, learning rate, max steps and evaluation strategy.
- Trainer initialization: The Seq2SeqTrainer is initialized with the previously defined training arguments, the model, the datasets, the data collator, and a method to compute metrics.
- Training: The trainer.train() method is called to start training the model.
- Pushing to the Hugging Face Model Hub: After training, the model is pushed to the Hugging Face model hub for others to use. The trainer.push_to_hub() method is called with some metadata about the model.
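The sketch below condenses the first of these steps. It is a hedged illustration based on the blog, not a verbatim copy of our notebook; the dataset slice and column names are assumptions.
```
# Hedged sketch of the dataset loading/preprocessing steps; sizes and names are illustrative.
from datasets import load_dataset, Audio
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Dutch", task="transcribe")

# Load a small Dutch slice of Common Voice 11.0 and resample the audio to 16 kHz
common_voice = load_dataset("mozilla-foundation/common_voice_11_0", "nl", split="train[:1%]")
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

def prepare(batch):
    audio = batch["audio"]
    # Log-Mel input features from the raw waveform
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    # Token ids of the target transcription
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

common_voice = common_voice.map(prepare, remove_columns=common_voice.column_names)
# The data collator, Seq2SeqTrainer setup and push_to_hub follow the blog and are omitted here.
```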
##### Adding the tokenizer
It is important to note that the tutorial does not include code that pushes the tokenizer to Hugging Face, which results in a tokenizer error. That is why it is important to include this line of code after defining the training arguments:
```
processor.save_pretrained(training_args.output_dir)
```
##### Pushing config to git
After finetuning the model, the code tries to push the model to Hugging Face. During this process an error can occur: `Error: "model-index[0].results[0].dataset.config" must be a string`. This can be solved by manually editing the README.md of the model. At the top of the file there is a config that can be edited: change the value of the key named "config", which can be found under model-index -> results -> dataset, so that it is a valid string.

We believe the core of the issue is in the kwargs argument (see picture below). We did not have time to solve the issue from our code, but we would suggest trying a different dataset argument to prevent the error beforehand.

#### Connect Hugging Face to Visual Studio Code
https://huggingface.co/docs/huggingface_hub/installation
https://huggingface.co/docs/hub/security-tokens
#### Our finetune process
To get some inspiration for the finetuning process, you can view the notebook that we used for finetuning here: [Finetuning notebook](https://huggingface.co/spaces/hannatoenbreker/whisper-dutch-small-gradio/blob/main/finetuningwhisper.ipynb)
#### Evaluation
To evaluate the model, you can use the shell script found here: /home2/rai_asr/eval/eval_whisper_rtl.sh. This script uses eval.py, which can be found here: /home2/rai_asr/eval/eval.py, to evaluate the model on the Jasmin dataset. To change the model that you want to evaluate, open eval.py and find the class "rtl_whisper_asr_model". This class has an init method and a predict method, which is used to evaluate an audio file with the model. To change the model used in this process, change the model in the pipeline, which can be found in the predict method, to the model that you want to evaluate.
##### Evaluation pipeline
The pipeline, which can be found in the predict method of the rtl_whisper_asr_model class, takes a few parameters.
**chunk_length_s** is an important one: without this parameter, the model only transcribes the first 30 seconds of the audio file instead of the whole file. When working with audio data, it is common to divide the waveform into smaller segments or chunks to process them efficiently; these chunks can be of fixed or variable length, depending on the requirements of the task. The value of this parameter has a (slight) impact on the WER, so we advise experimenting with it.
**device** defines the GPU that is used for transcribing. In our case we used GPU 0, because other students were using GPU 1.
**generate_kwargs**: if you do not add this parameter, the pipeline translates the transcribed text to English. It is therefore important to specify the language and the task of the pipeline.
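A hedged sketch of how these parameters could be combined in such a pipeline (the model id and audio path are placeholders; swap in the model you want to evaluate):
```
# Hypothetical evaluation pipeline; model id and audio path are placeholders.
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",  # replace with the model you want to evaluate
    chunk_length_s=30,             # transcribe the whole file, not just the first 30 seconds
    device=0,                      # GPU index used for transcription
)
text = asr(
    "path/to/audio.wav",
    generate_kwargs={"language": "<|nl|>", "task": "transcribe"},  # avoid translation to English
)["text"]
```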
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 1e-05
- train_batch_size: 16
- eval_batch_size: 16
- seed: 42
- gradient_accumulation_steps: 2
- total_train_batch_size: 32
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 500
- training_steps: 4000
- mixed_precision_training: Native AMP
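For reference, these hyperparameters map onto `Seq2SeqTrainingArguments` roughly as sketched below (the output directory and the evaluation settings are assumptions; the optimizer and learning-rate scheduler follow the trainer defaults):
```
# Hedged mapping of the listed hyperparameters onto Seq2SeqTrainingArguments.
# output_dir and the evaluation settings are placeholders.
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-nl",   # placeholder
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,     # effective train batch size of 32
    warmup_steps=500,
    max_steps=4000,
    seed=42,
    fp16=True,                         # mixed precision (native AMP)
    evaluation_strategy="steps",
    eval_steps=1000,
    predict_with_generate=True,
)
```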
### Training results
| Training Loss | Epoch | Step | Validation Loss | Wer     |
|:-------------:|:-----:|:----:|:---------------:|:-------:|
| 0.1238        | 0.78  | 1000 | 0.2017          | 19.8254 |
| 0.0548        | 1.56  | 2000 | 0.1829          | 35.4625 |
| 0.0259        | 2.34  | 3000 | 0.1795          | 43.1853 |
| 0.0131        | 3.12  | 4000 | 0.1790          | 37.5081 |
### Framework versions
- Transformers 4.28.1
- Pytorch 2.0.0+cu117
- Datasets 2.12.0
- Tokenizers 0.13.3
### Useful links
#### Our pages
https://huggingface.co/spaces/hannatoenbreker/whisper-dutch-small-gradio
https://huggingface.co/hannatoenbreker/whisper-dutch-small
#### Blog finetuning
https://huggingface.co/blog/fine-tune-whisper
#### Background information about Whisper
https://openai.com/research/whisper
https://github.com/openai/whisper
#### Hugging Face token + installation
https://huggingface.co/docs/huggingface_hub/installation
https://huggingface.co/docs/hub/security-tokens
"""
    )
    gr.Markdown(
        """
# Model card
On this page we visualize the results of our Whisper model (small-v1).
We measure the results with the Word Error Rate (WER) and the Character Error Rate (CER).
These values are compared with a baseline.
"""
    )
    gr.Markdown(
        """
### Subgroups
The subgroups are based on the age and language proficiency of the speaker. The subgroups are as follows:
- DC: native children aged 7-11
- DT: native children aged 12-16
- NNC: non-native children
- NNA: non-native adults
- DOA: native adults above 65
### Word Error Rate (WER)
The Word Error Rate (WER) is a metric that measures the performance of a speech recognition system. It is calculated by comparing the number of words that are incorrectly predicted by the model with the total number of words in the reference text.
### Character Error Rate (CER)
The Character Error Rate (CER) is also a metric that measures the performance of a speech recognition system. It is calculated by comparing the number of characters that are incorrectly predicted by the model with the total number of characters in the reference text.
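A small, hedged example of how these metrics can be computed with the `evaluate` library (the sentences are made up; our evaluation scripts may normalize the text differently):
```
# Hedged example: computing WER and CER for a single prediction/reference pair.
import evaluate

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

references = ["de kat zit op de mat"]
predictions = ["de kat zat op mat"]

print("WER:", wer_metric.compute(predictions=predictions, references=references))
print("CER:", cer_metric.compute(predictions=predictions, references=references))
```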
## Baseline information
The baseline in this model card is derived from a study in which the average score of the baseline was established. This score serves as a reference point for evaluating the performance of our model. In the bar charts, the baseline is shown per subgroup and indicates the average WER for that subgroup. By comparing the WER of the model with the baseline, we can understand how well our model performs compared to the established standard.
### References
Quantifying Bias in Automatic Speech Recognition, Siyuan Feng, Olya Kudina, Bence Mark Halpern and Odette Scharenborg, link: https://drive.google.com/file/d/18Y60qr4SAX21-kdKiyg4QxbWaUtsJ4gR/view?usp=sharing
"""
    )
    with gr.Tabs():
        with gr.TabItem("Read"):
            gr.Plot(wer_read)
            gr.Plot(cer_read)
        with gr.TabItem("HMI"):
            gr.Plot(wer_hmi)
            gr.Plot(cer_hmi)
    # gr.HTML expects a string, so join the lines read from code.html
    gr.HTML("".join(lines))

demo.launch()