# Plot bar charts and place them in the Gradio interface
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import multiprocessing
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import load_dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline,
)
with open("./code.html") as f:
    lines = f.readlines()

model = WhisperForConditionalGeneration.from_pretrained("hannatoenbreker/whisper-dutch")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Dutch", task="transcribe")
pipe = pipeline(model="openai/whisper-small")

def transcribe(audio):
    text = pipe(audio, generate_kwargs={"language": "<|nl|>", "task": "transcribe"})["text"]
    return text
def get_wer_chart(results, finetuned):
    index = [1, 2, 3, 4, 5]
    # Set the figure size
    wer = plt.figure(figsize=(10, 5))
    # Set the same y-axis scale for all charts
    #plt.ylim(0, 0.7)
    # Set the width of each bar
    bar_width = 0.2
    # Position of the bars on the x-axis
    r1 = np.arange(len(index))
    r2 = r1 + bar_width
    r3 = r2 + bar_width
    r4 = r3 + bar_width
    # Set the bar charts
    plt.bar(r1, finetuned['wer'], color='orange', width=bar_width)
    plt.bar(r2, results['wer'], color='blue', width=bar_width)
    plt.bar(r3, results['wav2vec2'], color='grey', width=bar_width)
    plt.bar(r4, results['baseline'], color='green', width=bar_width)
    # Set the labels
    plt.xlabel('Subgroup')
    plt.ylabel('WER')
    plt.title('WER per subgroup')
    # Set the x-axis ticks
    plt.xticks(r2 + bar_width, ['DC', 'DT', 'NNC', 'NNA', 'DOA'])
    # Set the legend
    plt.legend(['Finetuned model', 'Not finetuned model', 'wav2vec2', 'Baseline'])
    return wer
def get_cer_chart(results, finetuned):
    index = [1, 2, 3, 4, 5]
    # Set the figure size
    cer = plt.figure(figsize=(10, 5))
    # Set the same y-axis scale for all charts
    #plt.ylim(0, 0.7)
    # Set the width of each bar
    bar_width = 0.3
    # Position of the bars on the x-axis
    r1 = np.arange(len(index))
    r2 = r1 + bar_width
    # Set value on top of the bar
    #for index, value in enumerate(results['cer']):
    #    plt.text(index - 0.12, value + 0.01, str(round(value, 3)))
    # Set the bar chart
    plt.bar(r1, finetuned['cer'], color='orange', width=bar_width)
    plt.bar(r2, results['cer'], color='blue', width=bar_width)
    # Set the labels
    plt.xlabel('Subgroup')
    plt.ylabel('CER')
    plt.title('CER per subgroup')
    # Set the x-axis ticks
    plt.xticks(r1 + bar_width/2, ['DC', 'DT', 'NNC', 'NNA', 'DOA'])
    # Set the legend
    plt.legend(['Whisper small finetuned', 'Whisper small v1'])
    return cer
with gr.Blocks() as demo:
    data = pd.read_csv("./eval-whisper-small-results-v1.txt", sep=';', header=0)

    # SUBGROUPS:
    # GROUP 1: native children aged 7-11 (DC)
    # GROUP 2: native children aged 12-16 (DT)
    # GROUP 3: non-native children (NNC)
    # GROUP 4: non-native adults (NNA)
    # GROUP 5: native adults above 65 (DOA)
    baseline_read = [[1, 0.353], [2, 0.184], [3, 0.551], [4, 0.569], [5, 0.242]]
    baseline_hmi = [[1, 0.434], [2, 0.353], [3, 0.616], [4, 0.613], [5, 0.395]]
    # Set wav2vec2 results per subgroup
    wav2vec2_read = [[1, 0.188], [2, 0.120], [3, 0.303], [4, 0.332], [5, 0.123]]
    wav2vec2_hmi = [[1, 0.312], [2, 0.250], [3, 0.475], [4, 0.501], [5, 0.307]]
    # Create dataframes for the bar charts
    wav2vec2_r = pd.DataFrame(wav2vec2_read, columns=['group', 'wav2vec2'])
    wav2vec2_h = pd.DataFrame(wav2vec2_hmi, columns=['group', 'wav2vec2'])
    baseline_r = pd.DataFrame(baseline_read, columns=['group', 'baseline'])
    baseline_h = pd.DataFrame(baseline_hmi, columns=['group', 'baseline'])
    # Create dataframes for the components
    comp_p = data[data['component'] == 'comp-p']
    comp_q = data[data['component'] == 'comp-q']
    comp_p = comp_p.groupby(['group'])[['wer', 'cer']].mean()
    comp_q = comp_q.groupby(['group'])[['wer', 'cer']].mean()
    comp_p = comp_p.merge(baseline_h, how='left', on=['group'])
    comp_q = comp_q.merge(baseline_r, how='left', on=['group'])
    comp_p = comp_p.merge(wav2vec2_h, how='left', on=['group'])
    comp_q = comp_q.merge(wav2vec2_r, how='left', on=['group'])

    finetuned = pd.read_csv("./eval-rtl-whisper-small-results-v3.txt", sep=';', header=0)
    # Create dataframes for the components
    comp_p_finetuned = finetuned[finetuned['component'] == 'comp-p']
    comp_q_finetuned = finetuned[finetuned['component'] == 'comp-q']
    comp_p_finetuned = comp_p_finetuned.groupby(['group'])[['wer', 'cer']].mean()
    comp_q_finetuned = comp_q_finetuned.groupby(['group'])[['wer', 'cer']].mean()

    # Create graphs for the gradio interface
    wer_read = get_wer_chart(comp_q, comp_q_finetuned)
    cer_read = get_cer_chart(comp_q, comp_q_finetuned)
    wer_hmi = get_wer_chart(comp_p, comp_p_finetuned)
    cer_hmi = get_cer_chart(comp_p, comp_p_finetuned)
    gr.Markdown(
        """
# Whisper Dutch - RTL
This model is a fine-tuned version of [openai/whisper-small](https://huggingface.co/openai/whisper-small) on the Common Voice 11.0 dataset.
It achieves the following results on the evaluation set:
- Loss: 0.1790
- Wer: 37.5081

## Model description
### Whisper
OpenAI has recently released a new speech recognition model called Whisper. Unlike DALL-E 2 and GPT-3, Whisper is a free and open-source model.
Whisper is an automatic speech recognition model trained on 680,000 hours of multilingual data collected from the web. According to OpenAI, the model is robust to accents, background noise and technical language.
In addition, it supports transcription in 99 different languages and translation from those languages into English. During this project we were aiming for the large-v2 model, but because of several issues, such as limited computing power, we had to work with the small model.
## Intended uses & limitations
### Server
For the sake of this tutorial, we will assume that you want to connect to the ml.hihva.nl server. There are other servers, like the Jupyter server, but these do not have enough processing power or the necessary access rights to finetune Whisper models.
#### Connecting
To connect to the server, you first need an account (username and password). You also need to be connected to the HvA Wi-Fi; if you are not present at the HvA, you can use the VPN via Ivanti. Then you can connect from your terminal by typing `ssh username@ml.hihva.nl`. After executing that command, you will be prompted for your password.
#### Note
It is important to note that the server is only powerful enough to finetune the small Whisper model.
### Recommendation
In the Hugging Face tutorial we followed, the Common Voice dataset is used, so we decided to experiment with it. The dataset dates back to 2017 and receives updates every year, which keeps the resource trustworthy. For future research, another dataset might be needed so the results of both datasets can be compared.
Remarkably, our finetuned model achieves better WER results after 1000 iterations than after 4000 iterations. This can be a sign that the model overfits at 4000 iterations, which is good to keep in mind.
After finetuning the small model, the transcribed text still contains hallucinations, which we could not resolve during the project. Hallucinations are repeated words in a sentence, and they increase the Word Error Rate (WER). We do not know why our model hallucinates, but we have some suggestions that you could try:
- Experiment with different training arguments (eval_steps, iterations) or pipeline arguments (chunk_length_s, stride_length_s); a hedged example follows below.
- A lack of computing power could be causing the hallucinations.
- The model could be overfitting because of too much data or too many iterations.

We tried to use a small subset of the Common Voice dataset to finetune the model, but that did not help to prevent hallucinations.
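As a minimal, hedged sketch of such an experiment (the audio file name and the argument values below are placeholders, not settings we validated):
```
# Hypothetical experiment with chunking arguments; file name and values are placeholders.
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")
text = pipe(
    "sample.wav",
    chunk_length_s=30,       # length of each audio chunk in seconds
    stride_length_s=(5, 5),  # overlap on both sides of each chunk
    generate_kwargs={"language": "<|nl|>", "task": "transcribe"},
)["text"]
```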
## Training and evaluation data
During our project we used two datasets: Common Voice to train our finetuned model and Jasmin to evaluate and test it.
### Common Voice
The Common Voice dataset is an open-source initiative by Mozilla, aimed at creating a publicly available dataset for training and developing speech recognition technologies. It was created to address the lack of diverse and openly accessible voice data, which can be a significant barrier in the development of speech recognition systems.
The Common Voice project encourages volunteers from around the world to contribute their voice by recording and submitting audio samples. Participants are asked to read aloud a set of prompted sentences, which helps create a diverse collection of voices, accents, languages, and speech patterns.
The dataset is designed to be multilingual and includes contributions in various languages. It aims to cover a wide range of accents, dialects, and speaking styles to improve the accuracy and inclusivity of speech recognition systems. By allowing individuals to freely contribute their voice, the project enables the collection of data from underrepresented populations and regions.
### Jasmin
The baselines for the subgroups in this model card come from the Jasmin research: native children between 7 and 11, native children between 12 and 16, non-native children between 7 and 16, non-native adults, and native adults above 65. Almost 100 hours of spoken text were collected to create the baseline.
## Training procedure
### Finetune process
For the entire finetune process we followed this blog: https://huggingface.co/blog/fine-tune-whisper
The blog contains a step-by-step tutorial for finetuning Whisper. To see our finetuning code, you can visit the following link: https://huggingface.co/hannatoenbreker/whisper-dutch-small/tree/main
#### Finetuning
Finetuning a model involves retraining it on a new dataset, using the existing model parameters as a starting point. By doing this, we can effectively "teach" the model to perform better on our specific task, while still benefiting from the pre-trained "knowledge" of the model.
##### Pipeline:
- Dataset loading and preprocessing: The code starts by loading the Common Voice dataset in Dutch. It then selects a subset of the data for training and testing. Unnecessary columns are removed to simplify the dataset. The dataset is then processed to generate input features using the WhisperFeatureExtractor and labels using the WhisperTokenizer.
- Data collator: The DataCollatorSpeechSeq2SeqWithPadding class pads the input features and the labels during training. This is necessary because batches of sequences in NLP tasks often have variable lengths, and we need to pad them to the same length to enable batch processing.
- Model initialization: The WhisperForConditionalGeneration model is loaded from a pre-trained checkpoint. This model is used for conditional generation, where the task is to generate a sequence of output tokens given a sequence of input tokens.
- Training arguments: The Seq2SeqTrainingArguments object specifies the parameters for training the model, such as the output directory, batch size, learning rate, max steps and evaluation strategy.
- Trainer initialization: The Seq2SeqTrainer is initialized with the previously defined training arguments, the model, the datasets, the data collator, and a method to compute metrics.
- Training: The trainer.train() method is called to start training the model.
- Pushing to the Hugging Face Model Hub: After training, the model is pushed to the Hugging Face model hub for others to use. The trainer.push_to_hub() method is called with some metadata about the model.
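The sketch below condenses the first of these steps. It is a hedged illustration based on the blog, not a verbatim copy of our notebook; the dataset slice and column names are assumptions.
```
# Hedged sketch of the dataset loading/preprocessing steps; sizes and names are illustrative.
from datasets import load_dataset, Audio
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Dutch", task="transcribe")

# Load a small Dutch slice of Common Voice 11.0 and resample the audio to 16 kHz
common_voice = load_dataset("mozilla-foundation/common_voice_11_0", "nl", split="train[:1%]")
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

def prepare(batch):
    audio = batch["audio"]
    # Log-Mel input features from the raw waveform
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]
    # Token ids of the target transcription
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

common_voice = common_voice.map(prepare, remove_columns=common_voice.column_names)
# The data collator, Seq2SeqTrainer setup and push_to_hub follow the blog and are omitted here.
```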
##### Adding the tokenizer
It is important to note that the tutorial does not include code that pushes the tokenizer to Hugging Face, which results in a tokenizer error. That is why it is important to include this line of code after defining the training arguments:
```
processor.save_pretrained(training_args.output_dir)
```
##### Pushing config to git
After finetuning the model, the code tries to push the model to Hugging Face. During this process an error can occur: `Error: "model-index[0].results[0].dataset.config" must be a string`. This can be solved by manually editing the README.md of the model. At the top of the file there is a config that can be edited: change the value of the key named "config", which can be found under model-index -> results -> dataset, so that it is a valid string.

We believe the core of the issue is in the kwargs argument (see picture below). We did not have time to solve the issue from our code, but we would suggest trying a different dataset argument to prevent the error beforehand.

#### Connect Hugging Face to Visual Studio Code
https://huggingface.co/docs/huggingface_hub/installation
https://huggingface.co/docs/hub/security-tokens
#### Our finetune process
To get some inspiration for the finetuning process, you can view the notebook that we used for finetuning here: [Finetuning notebook](https://huggingface.co/spaces/hannatoenbreker/whisper-dutch-small-gradio/blob/main/finetuningwhisper.ipynb)
#### Evaluation
To evaluate the model, you can use the shell script found here: /home2/rai_asr/eval/eval_whisper_rtl.sh. This script uses eval.py, which can be found here: /home2/rai_asr/eval/eval.py, to evaluate the model on the Jasmin dataset. To change the model that you want to evaluate, open eval.py and find the class "rtl_whisper_asr_model". This class has an init method and a predict method, which is used to evaluate an audio file with the model. To change the model used in this process, change the model in the pipeline, which can be found in the predict method, to the model that you want to evaluate.
##### Evaluation pipeline
The pipeline, which can be found in the predict method of the rtl_whisper_asr_model class, takes a few parameters.
**chunk_length_s** is an important one: without this parameter, the model only transcribes the first 30 seconds of the audio file instead of the whole file. When working with audio data, it is common to divide the waveform into smaller segments or chunks to process them efficiently; these chunks can be of fixed or variable length, depending on the requirements of the task. The value of this parameter has a (slight) impact on the WER, so we advise experimenting with it.
**device** defines the GPU that is used for transcribing. In our case we used GPU 0, because other students were using GPU 1.
**generate_kwargs**: if you do not add this parameter, the pipeline translates the transcribed text to English. It is therefore important to specify the language and the task of the pipeline.
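A hedged sketch of how these parameters could be combined in such a pipeline (the model id and audio path are placeholders; swap in the model you want to evaluate):
```
# Hypothetical evaluation pipeline; model id and audio path are placeholders.
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",  # replace with the model you want to evaluate
    chunk_length_s=30,             # transcribe the whole file, not just the first 30 seconds
    device=0,                      # GPU index used for transcription
)
text = asr(
    "path/to/audio.wav",
    generate_kwargs={"language": "<|nl|>", "task": "transcribe"},  # avoid translation to English
)["text"]
```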
### Training hyperparameters
The following hyperparameters were used during training:
- learning_rate: 1e-05
- train_batch_size: 16
- eval_batch_size: 16
- seed: 42
- gradient_accumulation_steps: 2
- total_train_batch_size: 32
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- lr_scheduler_warmup_steps: 500
- training_steps: 4000
- mixed_precision_training: Native AMP
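For reference, these hyperparameters map onto `Seq2SeqTrainingArguments` roughly as sketched below (the output directory and the evaluation settings are assumptions; the optimizer and learning-rate scheduler follow the trainer defaults):
```
# Hedged mapping of the listed hyperparameters onto Seq2SeqTrainingArguments.
# output_dir and the evaluation settings are placeholders.
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-nl",   # placeholder
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,     # effective train batch size of 32
    warmup_steps=500,
    max_steps=4000,
    seed=42,
    fp16=True,                         # mixed precision (native AMP)
    evaluation_strategy="steps",
    eval_steps=1000,
    predict_with_generate=True,
)
```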
### Training results
| Training Loss | Epoch | Step | Validation Loss | Wer     |
|:-------------:|:-----:|:----:|:---------------:|:-------:|
| 0.1238        | 0.78  | 1000 | 0.2017          | 19.8254 |
| 0.0548        | 1.56  | 2000 | 0.1829          | 35.4625 |
| 0.0259        | 2.34  | 3000 | 0.1795          | 43.1853 |
| 0.0131        | 3.12  | 4000 | 0.1790          | 37.5081 |
### Framework versions
- Transformers 4.28.1
- Pytorch 2.0.0+cu117
- Datasets 2.12.0
- Tokenizers 0.13.3
### Useful links
#### Our pages
https://huggingface.co/spaces/hannatoenbreker/whisper-dutch-small-gradio
https://huggingface.co/hannatoenbreker/whisper-dutch-small
#### Blog finetuning
https://huggingface.co/blog/fine-tune-whisper
#### Background information about Whisper
https://openai.com/research/whisper
https://github.com/openai/whisper
#### Hugging Face token + installation
https://huggingface.co/docs/huggingface_hub/installation
https://huggingface.co/docs/hub/security-tokens
"""
    )
    gr.Markdown(
        """
# Model card
On this page we visualize the results of our Whisper model (small-v1).
We measure the results with the Word Error Rate (WER) and the Character Error Rate (CER).
These values are compared with a baseline.
"""
    )
    gr.Markdown(
        """
### Subgroups
The subgroups are based on the age and language proficiency of the speaker. The subgroups are as follows:
- DC: native children aged 7-11
- DT: native children aged 12-16
- NNC: non-native children
- NNA: non-native adults
- DOA: native adults above 65
### Word Error Rate (WER)
The Word Error Rate (WER) is a metric that measures the performance of a speech recognition system. It is calculated by comparing the number of words that are incorrectly predicted by the model with the total number of words in the reference text.
### Character Error Rate (CER)
The Character Error Rate (CER) is also a metric that measures the performance of a speech recognition system. It is calculated by comparing the number of characters that are incorrectly predicted by the model with the total number of characters in the reference text.
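A small, hedged example of how these metrics can be computed with the `evaluate` library (the sentences are made up; our evaluation scripts may normalize the text differently):
```
# Hedged example: computing WER and CER for a single prediction/reference pair.
import evaluate

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

references = ["de kat zit op de mat"]
predictions = ["de kat zat op mat"]

print("WER:", wer_metric.compute(predictions=predictions, references=references))
print("CER:", cer_metric.compute(predictions=predictions, references=references))
```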
## Baseline information
The baseline in this model card is derived from a study in which the average score of the baseline was established. This score serves as a reference point for evaluating the performance of our model. In the bar charts, the baseline is shown per subgroup and indicates the average WER for that subgroup. By comparing the WER of the model with the baseline, we can understand how well our model performs compared to the established standard.
### References
Quantifying Bias in Automatic Speech Recognition, Siyuan Feng, Olya Kudina, Bence Mark Halpern and Odette Scharenborg, link: https://drive.google.com/file/d/18Y60qr4SAX21-kdKiyg4QxbWaUtsJ4gR/view?usp=sharing
"""
    )
    with gr.Tabs():
        with gr.TabItem("Read"):
            gr.Plot(wer_read)
            gr.Plot(cer_read)
        with gr.TabItem("HMI"):
            gr.Plot(wer_hmi)
            gr.Plot(cer_hmi)
    # gr.HTML expects a string, so join the lines read from code.html
    gr.HTML("".join(lines))

demo.launch()