# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import contextlib
from argparse import Namespace
from typing import Any, Optional

import torch
import torch.nn as nn
from dataclasses import dataclass, field

from fairseq import checkpoint_utils, tasks, utils
from fairseq.dataclass import FairseqDataclass
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
from fairseq.models import BaseFairseqModel, FairseqEncoder, register_model
from fairseq.models.hubert.hubert import MASKING_DISTRIBUTION_CHOICES
from fairseq.models.transformer import TransformerConfig
from fairseq.tasks import FairseqTask
from omegaconf import II, MISSING

from .hubert_asr import HubertAsrConfig

logger = logging.getLogger(__name__)


@dataclass
class HubertMTConfig(HubertAsrConfig):
    load_pretrained_mbart_from: Optional[str] = field(
        default=None,
        metadata={
            "help": "model to take text encoder/decoder weights from (for initialization)"
        },
    )
    use_rel_pos_enc: bool = field(
        default=True,
        metadata={"help": "whether to use relative positional encoding"},
    )
    text_transformer_encoder_layers: int = field(
        default=12,
        metadata={"help": "override the number of text transformer encoder layers"},
    )


@register_model("hubert_mt", dataclass=HubertMTConfig)
class HubertMT(BaseFairseqModel):
    """Fine-tuning wrapper (registered as "hubert_mt") around a pre-trained
    speech/text encoder-decoder model exposed through ``HubertEncoder``."""

    def __init__(self, cfg: HubertMTConfig, w2v_encoder: BaseFairseqModel):
        super().__init__()
        self.cfg = cfg
        self.w2v_encoder = w2v_encoder

    def upgrade_state_dict_named(self, state_dict, name):
        super().upgrade_state_dict_named(state_dict, name)
        return state_dict

    @classmethod
    def build_model(cls, cfg: HubertMTConfig, task: FairseqTask):
        """Build a new model instance."""
        w2v_encoder = HubertEncoder(cfg, task.target_dictionary)
        return cls(cfg, w2v_encoder)

    def get_normalized_probs(self, net_output, log_probs, sample=None):
        """Get normalized probabilities (or log probs) from a net's output."""
        if "decoder_out" in net_output:
            return self.w2v_encoder.get_normalized_probs_decoder(
                net_output["decoder_out"], log_probs, sample
            )
        # otherwise net_output is already the raw decoder output
        assert "encoder_out" not in net_output
        return self.w2v_encoder.get_normalized_probs_decoder(net_output, log_probs, sample)

    def get_logits(self, net_output):
        logits = net_output["encoder_out"]
        padding = net_output["encoder_padding_mask"]
        if padding is not None and padding.any():
            # Force padded timesteps to predict index 0 (the blank symbol) by
            # assigning in place; the previous chained advanced indexing
            # (logits[padding][..., 0] = 0) only modified a temporary copy.
            masking_tensor = logits.new_full((logits.size(-1),), float("-inf"))
            masking_tensor[0] = 0
            logits[padding.T] = masking_tensor
        return logits

    def forward(self, **kwargs):
        x = self.w2v_encoder(**kwargs)
        return x

    @property
    def encoder(self):
        return self.w2v_encoder

    def reorder_encoder_out(self, encoder_out, new_order):
        return self.encoder.reorder_encoder_out(encoder_out, new_order)

    @property
    def decoder(self):
        return self.w2v_encoder.w2v_model.decoder


class HubertEncoder(FairseqEncoder):
    """Rebuilds the pre-trained model described by ``cfg.w2v_path`` /
    ``cfg.w2v_args`` and exposes it through the ``FairseqEncoder`` interface."""

    def __init__(self, cfg: HubertMTConfig, tgt_dict=None):
        self.apply_mask = cfg.apply_mask

        # Overrides applied to the pre-trained checkpoint's config before the
        # model is rebuilt for fine-tuning.
        arg_overrides = {
            "dropout": cfg.dropout,
            "activation_dropout": cfg.activation_dropout,
            "dropout_input": cfg.dropout_input,
            "attention_dropout": cfg.attention_dropout,
            "mask_length": cfg.mask_length,
            "mask_prob": cfg.mask_prob,
            "mask_selection": cfg.mask_selection,
            "mask_other": cfg.mask_other,
            "no_mask_overlap": cfg.no_mask_overlap,
            "mask_channel_length": cfg.mask_channel_length,
            "mask_channel_prob": cfg.mask_channel_prob,
            "mask_channel_selection": cfg.mask_channel_selection,
            "mask_channel_other": cfg.mask_channel_other,
            "no_mask_channel_overlap": cfg.no_mask_channel_overlap,
            "encoder_layerdrop": cfg.layerdrop,
            "decoder_layerdrop": cfg.decoder_layerdrop,
            "feature_grad_mult": cfg.feature_grad_mult,
            "decoder_dict_size": -1,
            "add_text_modality": True,
            "add_text_encoder": True,
            "load_pretrained_mbart_from": None,
            "load_pretrained_w2v_from": None,
            "text_transformer": {
                "encoder": {
                    "layers": cfg.text_transformer_encoder_layers,
                    "layerdrop": cfg.layerdrop,
                },
                "dropout": cfg.dropout,
                "attention_dropout": cfg.attention_dropout,
                "activation_dropout": cfg.activation_dropout,
            },
        }
        if cfg.no_pretrained_weights:
            arg_overrides["use_rel_pos_enc"] = cfg.use_rel_pos_enc

        if cfg.w2v_args is None:
            state = checkpoint_utils.load_checkpoint_to_cpu(cfg.w2v_path, arg_overrides)
            w2v_args = state.get("cfg", None)
            if w2v_args is None:
                w2v_args = convert_namespace_to_omegaconf(state["args"])
            cfg.w2v_args = w2v_args
        else:
            state = None
            w2v_args = cfg.w2v_args
            if isinstance(w2v_args, Namespace):
                cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(w2v_args)

        # debug: uncomment to inspect the loaded checkpoint and its config
        # logger.info(state.keys())
        # logger.info(w2v_args.task)
        # logger.info(w2v_args.model)

        w2v_args.task.data = cfg.data
        w2v_args.task.text_cfg.text_data = cfg.data
        w2v_args.task.text_cfg.data_config = None
        task = tasks.setup_task(w2v_args.task)
        if state is not None and "task_state" in state:
            # This will load the stored "dictionaries" object
            task.load_state_dict(state["task_state"])
        model = task.build_model(w2v_args.model)

        ### load mbart if specified
        if cfg.load_pretrained_mbart_from is not None and cfg.no_pretrained_weights:
            logger.info("Loading mbart....")
            mbart_model_state = model.load_checkpoint(cfg.load_pretrained_mbart_from)
            model.text_encoder = model.load_pretrained_component_from_model(
                component=model.text_encoder, state=mbart_model_state
            )
            model.decoder = model.load_pretrained_component_from_model(
                component=model.decoder, state=mbart_model_state
            )

        if state is not None and not cfg.no_pretrained_weights:
            logger.info("Loading pre-trained models....")
            model.load_state_dict(state["model"], strict=True)

        ### remove pre-training modules that are not needed for fine-tuning
        model.remove_pretraining_modules()
        model.target_glu = None
        model.final_proj = None
        model.feature_extractor = None
        model.post_extract_proj = None
        model.encoder = None

        dropout_keys = [n for n in w2v_args.model.text_transformer if n.find("drop") >= 0]
        for key in dropout_keys:
            logger.info(f"{key}: {w2v_args.model.text_transformer[key]}")

        super().__init__(task.source_dictionary)

        d = w2v_args.model.encoder_embed_dim

        self.w2v_model = model
        self.final_dropout = nn.Dropout(cfg.final_dropout)
        self.freeze_finetune_updates = cfg.freeze_finetune_updates
        self.freeze_decoder_updates = cfg.freeze_decoder_updates
        self.num_updates = 0

    def set_num_updates(self, num_updates):
        """Set the number of parameter updates."""
        super().set_num_updates(num_updates)
        self.num_updates = num_updates

    def forward(self, src_tokens, src_lengths, prev_output_tokens, tbc=True, **kwargs):
        # ft = self.freeze_finetune_updates <= self.num_updates
        w2v_args = {
            "src_tokens": src_tokens,
            "src_lengths": src_lengths,
            "mask": self.apply_mask and self.training,
            "prev_output_tokens": prev_output_tokens,
        }
        results = self.w2v_model(**w2v_args)
        return results
    def get_normalized_probs_decoder(self, net_output, log_probs, sample=None):
        # net_output['encoder_out'] is a (B, T, D) tensor
        return self.w2v_model.get_normalized_probs(net_output, log_probs, sample)

    def reorder_encoder_out(self, encoder_out, new_order):
        if encoder_out["encoder_out"] is not None:
            if isinstance(encoder_out["encoder_out"], list):
                encoder_out["encoder_out"] = (
                    []
                    if len(encoder_out["encoder_out"]) == 0
                    else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]]
                )
            else:
                encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
                    1, new_order
                )
        if encoder_out["encoder_padding_mask"] is not None:
            if isinstance(encoder_out["encoder_padding_mask"], list):
                encoder_out["encoder_padding_mask"] = (
                    []
                    if len(encoder_out["encoder_padding_mask"]) == 0
                    else [
                        x.index_select(0, new_order)
                        for x in encoder_out["encoder_padding_mask"]
                    ]
                )
            else:
                encoder_out["encoder_padding_mask"] = encoder_out[
                    "encoder_padding_mask"
                ].index_select(0, new_order)
        if "decoder_out" in encoder_out and encoder_out["decoder_out"] is not None:
            if isinstance(encoder_out["decoder_out"], list):
                encoder_out["decoder_out"] = (
                    []
                    if len(encoder_out["decoder_out"]) == 0
                    else [x.index_select(0, new_order) for x in encoder_out["decoder_out"]]
                )
            else:
                encoder_out["decoder_out"] = encoder_out["decoder_out"].index_select(
                    0, new_order
                )
        if (
            "encoder_out_for_ctc" in encoder_out
            and encoder_out["encoder_out_for_ctc"] is not None
        ):
            if isinstance(encoder_out["encoder_out_for_ctc"], list):
                encoder_out["encoder_out_for_ctc"] = (
                    []
                    if len(encoder_out["encoder_out_for_ctc"]) == 0
                    else [
                        x.index_select(1, new_order)
                        for x in encoder_out["encoder_out_for_ctc"]
                    ]
                )
            else:
                encoder_out["encoder_out_for_ctc"] = encoder_out[
                    "encoder_out_for_ctc"
                ].index_select(1, new_order)
        return encoder_out

    def forward_torchscript(self, net_input):
        """A TorchScript-compatible version of forward.

        Encoders which use additional arguments may want to override
        this method for TorchScript compatibility.
        """
        encoder_out = self.w2v_model.forward_torchscript(net_input)
        if "encoder_out_for_ctc" in encoder_out:
            del encoder_out["encoder_out_for_ctc"]
        return encoder_out

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return None

    def upgrade_state_dict_named(self, state_dict, name):
        return state_dict


def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
    nn.init.constant_(m.weight[padding_idx], 0)
    return m


def Linear(in_features, out_features, bias=True):
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.0)
    return m