Spaces:

ASLP-lab
/

OSUM-EChat

Running on Zero

File size: 8,849 Bytes

841f290

# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import datetime
import logging
import sys
from contextlib import nullcontext

# if your python version < 3.7 use the below one
# from contextlib import suppress as nullcontext
import torch
from wenet.utils.common import StepTimer

from wenet.utils.train_utils import (wenet_join, batch_forward, batch_backward,
                                     update_parameter_and_lr, log_per_step,
                                     save_model)

import os
from gxl_ai_utils.utils import utils_file
class Executor:
    """Run training epochs and cross-validation passes for a model.

    The executor keeps the global step counter (``self.step``) across calls
    and lazily creates the ``StepTimer`` objects that are handed to
    ``log_per_step``.
    """

    def __init__(self,
                 global_step: int = 0,
                 device: torch.device = torch.device("cpu")):
        """Create an executor.

        Args:
            global_step: Step count to resume from. The internal counter
                starts at ``global_step + 1``.
            device: Device forwarded to ``batch_forward``.
        """
        self.step = global_step + 1
        self.train_step_timer = None
        self.cv_step_timer = None
        self.device = device

    @staticmethod
    def _sync_ranks():
        """Barrier across ranks; a no-op when no process group is initialised."""
        import torch.distributed as dist
        if dist.is_available() and dist.is_initialized():
            dist.barrier()

    def train(self, model, optimizer, scheduler, train_data_loader,
              cv_data_loader, writer, configs, scaler, group_join):
        """Train ``model`` for one epoch over ``train_data_loader``.

        Args:
            model: Module to train. When it is a ``DistributedDataParallel``
                instance the epoch runs inside ``model.join()``.
            optimizer: Optimizer passed to ``update_parameter_and_lr``; its
                param-group learning rates are recorded at save time.
            scheduler: LR scheduler passed to ``update_parameter_and_lr``.
            train_data_loader: Iterable of batch dicts. Each batch must
                provide a ``"target_lengths"`` tensor; batches whose first
                dimension is 0 are skipped.
            cv_data_loader: Unused (step-mode CV is currently disabled).
            writer: Passed to ``log_per_step``.
            configs: Training configuration. Must contain ``"accum_grad"``.
                The ``"init_infos"`` key, if present, is removed from this
                dict in place; everything else is deep-copied.
            scaler: Passed to ``batch_forward``, ``batch_backward`` and
                ``update_parameter_and_lr``.
            group_join: Unused (the ``wenet_join`` check is currently
                disabled).
        """
        if self.train_step_timer is None:
            self.train_step_timer = StepTimer(self.step)
        model.train()

        # Resume info is popped from the caller's dict so it is neither
        # deep-copied into info_dict nor visible to later calls that reuse
        # the same configs object.
        init_infos = configs.pop("init_infos", {})
        init_batch_num = init_infos.get("batch_idx", 0)
        info_dict = copy.deepcopy(configs)
        logging.info('using accumulate grad, new batch size is {} times'
                     ' larger than before'.format(info_dict['accum_grad']))

        # A context manager to be used in conjunction with an instance of
        # torch.nn.parallel.DistributedDataParallel to be able to train
        # with uneven inputs across participating processes.
        if isinstance(model, torch.nn.parallel.DistributedDataParallel):
            model_context = model.join
        else:
            model_context = nullcontext

        # NOTE: skipping the first `init_batch_num` batches on resume (the
        # `continue_data` dataset option) is currently disabled; the value
        # is only logged.
        # NOTE: the `wenet_join(group_join, info_dict)` early-exit check is
        # currently disabled as well.
        # NOTE(debug history): training was observed to hang around specific
        # batch indices (e.g. near 2400+3572; with two or three machines and
        # a changed order, near 2400+3472+2840 and 2400+3472+268). A
        # since-removed experiment skipped the leading batches to get past
        # those positions.
        with model_context():
            logging.info(f'init_batch_num: {init_batch_num}')
            for batch_idx, batch_dict in enumerate(train_data_loader):
                info_dict["tag"] = "TRAIN"
                info_dict["step"] = self.step
                info_dict["batch_idx"] = batch_idx

                if batch_dict["target_lengths"].size(0) == 0:
                    continue

                # True on the batch that completes an accumulation window,
                # i.e. the one on which the global step advances.
                at_accum_boundary = (
                    (batch_idx + 1) % info_dict["accum_grad"] == 0)
                syncs_gradients = info_dict.get(
                    "train_engine", "torch_ddp") in ("torch_ddp",
                                                     "torch_fsdp")

                if syncs_gradients and not at_accum_boundary:
                    # Disable gradient synchronizations across DDP
                    # processes. Within this context, gradients will be
                    # accumulated on module variables, which will later be
                    # synchronized.
                    context = model.no_sync
                else:
                    # Used for single gpu training and DDP gradient
                    # synchronization processes.
                    context = nullcontext

                with context():
                    info_dict = batch_forward(model, batch_dict, scaler,
                                              info_dict, self.device)
                    info_dict = batch_backward(model, scaler, info_dict)

                info_dict = update_parameter_and_lr(model, optimizer,
                                                    scheduler, scaler,
                                                    info_dict)

                # write training: tensorboard && log
                log_per_step(writer, info_dict, timer=self.train_step_timer)

                save_interval = info_dict.get('save_interval', sys.maxsize)
                if ((self.step + 1) % save_interval == 0
                        and self.step != 0 and at_accum_boundary):
                    # Ensure all ranks reach the checkpoint at the same time
                    # in step mode.
                    self._sync_ranks()
                    # NOTE: step-mode CV (`self.cv(...)`) is disabled, so a
                    # placeholder loss dict is stored with the checkpoint.
                    model.train()
                    info_dict.update({
                        "tag":
                        "step_{}".format(self.step),
                        "loss_dict": {'loss': 999, 'acc': 999},
                        "save_time":
                        datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
                        "lrs":
                        [group['lr'] for group in optimizer.param_groups]
                    })
                    save_model(model, info_dict)
                    # write final cv: tensorboard
                    log_per_step(writer, info_dict)
                    # Ensure all ranks start Train at the same time in step
                    # mode.
                    self._sync_ranks()

                if at_accum_boundary:
                    self.step += 1

    def cv(self, model, cv_data_loader, configs):
        """Run cross validation over ``cv_data_loader``.

        Args:
            model: Module to evaluate; switched to eval mode.
            cv_data_loader: Iterable of batch dicts. Each batch must provide
                a ``"target_lengths"`` tensor; batches whose first dimension
                is 0 are skipped.
            configs: Configuration, deep-copied into the per-batch info dict.

        Returns:
            A dict with one entry per finite value in the batch loss dicts
            whose name contains ``"loss"`` (utterance-weighted sum divided by
            the seen-utterance counter), plus ``"acc"``: the mean of the
            per-batch ``th_accuracy`` values (a batch without it counts as
            0.0). ``"acc"`` is 0.0 when no batch was evaluated.
        """
        if self.cv_step_timer is None:
            self.cv_step_timer = StepTimer(0.0)
        else:
            self.cv_step_timer.last_iteration = 0.0
        model.eval()
        info_dict = copy.deepcopy(configs)

        # The counter starts at 1 so the loss normalisation below can never
        # divide by zero; the denominator is therefore one larger than the
        # number of utterances actually seen.
        num_seen_utts = 1
        loss_sums = {}
        batch_accs = []
        with torch.no_grad():
            for batch_idx, batch_dict in enumerate(cv_data_loader):
                info_dict["tag"] = "CV"
                info_dict["step"] = self.step
                info_dict["batch_idx"] = batch_idx
                info_dict["cv_step"] = batch_idx

                num_utts = batch_dict["target_lengths"].size(0)
                if num_utts == 0:
                    continue

                info_dict = batch_forward(model, batch_dict, None, info_dict,
                                          self.device)
                batch_losses = info_dict["loss_dict"]

                num_seen_utts += num_utts
                batch_acc = batch_losses.get('th_accuracy', None)
                batch_accs.append(
                    batch_acc.item() if batch_acc is not None else 0.0)
                for loss_name, loss_value in batch_losses.items():
                    # Only finite entries whose name contains "loss" are
                    # accumulated.
                    if loss_value is not None and "loss" in loss_name \
                            and torch.isfinite(loss_value):
                        loss_sums[loss_name] = loss_sums.get(loss_name, 0) + \
                            loss_value.item() * num_utts
                # write cv: log
                log_per_step(writer=None,
                             info_dict=info_dict,
                             timer=self.cv_step_timer)

        loss_dict = {
            loss_name: total / num_seen_utts
            for loss_name, total in loss_sums.items()
        }
        # An empty loader (or one with only empty batches) leaves batch_accs
        # empty; report 0.0 instead of raising ZeroDivisionError.
        loss_dict["acc"] = (sum(batch_accs) /
                            len(batch_accs) if batch_accs else 0.0)
        return loss_dict