import torch


class TextRegressionDataset:
    """
    A custom dataset class for text regression tasks for AutoTrain.

    Args:
        data (list of dict): The dataset containing text and target values.
        tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data.
        config (object): Configuration object containing dataset parameters.

    Attributes:
        data (list of dict): The dataset containing text and target values.
        tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data.
        config (object): Configuration object containing dataset parameters.
        text_column (str): The column name for text data in the dataset.
        target_column (str): The column name for target values in the dataset.
        max_len (int): The maximum sequence length for tokenized inputs.

    Methods:
        __len__(): Returns the number of samples in the dataset.
        __getitem__(item): Returns a dictionary containing tokenized inputs and
            the target value for a given index.
    """

    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.config = config
        self.text_column = self.config.text_column
        self.target_column = self.config.target_column
        self.max_len = self.config.max_seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        """Tokenize one sample and return model-ready tensors.

        Returns a dict with ``input_ids``, ``attention_mask``, ``labels``
        (float target), and ``token_type_ids`` when the tokenizer produces
        them (e.g. BERT-style tokenizers do; RoBERTa-style ones do not).
        """
        text = str(self.data[item][self.text_column])
        target = float(self.data[item][self.target_column])

        # Pad/truncate to a fixed length so samples can be batched directly.
        inputs = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        # Build the sample once; previously the full dict was duplicated in
        # two branches, which risked the copies drifting apart.
        sample = {
            "input_ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(target, dtype=torch.float),
        }
        if "token_type_ids" in inputs:
            sample["token_type_ids"] = torch.tensor(inputs["token_type_ids"], dtype=torch.long)
        return sample