switch to using shared submodule for model code
- model/__init__.py +0 -0
- model/config.py +0 -137
- model/decoders.py +0 -23
- model/encoders.py +0 -26
- model/outputs.py +0 -74
- model/t5_vae.py +0 -522
- model/utils.py +0 -24
- model/vae.py +0 -30
- t5-vae-flax +0 -1
- t5_vae_flax +1 -1
- train.py +2 -2
- train.sh +3 -3
model/__init__.py
DELETED
File without changes

model/config.py
DELETED
@@ -1,137 +0,0 @@
import copy

from transformers.utils import logging
from transformers.configuration_utils import PretrainedConfig
from transformers import AutoConfig, T5Config

from model.encoders import VAE_ENCODER_MODELS
from model.decoders import VAE_DECODER_MODELS
from model.utils import assertEqual, assertIn

logger = logging.get_logger(__name__)


class T5VaeConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of :class:`FlaxT5VAE`.
    It is used to instantiate a T5-VAE model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a configuration similar to that of the T5 `t5-vae-base` architecture.

    To be able to use `transformer.trainer.Trainer` we need some specific training logic & config in the model.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Arguments:
        n_latent_tokens (:obj:`int`, `optional`, defaults to 6):
            Number of latent tokens (must be less than seq length).
        latent_token_size (:obj:`int`, `optional`, defaults to 32):
            Number of dimensions to use for each latent token.
        t5_model_name_or_path (:obj:`str`, `optional`, defaults to t5-base):
            Name of the Transformer model to use as a decoder.
        block_size (:obj:`int`, `optional`, defaults to 60):
            NOTE: Every input sequence must be padded to be equal to this length.
    """
    model_type = "transformer_vae"
    is_composition = True

    def __init__(
        self,
        t5_model_name_or_path=None,
        n_latent_tokens=6,  # set to -1 for full sequence
        latent_token_size=32,
        vae_encoder_model='',
        vae_decoder_model='',
        block_size=60,
        decoder_start_token_id=0,
        cache_dir=None,
        tie_word_embeddings=True,
        # T5 config
        t5=dict(),
        vocab_size=32128,
        d_model=512,
        d_kv=64,
        d_ff=2048,
        num_layers=6,
        num_decoder_layers=None,
        num_heads=8,
        relative_attention_num_buckets=32,
        dropout_rate=0.1,
        layer_norm_epsilon=1e-6,
        initializer_factor=1.0,
        feed_forward_proj="relu",
        is_encoder_decoder=True,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        gradient_checkpointing=False,
        # end
        **kwargs,
    ):
        assertIn(vae_encoder_model, VAE_ENCODER_MODELS.keys(), "Unexpected VAE encoder.")
        assertIn(vae_decoder_model, VAE_DECODER_MODELS.keys(), "Unexpected VAE decoder.")

        super().__init__(**kwargs)

        self.set_seq_size = block_size

        # VAE
        self.vae_encoder_model = vae_encoder_model
        self.vae_decoder_model = vae_decoder_model

        self.latent_token_size = latent_token_size
        assert n_latent_tokens <= self.set_seq_size, 'Cannot use more latent tokens than input tokens.'
        self.n_latent_tokens = n_latent_tokens
        self.use_cache = use_cache

        # T5
        if t5_model_name_or_path:
            self.t5 = AutoConfig.from_pretrained(t5_model_name_or_path, cache_dir=cache_dir)
            assertEqual(self.t5.model_type, "t5", "Need t5 model type for transformer_decoder.")
            self.t5.decoder_start_token_id = decoder_start_token_id
        elif t5:
            # use for loading a config
            self.t5 = T5Config(**t5)
        else:
            self.t5 = T5Config(
                vocab_size=vocab_size,
                d_model=d_model,
                d_kv=d_kv,
                d_ff=d_ff,
                num_layers=num_layers,
                num_decoder_layers=num_decoder_layers,
                num_heads=num_heads,
                relative_attention_num_buckets=relative_attention_num_buckets,
                dropout_rate=dropout_rate,
                layer_norm_epsilon=layer_norm_epsilon,
                initializer_factor=initializer_factor,
                feed_forward_proj=feed_forward_proj,
                is_encoder_decoder=is_encoder_decoder,
                use_cache=use_cache,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
                gradient_checkpointing=gradient_checkpointing,
                **kwargs
            )

        if self.t5.d_model < self.latent_token_size:
            raise Exception('Using a larger latent token dimension than the T5 hidden dimension.')

        # Add t5 config options
        self.tie_word_embeddings = tie_word_embeddings
        self.t5.tie_word_embeddings = self.tie_word_embeddings
        self.t5.use_cache = self.use_cache
        self.pad_token_id = pad_token_id
        self.eos_token_id = eos_token_id
        self.decoder_start_token_id = self.t5.decoder_start_token_id

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.

        Returns:
            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)
        output["model_type"] = self.__class__.model_type
        output['t5'] = self.t5.to_dict()
        return output

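For orientation, a minimal usage sketch of this configuration class (illustrative values only, not part of the commit; it assumes `transformers` and the modules above are importable, and fetching `t5-base` needs network access — omitting `t5_model_name_or_path` falls back to a default `T5Config` instead):

# Hypothetical usage sketch of the T5VaeConfig shown above.
config = T5VaeConfig(
    t5_model_name_or_path="t5-base",  # wraps the pretrained T5 config via AutoConfig
    n_latent_tokens=6,                # must be <= block_size (checked by the assert above)
    latent_token_size=32,             # must be <= the wrapped T5's d_model
    block_size=60,                    # every input sequence is padded to this length
)
assert config.t5.model_type == "t5"
print(config.to_dict()["t5"]["d_model"])  # the wrapped T5Config is serialized under the "t5" key
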
model/decoders.py
DELETED
@@ -1,23 +0,0 @@
import logging
import flax.linen as nn

logger = logging.getLogger(__name__)


class Decoder(nn.Module):
    '''
    Converts latent code -> transformer encoding.
    '''
    dim_model: int
    n_latent_tokens: int

    @nn.compact
    def __call__(self, latent_code):  # (batch, latent_tokens_per_sequence, latent_token_dim)
        raw_latent_tokens = nn.Dense(self.dim_model)(latent_code)
        latent_tokens = nn.LayerNorm()(raw_latent_tokens)
        return latent_tokens  # (batch, latent_tokens_per_sequence, dim_model)


VAE_DECODER_MODELS = {
    '': Decoder,
}

model/encoders.py
DELETED
@@ -1,26 +0,0 @@
import logging
import jax.numpy as jnp
import flax.linen as nn

logger = logging.getLogger(__name__)


class Encoder(nn.Module):
    '''
    Converts N hidden tokens into N separate latent codes.
    '''
    latent_token_size: int
    n_latent_tokens: int

    @nn.compact
    def __call__(self, encoding):
        latent_tokens = nn.Dense(self.latent_token_size)(encoding)
        raw_latent_code = latent_tokens[:, : self.n_latent_tokens, :]
        # TODO does this just apply tanh to each latent token? Or across the whole batch
        latent_code = jnp.tanh(raw_latent_code)
        return latent_code  # (batch, latent_tokens_per_sequence, latent_token_dim)


VAE_ENCODER_MODELS = {
    '': Encoder,
}

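To make the tensor shapes concrete, here is a small sketch of my own (assuming `jax`/`flax` are installed and the `Encoder` and `Decoder` classes above are in scope) that pushes a dummy batch through the latent bottleneck and back out:

import jax
import jax.numpy as jnp

# Illustrative shapes: batch of 2 sequences, 60 hidden tokens of width 512 (the default d_model above).
encoder = Encoder(latent_token_size=32, n_latent_tokens=6)
decoder = Decoder(dim_model=512, n_latent_tokens=6)

hidden = jnp.zeros((2, 60, 512))
enc_vars = encoder.init(jax.random.PRNGKey(0), hidden)
latent = encoder.apply(enc_vars, hidden)    # jnp.tanh acts elementwise -> shape (2, 6, 32)

dec_vars = decoder.init(jax.random.PRNGKey(1), latent)
remade = decoder.apply(dec_vars, latent)    # Dense + LayerNorm -> shape (2, 6, 512)
print(latent.shape, remade.shape)
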
model/outputs.py
DELETED
@@ -1,74 +0,0 @@
from typing import Optional, Tuple

import flax
import jaxlib.xla_extension as jax_xla

from transformers.file_utils import ModelOutput


@flax.struct.dataclass
class TransformerVaeOutput(ModelOutput):
    """
    Base class for a Transformer-VAE's outputs.

    Args:
        latent_codes (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, n_latent_tokens, latent_token_size)`):
            Latent codes representing encoded sequences.
        remade_encoder_hidden_state (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, n_tokens, model_dim)`):
            Reconstructed encoder hidden states representing sequences.

    (std Seq2Seq) Args:
        logits (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (:obj:`tuple(tuple(jax_xla.DeviceArray))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
            Tuple of :obj:`tuple(jax_xla.DeviceArray)` of length :obj:`config.n_layers`, with each tuple having 2
            tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
            tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding.
        last_hidden_state (:obj:`tuple(jax_xla.DeviceArray)`):
            Last model hidden state.
        decoder_hidden_states (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`jax_xla.DeviceArray` (one for the output of the embeddings + one for the output of each
            layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (:obj:`jax_xla.DeviceArray` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`jax_xla.DeviceArray` (one for the output of the embeddings + one for the output of each
            layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (:obj:`tuple(jax_xla.DeviceArray)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`jax_xla.DeviceArray` (one for each layer) of shape :obj:`(batch_size, num_heads,
            sequence_length, sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """
    logits: jax_xla.DeviceArray = None
    latent_codes: jax_xla.DeviceArray = None
    remade_encoder_hidden_state: jax_xla.DeviceArray = None
    # seq2seq
    past_key_values: Optional[Tuple[Tuple[jax_xla.DeviceArray]]] = None
    decoder_hidden_states: Optional[Tuple[jax_xla.DeviceArray]] = None
    decoder_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None
    cross_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None
    last_hidden_state: Optional[jax_xla.DeviceArray] = None
    encoder_last_hidden_state: Optional[jax_xla.DeviceArray] = None
    encoder_hidden_states: Optional[Tuple[jax_xla.DeviceArray]] = None
    encoder_attentions: Optional[Tuple[jax_xla.DeviceArray]] = None

model/t5_vae.py
DELETED
@@ -1,522 +0,0 @@
from typing import Optional, Tuple

import jax
import jax.numpy as jnp
from jax.random import PRNGKey
import flax.linen as nn
from flax.core.frozen_dict import FrozenDict, unfreeze

from transformers.modeling_flax_outputs import FlaxCausalLMOutputWithCrossAttentions
from transformers.file_utils import add_start_docstrings
from transformers.modeling_flax_utils import FlaxPreTrainedModel
from transformers.models.t5.modeling_flax_t5 import FlaxT5ForConditionalGenerationModule

from model.vae import VAE
from model.outputs import TransformerVaeOutput
from model.config import T5VaeConfig


@add_start_docstrings("""T5 Model with a `language modeling` head on top converted into a VAE.""")
class FlaxT5VaeForAutoencodingModule(nn.Module):
    config: T5VaeConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def _get_encoder_module(self):
        return self.t5.encoder

    def _get_vae_encoder_module(self):
        return self.vae.encoder

    def _get_vae_decoder_module(self):
        return self.vae.decoder

    def _get_decoder_module(self):
        return self.t5.decoder

    def setup(self):
        self.t5 = FlaxT5ForConditionalGenerationModule(self.config.t5)
        self.vae = VAE(self.config)

    def __call__(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs=None,
        latent_codes=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        deterministic: bool = True,
    ):
        """
        Adapted from `FlaxT5ForConditionalGenerationModule`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode
        encoder_outputs = self.t5.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )

        hidden_states = encoder_outputs[0]

        # Autoencode
        hidden_states, latent_codes = self.vae(hidden_states, latent_codes)
        encoder_attention_mask = jnp.ones((hidden_states.shape[0], hidden_states.shape[1]))

        # Decode
        decoder_outputs = self.t5.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )

        sequence_output = decoder_outputs[0]

        if self.t5.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.t5.config.d_model ** -0.5)

        if self.t5.config.tie_word_embeddings:
            shared_embedding = self.t5.shared.variables["params"]["embedding"]
            lm_logits = self.t5.lm_head.apply({"params": {"kernel": shared_embedding.T}}, sequence_output)
        else:
            lm_logits = self.t5.lm_head(sequence_output)

        if not return_dict:
            return [lm_logits, latent_codes] + decoder_outputs[1:] + encoder_outputs

        return TransformerVaeOutput(
            logits=lm_logits,
            latent_codes=latent_codes,
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


class FlaxT5VaePreTrainedModel(FlaxPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = T5VaeConfig
    base_model_prefix = "transformer"
    module_class: nn.Module = None

    def __init__(
        self,
        config: T5VaeConfig,
        input_shape: Tuple[int] = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        **kwargs
    ):
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict:
        # init input tensors
        input_ids = jnp.zeros(input_shape, dtype="i4")

        attention_mask = jnp.ones_like(input_ids)
        decoder_input_ids = jnp.ones_like(input_ids)
        decoder_attention_mask = jnp.ones_like(input_ids)

        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

        return self.module.init(
            rngs,
            input_ids,
            attention_mask,
            decoder_input_ids,
            decoder_attention_mask,
        )["params"]

    def __call__(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        decoder_input_ids: jnp.ndarray = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if decoder_input_ids is None:
            raise ValueError(
                "Make sure to provide both `input_ids` and `decoder_input_ids`. `decoder_input_ids` is not passed here."
            )

        # prepare encoder inputs
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # prepare decoder inputs
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones_like(decoder_input_ids)

        # Handle any PRNG if needed
        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}

        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
        )

    def init_cache(self, batch_size, max_length, latent_codes):
        r"""
        Args:
            batch_size (:obj:`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (:obj:`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
                cache.
            latent_codes (:obj:`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray))]`):
                ``latent_codes`` consists of compressed hidden-states at the output of the last layer of the encoder.
                Used in the cross-attention of the decoder.
        """
        # init input variables to retrieve cache
        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
        decoder_attention_mask = jnp.ones_like(decoder_input_ids)

        def _decoder_forward(module, decoder_input_ids, latent_codes, decoder_attention_mask, **kwargs):
            vae_decoder_module = module._get_vae_decoder_module()
            decoder_module = module._get_decoder_module()
            return decoder_module(
                decoder_input_ids,
                decoder_attention_mask,
                encoder_hidden_states=vae_decoder_module(latent_codes),
                **kwargs,
            )

        init_variables = self.module.init(
            jax.random.PRNGKey(0),
            decoder_input_ids=decoder_input_ids,
            latent_codes=latent_codes,
            decoder_attention_mask=decoder_attention_mask,
            init_cache=True,
            method=_decoder_forward,  # we only need to call the decoder to init the cache
        )
        return unfreeze(init_variables["cache"])

    def encode(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        raise NotImplementedError()

    def decode(
        self,
        decoder_input_ids,
        latent_codes,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        past_key_values: dict = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        raise NotImplementedError()


class FlaxT5VaeForAutoencoding(FlaxT5VaePreTrainedModel):
    module_class = FlaxT5VaeForAutoencodingModule

    def __call__(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        '''
        Adapted from `FlaxT5PreTrainedModel`
        '''
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if decoder_input_ids is None:
            raise ValueError(
                "Make sure to provide both `input_ids` and `decoder_input_ids`. `decoder_input_ids` is not passed here."
            )

        # prepare encoder inputs
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # prepare decoder inputs
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones_like(decoder_input_ids)

        # Handle any PRNG if needed
        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}

        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
        )

    def encode(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        def _encoder_forward(module, input_ids, attention_mask, **kwargs):
            encode_module = module._get_encoder_module()
            vae_encoder_module = module._get_vae_encoder_module()
            return vae_encoder_module(encode_module(input_ids, attention_mask, **kwargs)[0])

        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            method=_encoder_forward,
        )

    def decode(
        self,
        decoder_input_ids,
        latent_codes,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        past_key_values: dict = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        r"""
        Returns:

        Example::

            >>> model = FlaxT5VaeForAutoencoding.from_pretrained('t5-small')
            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')

            >>> text = "My friends are cool but they eat too many carbs."
            >>> inputs = tokenizer(text, max_length=512, return_tensors='jax')
            >>> latent_codes = model.encode(**inputs)

            >>> decoder_start_token_id = model.config.decoder_start_token_id
            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id

            >>> outputs = model.decode(decoder_input_ids, latent_codes)
            >>> last_decoder_hidden_states = outputs.last_hidden_state
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        if encoder_attention_mask is None:
            batch_size, sequence_length = latent_codes.shape[:2]
            encoder_attention_mask = jnp.ones((batch_size, sequence_length))

        batch_size, sequence_length = decoder_input_ids.shape
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones((batch_size, sequence_length))

        # Handle any PRNG if needed
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        inputs = {"params": params or self.params}

        # If past_key_values are passed then the cache is already initialized; a private flag init_cache has to be
        # passed down to ensure the cache is used. It has to be made sure that the cache is marked as mutable so that
        # it can be changed by the FlaxT5Attention module.
        if past_key_values:
            inputs["cache"] = past_key_values
            mutable = ["cache"]
        else:
            mutable = False

        def _decoder_forward(module, decoder_input_ids, latent_codes, decoder_attention_mask, **kwargs):
            vae_decoder_module = module._get_vae_decoder_module()
            decoder_module = module._get_decoder_module()
            decoder_outputs = decoder_module(
                decoder_input_ids,
                decoder_attention_mask,
                encoder_hidden_states=vae_decoder_module(latent_codes),
                **kwargs,
            )
            sequence_output = decoder_outputs[0]

            if self.config.tie_word_embeddings:
                # Rescale output before projecting on vocab
                # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
                sequence_output = sequence_output * (self.config.d_model ** -0.5)

            if self.config.tie_word_embeddings:
                shared_embedding = module.t5.shared.variables["params"]["embedding"]
                lm_logits = module.t5.lm_head.apply({"params": {"kernel": shared_embedding.T}}, sequence_output)
            else:
                lm_logits = module.t5.lm_head(sequence_output)

            return lm_logits, decoder_outputs

        outputs = self.module.apply(
            inputs,
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            latent_codes=latent_codes,
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            mutable=mutable,
            method=_decoder_forward,
        )

        if past_key_values is None:
            lm_logits, decoder_outputs = outputs
        else:
            (lm_logits, decoder_outputs), past = outputs

        if return_dict:
            outputs = FlaxCausalLMOutputWithCrossAttentions(
                logits=lm_logits,
                hidden_states=decoder_outputs.hidden_states,
                attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
            )
        else:
            outputs = (lm_logits,) + decoder_outputs[1:]

        # add updated cache to model output
        if past_key_values is not None and return_dict:
            outputs["past_key_values"] = unfreeze(past["cache"])
            return outputs
        elif past_key_values is not None and not return_dict:
            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]

        return outputs

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        max_length,
        attention_mask: Optional[jnp.DeviceArray] = None,
        decoder_attention_mask: Optional[jnp.DeviceArray] = None,
        latent_codes=None,
        **kwargs
    ):
        # initializing the cache
        batch_size, seq_length = decoder_input_ids.shape

        past_key_values = self.init_cache(batch_size, max_length, latent_codes)
        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
        # But since the decoder uses a causal mask, those positions are masked anyway.
        # Thus we can create a single static attention_mask here, which is more efficient for compilation.
        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
        if decoder_attention_mask is not None:
            extended_attention_mask = jax.lax.dynamic_update_slice(
                extended_attention_mask, decoder_attention_mask, (0, 0)
            )

        return {
            "past_key_values": past_key_values,
            "latent_codes": latent_codes,
            "encoder_attention_mask": attention_mask,
            "decoder_attention_mask": extended_attention_mask,
        }

    def update_inputs_for_generation(self, model_outputs, model_kwargs):
        model_kwargs["past_key_values"] = model_outputs.past_key_values
        return model_kwargs

model/utils.py
DELETED
@@ -1,24 +0,0 @@
from typing import Sequence

import flax.linen as nn


class MLP(nn.Module):
    features: Sequence[int]

    @nn.compact
    def __call__(self, x):
        for feat in self.features[:-1]:
            x = nn.relu(nn.Dense(feat)(x))
        x = nn.Dense(self.features[-1])(x)
        return x


def assertEqual(actual, expected, msg, first="Got", second="Expected"):
    if actual != expected:
        raise ValueError(msg + f' {first}: "{actual}" {second}: "{expected}"')


def assertIn(actual, expected, msg, first="Got", second="Expected one of"):
    if actual not in expected:
        raise ValueError(msg + f' {first}: "{actual}" {second}: {expected}')

model/vae.py
DELETED
@@ -1,30 +0,0 @@
import jax.numpy as jnp
import flax.linen as nn

from model.encoders import VAE_ENCODER_MODELS
from model.decoders import VAE_DECODER_MODELS
from model.config import T5VaeConfig


class VAE(nn.Module):
    # see https://github.com/google/flax#what-does-flax-look-like
    """
    An MMD-VAE used with encoder-decoder models.
    Encodes all token encodings into a single latent & spits them back out.
    """
    config: T5VaeConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.encoder = VAE_ENCODER_MODELS[self.config.vae_encoder_model](self.config.latent_token_size, self.config.n_latent_tokens)
        self.decoder = VAE_DECODER_MODELS[self.config.vae_decoder_model](self.config.t5.d_model, self.config.n_latent_tokens)

    def __call__(self, encoding=None, latent_codes=None):
        latent_codes = self.encode(encoding)
        return self.decode(latent_codes), latent_codes

    def encode(self, encoding):
        return self.encoder(encoding)

    def decode(self, latent):
        return self.decoder(latent)

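As a rough end-to-end illustration (a sketch of my own, assuming the pre-commit `model.*` modules above are importable and `jax`/`flax`/`transformers` are installed), the VAE wrapper compresses the encoder hidden states to a few latent tokens and immediately re-expands them:

import jax
import jax.numpy as jnp

# Hypothetical round trip through the VAE wrapper (illustrative only, not part of this commit).
# Without t5_model_name_or_path the config builds a default T5Config (d_model=512), so no download is needed.
config = T5VaeConfig(n_latent_tokens=6, latent_token_size=32, block_size=60)
vae = VAE(config)

hidden = jnp.zeros((2, config.set_seq_size, config.t5.d_model))  # (batch, block_size, d_model)
variables = vae.init(jax.random.PRNGKey(0), hidden)
remade, latent_codes = vae.apply(variables, hidden)
print(latent_codes.shape)  # (2, 6, 32)
print(remade.shape)        # (2, 6, 512) — only the re-expanded latent tokens are fed to the T5 decoder's cross-attention
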
t5-vae-flax
DELETED
@@ -1 +0,0 @@
Subproject commit 0a7735b81b50995c0d1901501c5e6928ce62c0ef

t5_vae_flax
CHANGED
@@ -1 +1 @@
-Subproject commit
+Subproject commit 0c030dca4751e6def730968a2f33fe093a608cdb

train.py
CHANGED
@@ -46,8 +46,8 @@ from transformers import (
 from transformers.models.t5.modeling_flax_t5 import shift_tokens_right
 from transformers.testing_utils import CaptureLogger
 
-from model.t5_vae import FlaxT5VaeForAutoencoding
-from model.config import T5VaeConfig
+from t5_vae_flax.src.t5_vae import FlaxT5VaeForAutoencoding
+from t5_vae_flax.src.config import T5VaeConfig
 
 
 logger = logging.getLogger(__name__)

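With the imports now coming from the shared `t5_vae_flax` submodule, a minimal usage sketch might look as follows. This is hypothetical: it assumes the submodule keeps the same class names and constructor signatures as the deleted `model/` code above, and the flag values simply mirror the updated `train.sh`.

from t5_vae_flax.src.config import T5VaeConfig
from t5_vae_flax.src.t5_vae import FlaxT5VaeForAutoencoding

# Hypothetical sketch: assumes the submodule mirrors the deleted model/ code shown above.
config = T5VaeConfig(
    t5_model_name_or_path="t5-base",
    n_latent_tokens=2,     # matches --n_latent_tokens 2 in train.sh
    latent_token_size=16,  # matches --latent_token_size 16 in train.sh
    block_size=32,         # matches --block_size="32" in train.sh
)
# Builds the model with freshly initialized weights; train.py handles the actual training run.
model = FlaxT5VaeForAutoencoding(config, input_shape=(1, config.set_seq_size))
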
train.sh
CHANGED
@@ -1,4 +1,4 @@
-export RUN_NAME=single_latent
+export RUN_NAME=two_latent
 
 ./venv/bin/python train.py \
     --t5_model_name_or_path="t5-base" \
@@ -6,8 +6,8 @@ export RUN_NAME=single_latent
     --overwrite_output_dir \
     --dataset_name="Fraser/python-lines" \
     --do_train --do_eval \
-    --n_latent_tokens
-    --latent_token_size
+    --n_latent_tokens 2 \
+    --latent_token_size 16 \
     --save_steps="2500" \
     --eval_steps="2500" \
     --block_size="32" \