"""
BitNet2 model with H-BitLinear layers for Hugging Face compatibility.

This maintains the original BitNetModel2 architecture with H-BitLinear layers.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast


class BitNet2Config(PretrainedConfig):
    model_type = "bitnet2"

    def __init__(self, **kwargs):
        # No defaults are defined here: every hyperparameter the model reads
        # (vocab_size, hidden_size, num_hidden_layers, num_attention_heads,
        # intermediate_size, max_position_embeddings, layer_norm_eps,
        # attention_probs_dropout_prob) must be supplied via kwargs.
        super().__init__(**kwargs)


class HBitLinear(nn.Module):
    """H-BitLinear layer implementation."""

    def __init__(self, in_features, out_features, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.02)
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_features))
        else:
            self.register_parameter('bias', None)

    def forward(self, x):
        # Plain full-precision linear projection; no quantization is applied
        # here (see the illustrative sketch below the class).
        return F.linear(x, self.weight, self.bias)


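# The forward pass of HBitLinear above uses full-precision weights. For
# reference, this sketch shows a common BitNet-style ternary (1.58-bit)
# "absmean" weight quantization. It is an illustrative assumption about what
# an H-BitLinear layer typically does, not the original implementation, and
# it is not wired into HBitLinear.forward.
def ternary_quantize_weight(weight: torch.Tensor, eps: float = 1e-5):
    """Quantize a weight tensor to {-1, 0, +1} with a per-tensor scale."""
    # Per-tensor scale: mean absolute value of the weights (absmean).
    scale = weight.abs().mean().clamp(min=eps)
    # Scale, round to the nearest integer, and clip to the ternary set.
    quantized = (weight / scale).round().clamp(-1, 1)
    # Return the dequantized weights together with the scale used.
    return quantized * scale, scale

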
class BitNet2Layer(nn.Module):
    """Single BitNet2 layer with H-BitLinear."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.self_attn_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.self_attn = nn.MultiheadAttention(
            config.hidden_size,
            config.num_attention_heads,
            dropout=config.attention_probs_dropout_prob,
            batch_first=True
        )

        self.feed_forward = nn.Sequential(
            HBitLinear(config.hidden_size, config.intermediate_size, bias=False),
            nn.GELU(),
            HBitLinear(config.intermediate_size, config.hidden_size, bias=False)
        )

    def forward(self, hidden_states, attention_mask=None):
        if attention_mask is not None:
            # nn.MultiheadAttention expects an (L, S) boolean attn_mask in which
            # True marks positions that may NOT be attended to. Any incoming
            # Hugging Face-style padding mask is replaced with a causal mask so
            # the layer stays autoregressive; padding positions are not masked.
            seq_len = hidden_states.size(1)
            attention_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=hidden_states.device),
                diagonal=1,
            )

        # Pre-norm self-attention with a residual connection (the same normed
        # tensor serves as query, key, and value).
        normed = self.self_attn_norm(hidden_states)
        attn_output, _ = self.self_attn(
            normed,
            normed,
            normed,
            attn_mask=attention_mask
        )
        hidden_states = hidden_states + attn_output

        # Pre-norm feed-forward block with a residual connection.
        ff_output = self.feed_forward(self.feed_forward_norm(hidden_states))
        hidden_states = hidden_states + ff_output

        return hidden_states


class BitNet2Model(PreTrainedModel):
    config_class = BitNet2Config

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.embed_positions = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.layers = nn.ModuleList([
            BitNet2Layer(config)
            for _ in range(config.num_hidden_layers)
        ])

        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        if input_ids is None:
            raise ValueError("input_ids must be provided")

        batch_size, seq_length = input_ids.shape

        # Token embeddings plus learned absolute position embeddings.
        inputs_embeds = self.embed_tokens(input_ids)
        position_ids = torch.arange(seq_length, device=input_ids.device).unsqueeze(0)
        position_embeds = self.embed_positions(position_ids)
        hidden_states = inputs_embeds + position_embeds

        for layer in self.layers:
            hidden_states = layer(hidden_states, attention_mask)

        hidden_states = self.layer_norm(hidden_states)
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens at position i predict the token at i + 1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1)
            )

        # No KV cache is kept, so past_key_values is always None.
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=None,
            attentions=None,
        )

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        # No KV cache: the full sequence is re-encoded at every generation step.
        return {"input_ids": input_ids}


class BitNet2ForCausalLM(BitNet2Model):
    def __init__(self, config):
        super().__init__(config)
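

# Minimal smoke test: build a small model and run one forward pass. The
# hyperparameter values below are illustrative assumptions; BitNet2Config
# defines no defaults, so every attribute the model reads must be supplied.
if __name__ == "__main__":
    config = BitNet2Config(
        vocab_size=32000,
        hidden_size=256,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=1024,
        max_position_embeddings=512,
        layer_norm_eps=1e-5,
        attention_probs_dropout_prob=0.0,
    )
    model = BitNet2ForCausalLM(config)
    input_ids = torch.randint(0, config.vocab_size, (1, 16))
    outputs = model(input_ids=input_ids, labels=input_ids)
    # Expect logits of shape (1, 16, vocab_size) and a scalar loss.
    print(outputs.logits.shape, outputs.loss.item())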