[HIL-SERL] Remove overstrict pre-commit modifications (#1028)

Author: Adil Zouitine
Date: 2025-04-24 13:48:52 +02:00
Committed by: GitHub
Parent: 671ac3411f
Commit: c58b504a9e
47 changed files with 163 additions and 757 deletions


@@ -293,18 +293,12 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
                     # in `transformers`. (molbap)
                     key_states = torch.cat([past_key_values[layer_idx]["key_states"], key_states], dim=1)
                     value_states = torch.cat(
-                        [past_key_values[layer_idx]["value_states"], value_states],
-                        dim=1,
+                        [past_key_values[layer_idx]["value_states"], value_states], dim=1
                     )
 
             attention_interface = self.get_attention_interface()
             att_output = attention_interface(
-                attention_mask,
-                batch_size,
-                head_dim,
-                query_states,
-                key_states,
-                value_states,
+                attention_mask, batch_size, head_dim, query_states, key_states, value_states
             )
             att_output = att_output.to(dtype=torch.bfloat16)
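
For readers skimming the hunk above: the collapsed `torch.cat` calls append the current step's key/value states to the cached ones along the sequence axis (`dim=1`; the later hunk's `key_states.shape[1]` indicates a batch, sequence, heads, head_dim layout). A minimal standalone sketch of that pattern, with made-up shapes rather than the model's real configuration:

```python
import torch

# Made-up shapes for illustration only: (batch, sequence, kv_heads, head_dim).
cached_values = torch.zeros(2, 10, 8, 64)  # value states already in the KV cache
new_values = torch.zeros(2, 1, 8, 64)      # value states for the current step

# Same pattern as the diff: grow the cache along the sequence axis (dim=1).
value_states = torch.cat([cached_values, new_values], dim=1)
print(value_states.shape)  # torch.Size([2, 11, 8, 64])
```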
@@ -364,24 +358,12 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
         return attention_interface
 
     def flash_attention_forward(
-        self,
-        attention_mask,
-        batch_size,
-        head_dim,
-        query_states,
-        key_states,
-        value_states,
+        self, attention_mask, batch_size, head_dim, query_states, key_states, value_states
     ):
         raise NotImplementedError("FA2 is not implemented (yet)")
 
     def eager_attention_forward(
-        self,
-        attention_mask,
-        batch_size,
-        head_dim,
-        query_states,
-        key_states,
-        value_states,
+        self, attention_mask, batch_size, head_dim, query_states, key_states, value_states
     ):
         num_att_heads = self.config.paligemma_config.text_config.num_attention_heads
         num_key_value_heads = self.config.paligemma_config.text_config.num_key_value_heads
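
The `attention_interface` used in the first hunk is whatever `get_attention_interface()` returns; this hunk only shows its trailing `return attention_interface` plus the two candidate implementations. The diff does not show how that choice is made, so the following is a hypothetical, self-contained sketch of that kind of dispatch; the class name and the flag are illustrative only, not taken from the repository:

```python
class AttentionDispatchSketch:
    """Illustrative stand-in for the flash/eager split visible in the hunk above."""

    def __init__(self, use_flash_attention: bool = False):
        self.use_flash_attention = use_flash_attention  # assumed flag, not from the diff

    def get_attention_interface(self):
        # Return the bound method for the selected attention implementation.
        if self.use_flash_attention:
            return self.flash_attention_forward
        return self.eager_attention_forward

    def flash_attention_forward(self, *args):
        raise NotImplementedError("FA2 is not implemented (yet)")

    def eager_attention_forward(self, *args):
        return "eager attention would run here"


print(AttentionDispatchSketch().get_attention_interface()())  # eager path by default
```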
@@ -393,31 +375,17 @@ class PaliGemmaWithExpertModel(PreTrainedModel):
         sequence_length = key_states.shape[1]
 
         key_states = key_states[:, :, :, None, :].expand(
-            batch_size,
-            sequence_length,
-            num_key_value_heads,
-            num_key_value_groups,
-            head_dim,
+            batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
         )
         key_states = key_states.reshape(
-            batch_size,
-            sequence_length,
-            num_key_value_heads * num_key_value_groups,
-            head_dim,
+            batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
         )
 
         value_states = value_states[:, :, :, None, :].expand(
-            batch_size,
-            sequence_length,
-            num_key_value_heads,
-            num_key_value_groups,
-            head_dim,
+            batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
         )
         value_states = value_states.reshape(
-            batch_size,
-            sequence_length,
-            num_key_value_heads * num_key_value_groups,
-            head_dim,
+            batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
         )
 
         # Attention here is upcasted to float32 to match the original eager implementation.
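
The expand/reshape pairs above are the usual grouped-query-attention expansion: each key/value head is repeated `num_key_value_groups` times so it lines up with the query heads that share it, before the eager attention math that the final comment refers to. A standalone sketch with small assumed sizes (the real head counts come from the PaliGemma text config, as shown in the previous hunk):

```python
import torch

# Assumed toy sizes; the real values come from the PaliGemma text config.
batch_size, sequence_length, head_dim = 2, 5, 64
num_key_value_heads, num_key_value_groups = 4, 2  # 8 query heads sharing 4 KV heads

key_states = torch.randn(batch_size, sequence_length, num_key_value_heads, head_dim)

# Insert a group axis, broadcast each KV head across it, then fold it back into
# the head axis, matching the expand/reshape pattern in the diff.
key_states = key_states[:, :, :, None, :].expand(
    batch_size, sequence_length, num_key_value_heads, num_key_value_groups, head_dim
)
key_states = key_states.reshape(
    batch_size, sequence_length, num_key_value_heads * num_key_value_groups, head_dim
)
print(key_states.shape)  # torch.Size([2, 5, 8, 64])
```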