Guanzheng committed on
Commit 08636e2
1 Parent(s): b0d0cf7

Update modeling_llama.py

Files changed (1)
  1. modeling_llama.py +18 -10
modeling_llama.py CHANGED
@@ -166,14 +166,17 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-def apply_rotary_pos_emb(q, k, cos, sin, q_len, position_ids):
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, key_position_ids):
     # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
     sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
-    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-    q_embed = (q * cos[:, :, -q_len:, :]) + (rotate_half(q) * sin[:, :, -q_len:, :])
-    k_embed = (k * cos) + (rotate_half(k) * sin)
+    cos_q = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin_q = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+
+    cos_k = cos[key_position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    sin_k = sin[key_position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
+    q_embed = (q * cos_q) + (rotate_half(q) * sin_q)
+    k_embed = (k * cos_k) + (rotate_half(k) * sin_k)
     return q_embed, k_embed
 
 
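The hunk above replaces the single `position_ids` argument (and the `-q_len:` slicing of `cos`/`sin`) with separate position ids for queries and keys, so the queries and the full cached key sequence are each rotated at their own absolute positions. Below is a minimal, self-contained sketch of how the new signature could be exercised for a single decode step; the toy shapes and the cos/sin table construction are illustrative assumptions, not part of this commit.

import torch


def rotate_half(x):
    # Rotate half the hidden dims of the input (same helper as in the file above).
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids, key_position_ids):
    # Copy of the updated function: queries and keys are gathered with their
    # own position ids, so a 1-token query can be rotated against a longer cache.
    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
    cos_q = cos[position_ids].unsqueeze(1)      # [bs, 1, q_len, dim]
    sin_q = sin[position_ids].unsqueeze(1)      # [bs, 1, q_len, dim]
    cos_k = cos[key_position_ids].unsqueeze(1)  # [bs, 1, kv_len, dim]
    sin_k = sin[key_position_ids].unsqueeze(1)  # [bs, 1, kv_len, dim]
    q_embed = (q * cos_q) + (rotate_half(q) * sin_q)
    k_embed = (k * cos_k) + (rotate_half(k) * sin_k)
    return q_embed, k_embed


# Toy decode step: one new query token attending over a cache of 8 keys.
bs, heads, head_dim, kv_len, q_len = 1, 2, 8, 8, 1
q = torch.randn(bs, heads, q_len, head_dim)
k = torch.randn(bs, heads, kv_len, head_dim)

# Standard RoPE cos/sin table shaped [1, 1, kv_len, head_dim], built here only
# for the demo (the module gets it from its rotary embedding).
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(kv_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)
cos, sin = emb.cos()[None, None], emb.sin()[None, None]

position_ids = torch.tensor([[kv_len - 1]])              # query sits at the last position
key_position_ids = torch.arange(kv_len).view(1, kv_len)  # 0 .. kv_len-1 for the whole cache

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids, key_position_ids)
print(q_rot.shape, k_rot.shape)  # torch.Size([1, 2, 1, 8]) torch.Size([1, 2, 8, 8])

Because `position_ids` already selects exactly the query's positions, the old `cos[:, :, -q_len:, :]` slicing is no longer needed.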
 
@@ -282,28 +285,33 @@ class LlamaAttention(nn.Module):
 
         if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            cache_key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        else:
+            cache_key_states = key_states
 
         if pack_cos_sin is not None:
             cos, sin = pack_cos_sin.to(query_states.device)
         else:
             cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         key_position_ids = torch.arange(kv_seq_len, dtype=torch.long, device=position_ids.device).unsqueeze(0).view(-1, kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, q_len, key_position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, cache_key_states, cos, sin, position_ids, key_position_ids)
 
         if past_key_value is not None:
             # reuse k, v, self_attention
+            # key_states = torch.cat([past_key_value[0], key_states], dim=2)
             value_states = torch.cat([past_key_value[1], value_states], dim=2)
 
-        past_key_value = (key_states, value_states) if use_cache else None
+        past_key_value = (cache_key_states, value_states) if use_cache else None
 
-        use_flashatn = self.config.use_flashattn and is_flash_attn_available()
+        use_flashattn = self.config.use_flashattn and is_flash_attn_available()
 
         if self.log_scale:
             log_n = torch.log(torch.tensor(kv_seq_len*1.0)).to(query_states.device, dtype=query_states.dtype) / \
                     torch.log(torch.tensor(self.config.max_position_embeddings)).to(query_states.device, dtype=query_states.dtype)
             query_states = query_states * log_n
-        if query_states.shape[-2] == 1 or query_states.shape[-2] != key_states.shape[-2] or use_flashatn:
+
+
+        if query_states.shape[-2] == 1 or query_states.shape[-2] != key_states.shape[-2] and not use_flashattn:
             attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
             if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
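In this second hunk the layer caches the pre-rotation keys (`cache_key_states`) and rebuilds `key_position_ids` as 0 .. kv_seq_len-1 on every call, so rotary embeddings are re-applied over the whole key window at each step rather than being stored already rotated. The toy decode loop below sketches that cache flow; the random tensors and the flattened control flow are stand-ins for the module's actual forward pass, included only to illustrate what the new `past_key_value` tuple holds.

import torch

bs, heads, head_dim = 1, 2, 8
past_key_value = None

for step in range(3):
    # New (not yet rotated) key/value for the current token.
    key_states = torch.randn(bs, heads, 1, head_dim)
    value_states = torch.randn(bs, heads, 1, head_dim)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
        # Concatenate the *unrotated* cached keys with the new key.
        cache_key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)
    else:
        cache_key_states = key_states

    # Key positions always cover the full window; the new query sits at the end.
    key_position_ids = torch.arange(kv_seq_len, dtype=torch.long).unsqueeze(0).view(-1, kv_seq_len)

    # apply_rotary_pos_emb(query_states, cache_key_states, cos, sin,
    #                      position_ids, key_position_ids) would be called here.

    # The cache now stores the unrotated keys, matching the updated tuple.
    past_key_value = (cache_key_states, value_states)
    print(step, cache_key_states.shape, tuple(key_position_ids.shape))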