jupyterjazz committed
Commit 90873c4
Parent(s): d8cbc92
Update rotary.py
rotary.py CHANGED
@@ -493,9 +493,15 @@ class RotaryEmbedding(torch.nn.Module):
 
     @base.setter
     def base(self, new_base):
-        if new_base > 0:
-            self._base = new_base
-            self.inv_freq = self._compute_inv_freq(device=self.inv_freq.device)
+        new_base = float(new_base)
+        if new_base > 0 and new_base != self._base:
+            self._base = new_base
+            self._update_cos_sin_cache(
+                self._seq_len_cached,
+                device=self.inv_freq.device,
+                dtype=self._cos_cached.dtype if self._cos_cached is not None else None,
+                rotary_base_changed=True,
+            )
         else:
             raise ValueError("Rotary base value must be positive")
 
@@ -508,21 +514,27 @@ class RotaryEmbedding(torch.nn.Module):
             )
         )
 
-    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
+    def _update_cos_sin_cache(
+        self, seqlen, device=None, dtype=None, rotary_base_changed=False
+    ):
         # Reset the tables if the sequence length has changed,
         # if we're on a new device (possibly due to tracing for instance),
         # or if we're switching from inference mode to training
+        # or if the rotary base value was changed
         if (
             seqlen > self._seq_len_cached
             or self._cos_cached is None
             or self._cos_cached.device != device
             or self._cos_cached.dtype != dtype
             or (self.training and self._cos_cached.is_inference())
+            or rotary_base_changed
         ):
             self._seq_len_cached = seqlen
             # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
             # And the output of arange can be quite large, so bf16 would lose a lot of precision.
             # However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
+            if rotary_base_changed:
+                self.inv_freq = self._compute_inv_freq(device=self.inv_freq.device)
             if self.pos_idx_in_fp32:
                 t = torch.arange(seqlen, device=device, dtype=torch.float32)
                 # We want fp32 here as well since inv_freq will be multiplied with t, and the output
@@ -536,6 +548,7 @@ class RotaryEmbedding(torch.nn.Module):
             else:
                 t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
                 inv_freq = self.inv_freq
+
             # Don't do einsum, it converts fp32 to fp16 under AMP
             # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
             freqs = torch.outer(t, inv_freq)
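For context, a minimal sketch of how the updated setter is meant to be exercised. It assumes the flash-attn-style constructor RotaryEmbedding(dim, base=..., pos_idx_in_fp32=..., device=...) and a top-level rotary module; the import path, dimensions, sequence length, and base values below are illustrative, not taken from this commit.

import torch

from rotary import RotaryEmbedding  # import path assumed; adjust to wherever this file lives

# Hypothetical instance; dim and base are illustrative values.
rotary_emb = RotaryEmbedding(dim=64, base=10000.0, pos_idx_in_fp32=True, device="cpu")

# Warm the cos/sin cache once (normally this happens inside forward()).
rotary_emb._update_cos_sin_cache(512, device=torch.device("cpu"), dtype=torch.float32)

# Assigning a new positive base now recomputes inv_freq and rebuilds the cached
# cos/sin tables right away via rotary_base_changed=True, instead of leaving
# stale tables around until the cache is invalidated for some other reason.
rotary_emb.base = 20000.0

# Non-positive values are still rejected.
try:
    rotary_emb.base = -1.0
except ValueError as err:
    print(err)  # Rotary base value must be positive

The new rotary_base_changed flag forces the cache rebuild even when the sequence length, device, and dtype are unchanged, which is what lets a base change take effect immediately.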