update cpu support, readme and convert_tokens_to_string

- README.md            +5  -0
- modeling_qwen.py     +63 -24
- tokenization_qwen.py +18 -12
README.md
CHANGED
@@ -73,11 +73,16 @@ You can easily call the model with the following code:
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers.generation import GenerationConfig

+# Note: our tokenizer rejects special tokens in the input to prevent injection attacks, so passing a special token such as <|endoftext|> will throw an error.
+# To disable this behavior, add `allowed_special`, which accepts the string "all" or a `set` of special tokens.
+# For example: tokens = tokenizer(text, allowed_special="all")
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
 # use bf16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, bf16=True).eval()
 # use fp16
 # model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, fp16=True).eval()
+# use cpu only
+# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="cpu", trust_remote_code=True).eval()
 # use fp32
 model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True).eval()
 model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)  # you can specify different generation hyperparameters here, such as generation length and top_p
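A minimal usage sketch of the two additions above, assuming the model weights are available from the Hub; `allowed_special` is the argument documented in the new comments, and `device_map="cpu"` is the new CPU-only path:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the tokenizer and a CPU-only fp32 model as described in the updated README.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B", device_map="cpu", trust_remote_code=True
).eval()

# Encoding text that contains a special token normally raises an error;
# allowed_special="all" (or a set of tokens) relaxes that check.
inputs = tokenizer("Hello<|endoftext|>", allowed_special="all")
print(inputs["input_ids"])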
modeling_qwen.py
CHANGED
@@ -15,6 +15,7 @@ from torch.cuda.amp import autocast
 from torch.nn import CrossEntropyLoss
 from transformers import PreTrainedTokenizer, GenerationConfig, StoppingCriteriaList
 from transformers.generation.logits_process import LogitsProcessorList
+
 if TYPE_CHECKING:
     from transformers.generation.streamers import BaseStreamer
     from transformers.generation.utils import GenerateOutput
@@ -38,15 +39,19 @@ try:
     use_flash_rotary = True
 except ImportError:
     use_flash_rotary = False
+    print(
+        "Warning: import flash_attn rotary fail, please install FlashAttention rotary to get better performance "
+        "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary"
+    )

 try:
     from flash_attn.ops.rms_norm import rms_norm
 except ImportError:
     rms_norm = None
+    print(
+        "Warning: import flash_attn rms_norm fail, please install FlashAttention layer_norm to get better performance "
+        "https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm"
+    )

 from .configuration_qwen import QWenConfig
 from .qwen_generation_utils import (
@@ -69,8 +74,10 @@ try:
     from flash_attn.flash_attn_interface import flash_attn_unpadded_func
 except ImportError:
     flash_attn_unpadded_func = None
+    print(
+        "Warning: import flash_attn fail, please install FlashAttention "
+        "https://github.com/Dao-AILab/flash-attention"
+    )


 class FlashSelfAttention(torch.nn.Module):
@@ -177,8 +184,12 @@ class QWenAttention(nn.Module):
             config.hidden_size, self.projection_size, bias=not config.no_bias
         )

+        self.is_fp32 = not (config.bf16 or config.fp16)
+        if (
+            self.use_flash_attn
+            and flash_attn_unpadded_func is not None
+            and not self.is_fp32
+        ):
             self.core_attention_flash = FlashSelfAttention(
                 causal=True, attention_dropout=config.attn_pdrop
             )
@@ -197,14 +208,15 @@ class QWenAttention(nn.Module):
             if self.rotary_ndims is not None
             else self.hidden_size_per_attention_head
         )
+        self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)

         self.use_dynamic_ntk = config.use_dynamic_ntk
         self.use_logn_attn = config.use_logn_attn

+        logn_list = [
+            math.log(i, self.seq_length) if i > self.seq_length else 1
+            for i in range(1, 32768)
+        ]
         self.logn_tensor = torch.Tensor(logn_list)[None, :, None, None]
         self._ntk_cached = 1.0

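For context, the `logn_list` added above implements log-n attention scaling: queries at positions beyond the training context length are scaled by the logarithm of their position to base `seq_length`. A standalone sketch of the same arithmetic, with `seq_length = 2048` assumed for illustration:

import math
import torch

seq_length = 2048  # assumed training context length, for illustration only
logn_list = [
    math.log(i, seq_length) if i > seq_length else 1
    for i in range(1, 32768)
]
logn_tensor = torch.Tensor(logn_list)[None, :, None, None]

# Positions inside the training window keep scale 1.0; later positions grow slowly.
print(logn_tensor[0, 2047, 0, 0].item())  # position 2048 -> 1.0
print(logn_tensor[0, 4095, 0, 0].item())  # position 4096 -> log_2048(4096) ~= 1.09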
@@ -335,14 +347,20 @@ class QWenAttention(nn.Module):
         if layer_past:
             # layer past[0] shape: bs * seq_len * head_num * dim
             kv_seq_len += layer_past[0].shape[1]
+        if (
+            self.use_dynamic_ntk
+            and kv_seq_len == hidden_states.size()[1]
+            and not self.training
+        ):
             context_value = math.log(kv_seq_len / self.seq_length, 2) + 1
             ntk_alpha = 2 ** math.ceil(context_value) - 1
             ntk_alpha = max(ntk_alpha, 1)
             self._ntk_cached = ntk_alpha
         else:
             ntk_alpha = self._ntk_cached
+        rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha).to(
+            hidden_states.device
+        )

         if rotary_pos_emb is not None:
             if isinstance(rotary_pos_emb, tuple):
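Similarly, the dynamic-NTK branch above derives `ntk_alpha` from how far the key/value length exceeds the training length. A standalone sketch of that formula, again assuming `seq_length = 2048`:

import math

def dynamic_ntk_alpha(kv_seq_len: int, seq_length: int = 2048) -> float:
    # Same formula as in the hunk above: alpha jumps to 2**ceil(...) - 1 once the
    # context exceeds the training length, and is clamped to at least 1.
    context_value = math.log(kv_seq_len / seq_length, 2) + 1
    ntk_alpha = 2 ** math.ceil(context_value) - 1
    return max(ntk_alpha, 1)

print(dynamic_ntk_alpha(2048))  # 1
print(dynamic_ntk_alpha(4096))  # 3
print(dynamic_ntk_alpha(8192))  # 7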
@@ -377,7 +395,12 @@ class QWenAttention(nn.Module):
             logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
             query = query * logn_tensor.expand_as(query)

+        if (
+            self.use_flash_attn
+            and flash_attn_unpadded_func is not None
+            and not self.is_fp32
+            and query.is_cuda
+        ):
             q, k, v = query, key, value
             context_layer = self.core_attention_flash(q, k, v)

@@ -398,7 +421,11 @@ class QWenAttention(nn.Module):
         attn_output = self.c_proj(context_layer)
         outputs = (attn_output, present)
         if output_attentions:
+            if (
+                self.use_flash_attn
+                and flash_attn_unpadded_func is not None
+                and not self.is_fp32
+            ):
                 raise ValueError("Cannot output attentions while using flash-attn")
             else:
                 outputs += (attn_weight,)
@@ -750,7 +777,9 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         super().__init__(config)
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        assert not (
+            config.bf16 and config.fp16
+        ), "In config, bf16 and fp16 cannot both be true"
         if config.bf16:
             self.transformer.bfloat16()
             self.lm_head.bfloat16()
@@ -929,21 +958,25 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         generation_config: Optional[GenerationConfig] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[
+            Callable[[int, torch.Tensor], List[int]]
+        ] = None,
         synced_gpus: Optional[bool] = None,
         streamer: Optional["BaseStreamer"] = None,
         **kwargs,
     ) -> Union[GenerateOutput, torch.LongTensor]:
         # Process stop_words_ids.
+        stop_words_ids = kwargs.pop("stop_words_ids", None)
         if stop_words_ids is None and generation_config is not None:
+            stop_words_ids = getattr(generation_config, "stop_words_ids", None)
         if stop_words_ids is None:
+            stop_words_ids = getattr(self.generation_config, "stop_words_ids", None)

         if stop_words_ids is not None:
             stop_words_logits_processor = StopWordsLogitsProcessor(
+                stop_words_ids=stop_words_ids,
+                eos_token_id=self.generation_config.eos_token_id,
+            )
             if logits_processor is None:
                 logits_processor = LogitsProcessorList([stop_words_logits_processor])
             else:
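A hedged usage sketch of the stop-word handling above: `stop_words_ids` can now be passed directly to `generate` (or set on a generation config) and is wrapped in a `StopWordsLogitsProcessor`. The example stops at the end-of-document token exposed by the tokenizer:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B", device_map="cpu", trust_remote_code=True
).eval()

inputs = tokenizer("Hello", return_tensors="pt")
# stop_words_ids is a list of token-id sequences; here we stop at <|endoftext|>.
outputs = model.generate(
    **inputs,
    stop_words_ids=[[tokenizer.eod_id]],
    max_new_tokens=32,
)
print(tokenizer.decode(outputs[0]))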
@@ -978,7 +1011,13 @@ class RotaryEmbedding(torch.nn.Module):
         seqlen = max_seq_len + offset
         if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached:
             base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
+            self.inv_freq = 1.0 / (
+                base
+                ** (
+                    torch.arange(0, self.dim, 2, device=self.inv_freq.device).float()
+                    / self.dim
+                )
+            )
             self._seq_len_cached = seqlen
             self._ntk_alpha_cached = ntk_alpha
             seq = torch.arange(seqlen, device=self.inv_freq.device)
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
|
1068 |
|
1069 |
def forward(self, x):
|
1070 |
+
if rms_norm is not None and x.is_cuda:
|
1071 |
return rms_norm(x, self.weight, self.eps)
|
1072 |
else:
|
1073 |
output = self._norm(x.float()).type_as(x)
|
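The new `x.is_cuda` guard is what makes CPU inference work when the fused flash-attn kernel happens to be installed: on CPU the module falls back to the plain PyTorch path. A minimal sketch of that fallback branch, with a hidden size of 8 assumed:

import torch

class SimpleRMSNorm(torch.nn.Module):
    """Pure-PyTorch RMSNorm mirroring the fallback branch above."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        # Works on any device; the fused CUDA kernel is only taken when x.is_cuda.
        return self._norm(x.float()).type_as(x) * self.weight

x = torch.randn(2, 4, 8)  # CPU activations, hidden size 8 assumed
print(SimpleRMSNorm(8)(x).shape)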
tokenization_qwen.py
CHANGED
@@ -22,7 +22,6 @@ logger = logging.getLogger(__name__)

 VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}

-
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""

@@ -126,6 +125,7 @@ class QWenTokenizer(PreTrainedTokenizer):
         self.mergeable_ranks = mergeable_ranks
         self.encoder = self.mergeable_ranks
         self.decoder = {v: k for k, v in self.encoder.items()}
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
         self.tokenizer = enc  # type: tiktoken.Encoding
         self.eod_id = self.tokenizer.eot_token
         self.im_start_id = special_tokens[IMSTART]
@@ -182,29 +182,32 @@ class QWenTokenizer(PreTrainedTokenizer):
             text (`str`):
                 The sequence to be encoded.
             kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific encode method.
+                Tiktoken lets users enable the tokenization of special tokens with the following args:
+                `allowed_special`: set to 'all' or a `set` of special tokens.
+                `disallowed_special`: set to 'all' or a `Collection` of special tokens. NOT RECOMMENDED, AS IT MAY CONFLICT WITH `allowed_special`.

         Returns:
             `List[str]`: The list of tokens.
         """
         tokens = []
         text = unicodedata.normalize("NFC", text)
+
+        for t in self.tokenizer.encode(text, **kwargs):
             tokens.append(self.decoder[t])
+
         return tokens

+    def convert_tokens_to_string(self, tokens: List[bytes]) -> str:
         """
         Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
         often want to remove sub-word tokenization artifacts at the same time.
         """
+        text = b""
+        for token in tokens:
+            text += token
+        return text.decode('utf-8')
+
     @property
     def vocab_size(self):
         return self.tokenizer.n_vocab
@@ -216,7 +219,10 @@ class QWenTokenizer(PreTrainedTokenizer):

     def _convert_token_to_id(self, token: str) -> int:
         """Converts a token to an id using the vocab."""
+        return self.encoder.get(
+            token.encode("UTF-8"),
+            self.tokenizer.encode(self.unk_token, allowed_special="all")[0],
+        )

     @property
     def all_special_tokens(self) -> List[str]:
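A short round-trip sketch of the new byte-level behavior, assuming the tokenizer files are available: `tokenize` now yields `bytes` tokens looked up in `decoder`, and `convert_tokens_to_string` concatenates them before a single UTF-8 decode, so multi-byte characters survive the round trip:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)

text = "你好, world"
tokens = tokenizer.tokenize(text)  # a list of bytes objects after this change
restored = tokenizer.convert_tokens_to_string(tokens)
assert restored == text  # bytes are joined first, then decoded once as UTF-8
print(tokens, restored)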