Mikhil-jivus committed
Commit: 00a2ac7
1 Parent(s): 432a144

Update app.py

Files changed (1)
  app.py  +21 -10
app.py CHANGED

@@ -8,6 +8,8 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer


+# Set the environment variable
+os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

 DESCRIPTION = """\
 # Llama 3.2 3B Instruct
@@ -18,10 +20,9 @@ For more details, please check [our post](https://huggingface.co/blog/llama32).

 # Access token for the model (if required)
 access_token = os.getenv('HF_TOKEN')
-
 # Download the Base model
 #model_id = "./models/Llama-32-3B-Instruct"
-model_id = "Mikhil-jivus/Llama-32-8B-FineTuned-Instruct-v1"
+model_id = "Mikhil-jivus/Llama-32-3B-FineTuned-Instruct-v4"
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
@@ -29,15 +30,14 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

 #model_id = "nltpt/Llama-3.2-3B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 tokenizer.padding_side = 'right'
-tokenizer.pad_token_id = 128004
-tokenizer.pad_token = "<|finetune_right_pad_id|>"
+tokenizer.pad_token = tokenizer.eos_token
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    device_map="auto",
+    device_map=device,
     torch_dtype=torch.bfloat16,
-    token=access_token
+    local_files_only=True
 )
 model.eval()

@@ -63,15 +63,26 @@ def generate(
     )
     conversation.append({"role": "user", "content": message})

-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, add_special_tokens=True, return_tensors="pt")
+    # Set pad_token_id if it's not already set
+    if tokenizer.pad_token_id is None:
+        tokenizer.padding_side = 'right'
+        tokenizer.pad_token = tokenizer.eos_token
+
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, add_special_tokens=True, return_tensors="pt", padding=True, return_attention_mask=True)
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    # Ensure attention mask is set
+    #attention_mask = input_ids['attention_mask']
+
     input_ids = input_ids.to(model.device)
+    #attention_mask = attention_mask.to(model.device)
+
+

     streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        {"input_ids": input_ids},
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
@@ -79,7 +90,7 @@ def generate(
         top_k=top_k,
         temperature=temperature,
         num_beams=1,
-        repetition_penalty=repetition_penalty,
+        repetition_penalty=repetition_penalty
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
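
The commit leaves the attention-mask handling commented out and the diff ends before the streamer output is consumed. Below is a minimal sketch of that remaining path, assuming the model, tokenizer, and MAX_INPUT_TOKEN_LENGTH defined in app.py and a transformers release whose apply_chat_template accepts return_dict=True; the helper name stream_reply and its defaults are illustrative, not part of the commit.

# Sketch only, not part of the commit. Assumes model, tokenizer and
# MAX_INPUT_TOKEN_LENGTH from app.py above; stream_reply is a hypothetical helper.
from threading import Thread

from transformers import TextIteratorStreamer


def stream_reply(conversation, max_new_tokens=1024):
    # return_dict=True yields both input_ids and attention_mask, which is
    # what the commented-out attention_mask lines in the commit reach for.
    inputs = tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    # Trim both tensors the same way the app trims input_ids.
    input_ids = inputs["input_ids"][:, -MAX_INPUT_TOKEN_LENGTH:].to(model.device)
    attention_mask = inputs["attention_mask"][:, -MAX_INPUT_TOKEN_LENGTH:].to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
    )
    # model.generate runs in a background thread; the streamer yields decoded
    # text chunks as they arrive, so a Gradio callback can re-yield the
    # growing response string.
    Thread(target=model.generate, kwargs=generate_kwargs).start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)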