nruto committed · Commit 64e56ac · verified · 1 Parent(s): a3d81b8

Update app.py

Files changed (1)
  1. app.py +13 -10
app.py CHANGED
@@ -1,18 +1,21 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
-
- client = InferenceClient("nvidia/Llama-3.1-Nemotron-70B-Instruct-HF")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
+ model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ prompt = "How many r in strawberry?"
+ messages = [{"role": "user", "content": prompt}]
+
+ tokenized_message = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+ response_token_ids = model.generate(tokenized_message['input_ids'].cuda(), attention_mask=tokenized_message['attention_mask'].cuda(), max_new_tokens=4096, pad_token_id=tokenizer.eos_token_id)
+ generated_tokens = response_token_ids[:, len(tokenized_message['input_ids'][0]):]
+ generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+ print(generated_text)
+
+ # See response at top of model card
+
  messages = [{"role": "system", "content": system_message}]

  for val in history:
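
Note that as committed, the new top-level snippet sits above the remains of the old respond handler: the indented system-message list and history loop below the hunk are left with no enclosing function, so app.py no longer parses. Below is a minimal sketch of how this commit's transformers-based generation could instead be wrapped back into a Gradio respond handler. The function signature, history handling, ChatInterface wiring, and the use of model.device in place of the snippet's hard-coded .cuda() calls are illustrative assumptions, not part of the commit.

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def respond(message, history):
    # Rebuild the chat from Gradio's (user, assistant) tuple history.
    messages = []
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Same tokenization call as the commit; .to(model.device) is an
    # assumption that cooperates with device_map="auto" rather than
    # the snippet's explicit .cuda() calls.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    output_ids = model.generate(
        **inputs,
        max_new_tokens=4096,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Strip the prompt tokens and decode only the newly generated text.
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]


demo = gr.ChatInterface(respond)

if __name__ == "__main__":
    demo.launch()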