nruto committed
Commit 4e21816 · verified · 1 Parent(s): 64e56ac

Update app.py

Files changed (1)
  1. app.py +10 -14
app.py CHANGED
@@ -1,21 +1,17 @@
  import gradio as gr
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
- model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
- tokenizer = AutoTokenizer.from_pretrained(model_name)
+ from huggingface_hub import InferenceClient
 
- prompt = "How many r in strawberry?"
- messages = [{"role": "user", "content": prompt}]
+ client = InferenceClient("Qwen/Qwen2.5-Coder-32B-Instruct")
 
- tokenized_message = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", return_dict=True)
- response_token_ids = model.generate(tokenized_message['input_ids'].cuda(),attention_mask=tokenized_message['attention_mask'].cuda(), max_new_tokens=4096, pad_token_id = tokenizer.eos_token_id)
- generated_tokens =response_token_ids[:, len(tokenized_message['input_ids'][0]):]
- generated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
- print(generated_text)
-
- # See response at top of model card
 
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+ ):
      messages = [{"role": "system", "content": system_message}]
 
      for val in history:
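
The hunk ends inside the restored respond() handler, and the commit replaces a local transformers load of the 70B model with a serverless huggingface_hub InferenceClient call. For orientation only, here is a minimal sketch of how the stock Gradio ChatInterface template that this signature follows typically continues; the rest of app.py is not shown in this commit, so everything past the for loop header (the history handling, the chat_completion streaming call, and the ChatInterface wiring, including the slider defaults) is an assumption about the usual template, not the file's verified contents.

# Sketch only: assumes the stock huggingface_hub + Gradio ChatInterface template.
# The real app.py may differ beyond the lines shown in the diff above.
import gradio as gr
from huggingface_hub import InferenceClient

client = InferenceClient("Qwen/Qwen2.5-Coder-32B-Instruct")

def respond(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        # each history entry is a (user, assistant) pair
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""
    # stream tokens from the hosted model and yield the growing reply
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        response += chunk.choices[0].delta.content or ""
        yield response

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()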