Mikhil-jivus committed on
Commit
e3f498d
·
verified ·
1 Parent(s): b17ecc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -61
app.py CHANGED
@@ -1,32 +1,14 @@
1
- import os
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
5
 
6
- access_token = os.getenv('HF_TOKEN')
7
-
8
- # Define the repository ID and access token
9
- repo_id = "Mikhil-jivus/Llama-32-3B-FineTuned"
10
-
11
- # Load the tokenizer and model from the Hugging Face repository
12
- tokenizer = AutoTokenizer.from_pretrained(repo_id, token=access_token)
13
 
14
- model = AutoModelForCausalLM.from_pretrained(
15
- repo_id,
16
- token=access_token,
17
- torch_dtype=torch.bfloat16, # or use torch.bfloat16 if supported
18
- device_map="auto" # Automatically use available GPU/CPU efficiently
19
- )
20
 
21
- # Define a function to clean up any repeated segments in the generated response
22
- def clean_response(response, history):
23
- # Check for repetition in the response and remove it
24
- if len(history) > 0:
25
- last_user_message, last_bot_response = history[-1]
26
- if last_bot_response in response:
27
- response = response.replace(last_bot_response, "").strip()
28
-
29
- return response
30
 
31
  def respond(
32
  message,
@@ -36,52 +18,37 @@ def respond(
36
  temperature,
37
  top_p,
38
  ):
39
- # Add system prompt only once at the beginning of the conversation
40
- if len(history) == 0:
41
- input_text = f"system: {system_message}\nuser: {message}\n"
42
- else:
43
- input_text = f"user: {message}\n"
44
 
45
- # Append previous conversation history to the input text
46
- for user_msg, bot_msg in history:
47
- input_text += f"user: {user_msg}\nassistant: {bot_msg}\n"
 
 
48
 
49
- # Tokenize the input messages
50
- input_ids = tokenizer.encode(input_text, return_tensors="pt")
51
-
52
- # Move input_ids to the GPU
53
- input_ids = input_ids.to("cuda")
54
-
55
- # Create attention mask and move to GPU
56
- attention_mask = input_ids.ne(tokenizer.pad_token_id).long().to("cuda")
57
-
58
- # Generate a response
59
- chat_history_ids = model.generate(
60
- input_ids,
61
- max_new_tokens=max_tokens,
62
  temperature=temperature,
63
  top_p=top_p,
64
- pad_token_id=tokenizer.eos_token_id,
65
- do_sample=True,
66
- attention_mask=attention_mask,
67
- )
68
-
69
- # Decode the response
70
- response = tokenizer.decode(chat_history_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
71
 
72
- # Clean the response to remove any repeated or unnecessary text
73
- response = clean_response(response, history)
74
-
75
- # Update history with the new user message and bot response
76
- history.append((message, response))
77
 
78
- return response
79
-
80
- # Set up the Gradio app interface
81
  demo = gr.ChatInterface(
82
  respond,
83
  additional_inputs=[
84
- gr.Textbox(value="You are a helpful and friendly assistant.", label="System message"),
85
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
86
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
87
  gr.Slider(
@@ -94,5 +61,6 @@ demo = gr.ChatInterface(
94
  ],
95
  )
96
 
 
97
  if __name__ == "__main__":
98
- demo.launch(share=True)
 
 
1
  import gradio as gr
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
 
5
+ """
6
+ For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
7
+ """
 
 
 
 
8
 
9
+ access_token = os.getenv('HF_TOKEN')
10
+ client = InferenceClient("Mikhil-jivus/Llama-32-3B-FineTuned",api_key = access_token)
 
 
 
 
11
 
 
 
 
 
 
 
 
 
 
12
 
13
  def respond(
14
  message,
 
18
  temperature,
19
  top_p,
20
  ):
21
+ messages = [{"role": "system", "content": system_message}]
 
 
 
 
22
 
23
+ for val in history:
24
+ if val[0]:
25
+ messages.append({"role": "user", "content": val[0]})
26
+ if val[1]:
27
+ messages.append({"role": "assistant", "content": val[1]})
28
 
29
+ messages.append({"role": "user", "content": message})
30
+
31
+ response = ""
32
+
33
+ for message in client.chat_completion(
34
+ messages,
35
+ max_tokens=max_tokens,
36
+ stream=True,
 
 
 
 
 
37
  temperature=temperature,
38
  top_p=top_p,
39
+ ):
40
+ token = message.choices[0].delta.content
 
 
 
 
 
41
 
42
+ response += token
43
+ yield response
 
 
 
44
 
45
+ """
46
+ For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
47
+ """
48
  demo = gr.ChatInterface(
49
  respond,
50
  additional_inputs=[
51
+ gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
52
  gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
53
  gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
54
  gr.Slider(
 
61
  ],
62
  )
63
 
64
+
65
  if __name__ == "__main__":
66
+ demo.launch()