Mikhil-jivus committed
Commit 6ac164c
1 Parent(s): 9b150ae

Update app.py

Files changed (1)
  1. app.py +81 -55
app.py CHANGED
@@ -1,72 +1,98 @@
import gradio as gr
- import os
- from huggingface_hub import InferenceClient
import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- repo_id = "Mikhil-jivus/Llama-32-3B-FineTuned"
access_token = os.getenv('HF_TOKEN')
- # Load the tokenizer and model from the Hugging Face repository
- tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True, token=access_token)
-
- client = InferenceClient(model=repo_id, tokenizer=tokenizer, token=access_token)
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
        yield response
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
- if __name__ == "__main__":
-     demo.launch()
 
import gradio as gr
+ import os
import torch
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     TextIteratorStreamer,
+     pipeline,
+ )
+ from threading import Thread

access_token = os.getenv('HF_TOKEN')
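+ # HF_TOKEN must be set in the environment (e.g. Space secrets) and grant read
+ # access to the fine-tuned model repo below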
+
+ # The Hugging Face model id for the fine-tuned model
+ checkpoint = "Mikhil-jivus/Llama-32-3B-FineTuned"
+
+ # Download and load the model and tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, token=access_token)
+ model = AutoModelForCausalLM.from_pretrained(
+     checkpoint, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True, token=access_token
+ )
+
+ # Text generation pipeline
+ generator = pipeline(
+     "text-generation",
+     tokenizer=tokenizer,
+     model=model,
+     pad_token_id=tokenizer.eos_token_id,
+     eos_token_id=tokenizer.eos_token_id,
+     device_map="auto",
+ )
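+
+ # pad_token_id is set to eos_token_id above because Llama tokenizers ship without
+ # a dedicated padding token; this silences the padding warning during generation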

+ # Function that accepts a message plus chat history and streams generated text
+ def generate(message, chat_history, max_new_tokens):
+     instruction = "You are a helpful assistant to 'User'. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
+     final_prompt = f"Instruction: {instruction}\n"
+
+     # Replay the chat history as alternating User/Assistant turns
+     for sent, received in chat_history:
+         final_prompt += "User: " + sent + "\n"
+         final_prompt += "Assistant: " + received + "\n"
+
+     final_prompt += "User: " + message + "\n"
+     final_prompt += "Output:"
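+
+     # TextIteratorStreamer receives tokens from the generation call and yields
+     # decoded text; skip_prompt=True keeps the input prompt out of the stream.
+     # Generation runs in a background thread so the stream can be consumed here.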
+     streamer = TextIteratorStreamer(
+         tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0
+     )
+     thread = Thread(
+         target=generator,
+         kwargs={
+             "text_inputs": final_prompt,
+             "max_new_tokens": max_new_tokens,
+             "streamer": streamer,
+         },
+     )
+     thread.start()
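+
+     # Iterating the streamer yields decoded chunks as they arrive and ends when
+     # the background generation finishes; timeout=300.0 bounds the wait per chunk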
+     generated_text = ""
+     for word in streamer:
+         generated_text += word
+         response = generated_text.strip()
+
+         # Trim anything after the model starts a new 'User:' turn
+         if "User:" in response:
+             response = response.split("User:")[0].strip()
+
+         # Drop a leading 'Assistant:' tag if the model emits one
+         if "Assistant:" in response:
+             response = response.split("Assistant:")[1].strip()

        yield response

+ # Chat interface with Gradio
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # Jivus AI Chatbot Demo
+         This chatbot was created with a fine-tuned 3-billion-parameter Llama transformer model.
+         """
+     )
+
+     tokens_slider = gr.Slider(
+         8,
+         128,
+         value=21,
+         label="Maximum new tokens",
+         info="A larger `max_new_tokens` value gives longer responses at the cost of slower generation.",
+     )
+
+     chatbot = gr.ChatInterface(
+         fn=generate,
+         additional_inputs=[tokens_slider],
+         stop_btn=None,
+         examples=[["Who is Leonhard Euler?"]],
+     )
+
+ # Streaming (generator) handlers need Gradio's request queue, hence queue() before launch()
+ demo.queue().launch()