Spaces: Runtime error

leonardlin committed • Commit 0e02ca5 • 1 Parent(s): 4a8282c

working streaming interface

Browse files
- app.py +112 -14
- requirements.txt +4 -1
app.py
CHANGED
@@ -1,13 +1,19 @@
 # https://www.gradio.app/guides/using-hugging-face-integrations

 import gradio as gr
-
+import logging
+import html
+import time
+import torch
+from threading import Thread
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

-
-
-
-
+# Model
+model_name = "mistralai/Mistral-7B-Instruct-v0.1"
+model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"
+model_name = "/models/llm/hf/mistralai_Mistral-7B-Instruct-v0.1"

+# UI Settings
 title = "Shisa 7B"
 description = "Test out Shisa 7B in either English or Japanese."
 placeholder = "Type Here / ここに入力してください"
@@ -18,23 +24,114 @@ examples = [
     "こんにちは、いかがお過ごしですか？",
 ]

+# LLM Settings
+system_prompt = 'You are a helpful, friendly assistant.'
+chat_history = [{"role": "system", "content": system_prompt}]
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.chat_template = "{%- for idx in range(0, messages|length) -%}\n{%- if messages[idx]['role'] == 'user' -%}\n{%- if idx > 1 -%}\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\n{%- else -%}\n{{- messages[idx]['content'] + ' [/INST]' -}}\n{%- endif -%}\n{% elif messages[idx]['role'] == 'system' %}\n{{- '[INST] <<SYS>>\\n' + messages[idx]['content'] + '\\n<</SYS>>\\n\\n' -}}\n{%- elif messages[idx]['role'] == 'assistant' -%}\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\n{% endif %}\n{% endfor %}\n"
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    load_in_8bit=True,
+)
+streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+def chat(message, history):
+    chat_history.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(chat_history, add_generation_prompt=True, return_tensors="pt").to('cuda')
+    generate_kwargs = dict(
+        inputs=input_ids,
+        streamer=streamer,
+        max_new_tokens=200,
+        do_sample=True,
+        temperature=0.7,
+        top_p=0.95,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    # https://www.gradio.app/main/guides/creating-a-chatbot-fast#example-using-a-local-open-source-llm-with-hugging-face
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    partial_message = ""
+    for new_token in streamer:
+        partial_message += new_token  # html.escape(new_token)
+        yield partial_message
+
+    '''
+    # https://www.gradio.app/main/guides/creating-a-chatbot-fast#streaming-chatbots
+    for i in range(len(message)):
+        time.sleep(0.3)
+        yield message[: i+1]
+    '''
+
+
+chat_interface = gr.ChatInterface(
+    chat,
+    chatbot=gr.Chatbot(height=400),
+    textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
+    title=title,
+    description=description,
+    theme="soft",
+    examples=examples,
+    cache_examples=False,
+    undo_btn="Delete Previous",
+    clear_btn="Clear",
+)
+
+# https://huggingface.co/spaces/ysharma/Explore_llamav2_with_TGI/blob/main/app.py#L219 - we use this with construction b/c Gradio barfs on autoreload otherwise
+with gr.Blocks() as demo:
+    chat_interface.render()
+    gr.Markdown("You can try these greetings in English, Japanese, familiar Japanese, or formal Japanese. We limit output to 200 tokens.")
+
+
+demo.queue().launch()
+
+'''
+# Works for Text input...
+demo = gr.Interface.from_pipeline(pipe)
+'''
+
+'''
+def chat(message, history):
+    print("foo")
+    for i in range(len(message)):
+        time.sleep(0.3)
+        yield "You typed: " + message[: i+1]
+    # print('history:', history)
+    # print('message:', message)
+    # for new_next in streamer:
+    # yield new_text
+
+
+'''
+
+
+'''
 # Docs: https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/conversational.py
 conversation = Conversation()
-
+conversation.add_message({"role": "system", "content": system})
+device = torch.device('cuda')
+pipe = pipeline(
+    'conversational',
+    model=model,
+    tokenizer=tokenizer,
+    streamer=streamer,
+
+)

-def chat(input, history
+def chat(input, history):
     conversation.add_message({"role": "user", "content": input})
     # we do this shuffle so local shadow response doesn't get created
-    response_conversation =
-    print(response_conversation)
-    print(response_conversation.messages)
-    print(response_conversation.messages[-1]["content"])
+    response_conversation = pipe(conversation)
+    print("foo:", response_conversation.messages[-1]["content"])

     conversation.add_message(response_conversation.messages[-1])
+    print("boo:", response_conversation.messages[-1]["content"])
     response = conversation.messages[-1]["content"]
-
+    response = "ping"
+    return response

-gr.ChatInterface(
+demo = gr.ChatInterface(
     chat,
     chatbot=gr.Chatbot(height=400),
     textbox=gr.Textbox(placeholder=placeholder, container=False, scale=7),
@@ -48,4 +145,5 @@ gr.ChatInterface(
 ).launch()

 # For async
-# ).queue().launch(
+# ).queue().launch()
+'''
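The heart of this change is the streaming chat handler: model.generate runs on a background thread, a TextIteratorStreamer hands decoded text back to the main thread, and the chat generator yields the growing reply so gr.ChatInterface can re-render it token by token. Below is a minimal, self-contained sketch of that pattern under stated assumptions: the checkpoint name is illustrative (the Space points at a local Mistral path, sets its own chat template, and loads in 8-bit), history is ignored for brevity, and unlike the Space it builds a fresh streamer per request instead of sharing a module-level one.

# Sketch of the streaming pattern used in app.py (illustrative checkpoint and settings).
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v0.3"  # assumption: any chat model with a chat template
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")

def chat(message, history):
    # The Space keeps a global chat_history list; history is unused here for brevity.
    messages = [{"role": "user", "content": message}]
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    # A per-request streamer avoids interleaving output between concurrent chats.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate, kwargs=dict(inputs=input_ids, streamer=streamer, max_new_tokens=200)).start()
    partial = ""
    for new_text in streamer:   # blocks until generate() pushes more decoded text
        partial += new_text
        yield partial           # each yield repaints the pending chatbot message

gr.ChatInterface(chat).queue().launch()

Because chat is a generator, ChatInterface streams automatically; queue() enables the event streaming that carries the partial yields to the browser while generation is still running, which matches the commit's demo.queue().launch().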
requirements.txt
CHANGED
@@ -1,3 +1,6 @@
+accelerate
+bitsandbytes
 gradio
+scipy
 torch
-transformers
+transformers
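The requirements changes track the new model-loading path: load_in_8bit=True needs bitsandbytes plus accelerate (accelerate is also required for device_map="auto"), and scipy is presumably pulled in as a bitsandbytes dependency. A hedged sketch of the same 8-bit load written with an explicit quantization config (the model id is illustrative; app.py passes load_in_8bit=True directly):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Explicit spelling of the 8-bit load enabled by the new requirements
# (assumption: BitsAndBytesConfig is available in the installed transformers version).
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",   # illustrative; the Space uses a local path
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.bfloat16,
    device_map="auto",                      # requires accelerate
)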