Cache + Measure time
app.py
CHANGED
@@ -1,20 +1,23 @@
 import torch
 import transformers
-import
+import time
 from huggingface_hub import hf_hub_download
+import streamlit as st
 
 
-
-
-hf_hub_download("OpenDungeon/gpt-j-8bit-ffbgem", "model.pt")
+@st.cache
+def load_model():
+    hf_hub_download("OpenDungeon/gpt-j-8bit-ffbgem", "model.pt")
+    qmodel = torch.load("model.pt")
+    return transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B"), qmodel
 
-qmodel = torch.load("model.pt")
 
 def PrintContinuation(prompt, local_model, single_hook=None, batch=1, limit_tokens = 50):
     past_key_values = None # used to keep track of conversation history
     input_dict = tokenizer([prompt] * batch, return_tensors='pt', padding=False)
     output = [""] * batch
-
+    batch_time = 0
+
     with torch.inference_mode():
         for i in range(limit_tokens + 20):
             if i == 5:
@@ -33,16 +36,25 @@ def PrintContinuation(prompt, local_model, single_hook=None, batch=1, limit_toke
             if single_hook is not None:
                 single_hook(tokenizer.decode(token_ix[0]))
             if i == limit_tokens:
-
-                print((time.perf_counter() - start_time) / (i - 4), "s per token")
+                batch_time = (time.perf_counter() - start_time) / (i - 4)
                 break
 
             input_dict = dict(input_ids=token_ix)
-
-
-
-
+    return output, batch_time
+
+
+tokenizer, model = load_model()
+text = st.text_area("Prefix")
+batch = st.number_input("Variants", value=1)
+
+t = st.empty()
+firstline = ""
+
+def PrintSome(text):
+    global t, firstline
+    firstline += text
+    t.markdown(f"## {firstline}...")
 
+choices, batch_time = PrintContinuation(text, model, PrintSome, batch, 50)
 
-
-PrintContinuation(text, qmodel, lambda x: t.markdown(f"## {x}..."))
+t.markdown(" \n\n".join(choices) + f" \n\nSeconds per batch: {batch_time}, Batch: {batch}")
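
For context on the two changes in this commit: load_model() is wrapped in @st.cache so the GPT-J checkpoint is downloaded and deserialized once per Streamlit session rather than on every rerun, and PrintContinuation now returns the measured generation speed instead of printing it. The timer starts at decode step i == 5, treating the first five steps as warm-up, so dividing the elapsed time by i - 4 at i == limit_tokens gives the average seconds per step. A minimal standalone sketch of that timing pattern (the step callback is a hypothetical stand-in for one forward-and-sample iteration, not part of the app):

import time

def average_step_time(step, limit_tokens=50, warmup=5):
    """Average seconds per step, ignoring the first `warmup` warm-up steps."""
    start_time = None
    for i in range(limit_tokens + 1):
        if i == warmup:
            start_time = time.perf_counter()  # clock starts before step `warmup` runs
        step(i)  # one decoding iteration in the real app
    # steps warmup..limit_tokens ran on the clock: limit_tokens - warmup + 1 of them
    return (time.perf_counter() - start_time) / (limit_tokens - warmup + 1)

One caveat on the caching side: with the legacy st.cache decorator, returning a mutable torch model usually calls for @st.cache(allow_output_mutation=True), since by default Streamlit re-hashes the cached return value on every rerun to detect mutation, which is slow and can fail outright on large models.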