from typing import TypeVar

# Model packages
import torch.cuda
from transformers import pipeline
import time

torch.cuda.empty_cache()

PandasDataFrame = TypeVar('pd.core.frame.DataFrame')

model_type = None # global variable setup
full_text = "" # Define dummy source text (full text) just to enable highlight function to load
model = [] # Define empty list for model functions to run
tokenizer = [] # Define empty list for model functions to run
# gpu_layers is currently set to 0 even when CUDA is available, due to persistent bugs in the CUDA implementation
if torch.cuda.is_available():
    torch_device = "cuda"
    gpu_layers = 0
else:
    torch_device = "cpu"
    gpu_layers = 0

print("Running on device:", torch_device)

threads = 8 # torch.get_num_threads()
print("CPU threads:", threads)
# flan-t5-large-stacked-xsum model parameters
temperature: float = 0.1
top_k: int = 3
top_p: float = 1
repetition_penalty: float = 1.05 # 1.3
last_n_tokens: int = 64
max_new_tokens: int = 4096 # 200
seed: int = 42
reset: bool = True
stream: bool = False
threads: int = threads
batch_size: int = 256
context_length: int = 4096
sample = True
class CtransInitConfig_gpu:
    def __init__(self,
                 last_n_tokens=last_n_tokens,
                 seed=seed,
                 n_threads=threads,
                 n_batch=batch_size,
                 n_ctx=24576,
                 n_gpu_layers=gpu_layers):
        self.last_n_tokens = last_n_tokens
        self.seed = seed
        self.n_threads = n_threads
        self.n_batch = n_batch
        self.n_ctx = n_ctx
        self.n_gpu_layers = n_gpu_layers
        # self.stop: list[str] = field(default_factory=lambda: [stop_string])

    def update_gpu(self, new_value):
        self.n_gpu_layers = new_value

class CtransInitConfig_cpu(CtransInitConfig_gpu):
    def __init__(self):
        super().__init__()
        self.n_gpu_layers = 0
gpu_config = CtransInitConfig_gpu()
cpu_config = CtransInitConfig_cpu()
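# --- Illustrative sketch only (not part of the original file) ---
# One way these config objects could be unpacked when loading a GGUF model with
# llama-cpp-python's Llama class. The helper name and model path are hypothetical;
# `last_n_tokens` is dropped because it is not a Llama() constructor argument.
def load_model_example(model_path, config=gpu_config):
    from llama_cpp import Llama  # imported lazily so the module still loads without llama_cpp installed

    init_kwargs = {k: v for k, v in vars(config).items() if k != "last_n_tokens"}
    return Llama(model_path=model_path, **init_kwargs)
# e.g. model = load_model_example("models/model.gguf")  # hypothetical path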
class CtransGenGenerationConfig:
    def __init__(self, temperature=temperature,
                 top_k=top_k,
                 top_p=top_p,
                 repeat_penalty=repetition_penalty,
                 seed=seed,
                 stream=stream,
                 max_tokens=max_new_tokens
                 ):
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.repeat_penalty = repeat_penalty
        self.seed = seed
        self.max_tokens = max_tokens
        self.stream = stream

    def update_temp(self, new_value):
        self.temperature = new_value
def llama_cpp_streaming(history, full_prompt, model_type,
                        temperature=temperature,
                        max_new_tokens=max_new_tokens,
                        sample=sample,
                        repetition_penalty=repetition_penalty,
                        top_p=top_p,
                        top_k=top_k
                        ):
    #print("Model type is: ", model_type)

    #if not full_prompt.strip():
    #    if history is None:
    #        history = []
    #    return history

    #tokens = model.tokenize(full_prompt)

    gen_config = CtransGenGenerationConfig()
    gen_config.update_temp(temperature)

    print(vars(gen_config))

    # Pull the generated text from the streamer, and update the model output.
    start = time.time()
    NUM_TOKENS = 0
    print('-'*4 + 'Start Generation' + '-'*4)

    output = model(full_prompt, **vars(gen_config))

    history[-1][1] = ""
    for out in output:
        if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
            history[-1][1] += out["choices"][0]["text"]
            NUM_TOKENS += 1
            yield history
        else:
            print(f"Unexpected output structure: {out}")

    time_generate = time.time() - start
    print('\n')
    print('-'*4 + 'End Generation' + '-'*4)
    print(f'Num of generated tokens: {NUM_TOKENS}')
    print(f'Time for complete generation: {time_generate}s')
    if NUM_TOKENS > 0 and time_generate > 0:
        print(f'Tokens per second: {NUM_TOKENS/time_generate}')
        print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
    else:
        print('No tokens were generated, so per-token timings are unavailable.')
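# --- Illustrative sketch only (not part of the original file) ---
# Example of how the streaming generator above might be consumed outside a UI.
# The [user, assistant] chat-history format matches history[-1][1] above; the
# helper name and prompt template are assumptions for illustration.
def example_stream_summary(user_text):
    history = [[user_text, ""]]  # hypothetical two-element [user, assistant] history entry
    full_prompt = f"Summarise the following text:\n{user_text}\nSummary:"  # hypothetical prompt template
    summary = ""
    for updated_history in llama_cpp_streaming(history, full_prompt, model_type):
        summary = updated_history[-1][1]  # grows with each streamed token
    return summary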