Spaces:
Sleeping
Sleeping
Dockerfile now loads models to local folder. Can use custom output folder. Requirements for GPU-enabled summarisation now in separate file to hopefully avoid HF space issues.
3809dc8
import pandas as pd | |
import concurrent.futures | |
import gradio as gr | |
from chatfuncs.chatfuncs import model, CtransGenGenerationConfig, temperature | |
from datetime import datetime | |
from typing import Type | |
from chatfuncs.helper_functions import output_folder | |
# Capture the import-time timestamp once so both date stamps below always
# describe the same day (two separate now() calls could straddle midnight).
_import_time = datetime.now()
today = _import_time.strftime("%d%m%Y")      # day-month-year digits
today_rev = _import_time.strftime("%Y%m%d")  # year-month-day digits

# Alias used when annotating pandas DataFrame arguments.
# NOTE(review): typing.Type[X] denotes the class X itself rather than an
# instance of it; left unchanged because other modules may import this alias.
PandasDataFrame = Type[pd.DataFrame]
# Model choice that is routed through the prompt-based generation path; every
# other choice goes through the direct summarisation call.
_PHI3_MODEL_TYPE = "Phi 3 128k (24k tokens max)"

# Seconds to wait for a single prompt-based generation before giving up.
# The timeout message below spells this out as "five minutes".
_GENERATION_TIMEOUT_SECONDS = 300


def _call_generation_model(formatted_string, gen_config):
    """
    Call the generation model with parameters taken from a config object.

    Args:
        formatted_string (str): The formatted input text for the model.
        gen_config (CtransGenGenerationConfig): An object containing generation parameters.

    Returns:
        Whatever ``model`` returns for this prompt and these parameters.
    """
    return model(
        formatted_string,
        temperature=gen_config.temperature,
        top_k=gen_config.top_k,
        top_p=gen_config.top_p,
        repeat_penalty=gen_config.repeat_penalty,
        seed=gen_config.seed,
        max_tokens=gen_config.max_tokens,
        stream=gen_config.stream,
    )


def summarise_text(text:str, text_df:PandasDataFrame, length_slider:int, in_colname:str, model_type:str, progress=gr.Progress()):
    """
    Summarise a text or series of texts using Transformers or Llama.cpp.

    Args:
        text: Single text to summarise; used only when ``text_df`` is None or empty.
        text_df: Table of texts. When it has rows, every value of column
            ``in_colname`` is summarised.
        length_slider: Requested summary length. Passed as ``max_length`` on
            the direct summarisation path and formatted into the prompt on
            the Phi 3 path.
        in_colname: Name of the column in ``text_df`` that holds the texts.
        model_type: Name of the selected model; decides which path is used.
        progress: Gradio progress tracker used to report per-text progress.

    Returns:
        On success, a tuple of the first summary (as a string) and a list
        holding the paths of the CSV and parquet files that were written.
        If a Phi 3 generation times out, a tuple of the timeout message and None.
    """
    # Fall back to the single free-text input when no table was supplied.
    if text_df is None or text_df.empty:
        in_colname = "text"
        in_text_df = pd.DataFrame({in_colname: [text]})
    else:
        in_text_df = text_df

    print(model_type)

    texts_list = list(in_text_df[in_colname])
    summarised_texts = []

    if model_type != _PHI3_MODEL_TYPE:
        for single_text in progress.tqdm(texts_list, desc = "Summarising texts", unit = "texts"):
            summarised_text = model(single_text, max_length=length_slider)
            summarised_text_str = summarised_text[0]['summary_text']
            summarised_texts.append(summarised_text_str)
            print(summarised_text_str)
    else:
        from chatfuncs.prompts import instruction_prompt_phi3

        gen_config = CtransGenGenerationConfig()
        gen_config.update_temp(temperature)
        print(gen_config)

        length = str(length_slider)

        # One worker thread is reused for every text. The executor is managed
        # by hand instead of a `with` block because leaving a `with` block
        # waits for the running call to finish, which would defeat the timeout.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        try:
            for single_text in progress.tqdm(texts_list, desc = "Summarising texts", unit = "texts"):
                formatted_string = instruction_prompt_phi3.format(length=length, text=single_text)
                future = executor.submit(_call_generation_model, formatted_string, gen_config)

                try:
                    output = future.result(timeout=_GENERATION_TIMEOUT_SECONDS)
                except concurrent.futures.TimeoutError:
                    error_text = f"Timeout (five minutes) occurred for text: {single_text}. Consider using a smaller model."
                    print(error_text)
                    return error_text, None

                print(output)
                summarised_texts.append(output['choices'][0]['text'])
        finally:
            # Do not block on a generation that may still be running; the
            # worker thread is left to finish in the background.
            executor.shutdown(wait=False)

    output_name = output_folder + "summarise_output_" + today_rev + ".csv"
    output_name_parquet = output_folder + "summarise_output_" + today_rev + ".parquet"

    output_df = pd.DataFrame({"Original text": in_text_df[in_colname],
                              "Summarised text": summarised_texts})

    # Positional access: the index is inherited from the caller's DataFrame
    # and is not guaranteed to contain the label 0.
    summarised_text_out_str = str(output_df["Summarised text"].iloc[0])

    output_df.to_csv(output_name, index = None)
    output_df.to_parquet(output_name_parquet, index = None)

    outputs = [output_name, output_name_parquet]

    return summarised_text_out_str, outputs
# def summarise_text(text, text_df, length_slider, in_colname, model_type, progress=gr.Progress()): | |
# if text_df.empty: | |
# in_colname="text" | |
# in_colname_list_first = in_colname | |
# in_text_df = pd.DataFrame({in_colname_list_first:[text]}) | |
# else: | |
# in_text_df = text_df | |
# in_colname_list_first = in_colname | |
# print(model_type) | |
# texts_list = list(in_text_df[in_colname_list_first]) | |
# if model_type != "Phi 3 128k (24k tokens max)": | |
# summarised_texts = [] | |
# for single_text in progress.tqdm(texts_list, desc = "Summarising texts", unit = "texts"): | |
# summarised_text = chatf.model(single_text, max_length=length_slider) | |
# #print(summarised_text) | |
# summarised_text_str = summarised_text[0]['summary_text'] | |
# summarised_texts.append(summarised_text_str) | |
# print(summarised_text_str) | |
# #pd.Series(summarised_texts).to_csv("summarised_texts_out.csv") | |
# #print(summarised_texts) | |
# if model_type == "Phi 3 128k (24k tokens max)": | |
# # Define a function that calls your model | |
# def call_model(formatted_string, max_length=10000): | |
# return chatf.model(formatted_string, max_length=max_length) | |
# # Set your timeout duration (in seconds) | |
# timeout_duration = 300 # Adjust this value as needed | |
# length = str(length_slider) | |
# from chatfuncs.prompts import nous_capybara_prompt | |
# summarised_texts = [] | |
# for single_text in progress.tqdm(texts_list, desc = "Summarising texts", unit = "texts"): | |
# formatted_string = nous_capybara_prompt.format(length=length, text=single_text) | |
# # Use ThreadPoolExecutor to enforce a timeout | |
# with concurrent.futures.ThreadPoolExecutor() as executor: | |
# future = executor.submit(call_model, formatted_string, 10000) | |
# try: | |
# output = future.result(timeout=timeout_duration) | |
# # Process the output here | |
# except concurrent.futures.TimeoutError: | |
# error_text = f"Timeout (five minutes) occurred for text: {single_text}. Consider using a smaller model." | |
# print(error_text) | |
# return error_text, None | |
# print(output) | |
# output_str = output[0]['generated_text'] | |
# # Find the index of 'ASSISTANT: ' to select only text after this location | |
# index = output_str.find('ASSISTANT: ') | |
# # Check if 'ASSISTANT: ' is found in the string | |
# if index != -1: | |
# # Add the length of 'ASSISTANT: ' to the index to start from the end of this substring | |
# start_index = index + len('ASSISTANT: ') | |
# # Slice the string from this point to the end | |
# assistant_text = output_str[start_index:] | |
# else: | |
# assistant_text = "ASSISTANT: not found in text" | |
# print(assistant_text) | |
# summarised_texts.append(assistant_text) | |
# #print(summarised_text) | |
# #pd.Series(summarised_texts).to_csv("summarised_texts_out.csv") | |
# if text_df.empty: | |
# #if model_type != "Phi 3 128k (24k tokens max)": | |
# summarised_text_out = summarised_texts[0]#.values() | |
# #if model_type == "Phi 3 128k (24k tokens max)": | |
# # summarised_text_out = summarised_texts[0] | |
# else: | |
# summarised_text_out = summarised_texts #[d['summary_text'] for d in summarised_texts] #summarised_text[0].values() | |
# output_name = "summarise_output_" + today_rev + ".csv" | |
# output_df = pd.DataFrame({"Original text":in_text_df[in_colname_list_first], | |
# "Summarised text":summarised_text_out}) | |
# summarised_text_out_str = str(output_df["Summarised text"][0])#.str.replace("dict_values([","").str.replace("])","")) | |
# output_df.to_csv(output_name, index = None) | |
# return summarised_text_out_str, output_name | |