from typing import List

import pandas as pd
from h2o_wave import data, ui


def histogram_card(
    x,
    a=0.1,
    b=0.9,
    x_axis_description="text_length",
    histogram_box="first",
    title="Text Length (split by whitespace)",
):
    """Build a Wave plot card with an area histogram of the values in ``x``.

    The per-value counts and their quantile labels come from
    ``compute_quantile_df(x, a, b)``; its ``length`` column is renamed to
    ``x_axis_description`` before being handed to the plot.

    Args:
        x: Values to plot; forwarded unchanged to ``compute_quantile_df``.
        a: Lower quantile boundary, forwarded to ``compute_quantile_df``.
        b: Upper quantile boundary, forwarded to ``compute_quantile_df``.
        x_axis_description: Field name used for the x axis. Must not contain
            spaces.
        histogram_box: Value passed as ``box`` to ``ui.plot_card``.
        title: Card title.

    Returns:
        The object returned by ``ui.plot_card``.

    Raises:
        AssertionError: If ``x_axis_description`` contains a space.
        ValueError: Propagated from ``compute_quantile_df`` for empty ``x``
            or invalid ``a`` / ``b``.
    """
    assert " " not in x_axis_description, (
        "x_axis_description in histogram card must not contain spaces, "
        "as the card would not be rendered."
    )
    df_quantile = compute_quantile_df(x, a, b)
    df_quantile = df_quantile.rename(columns={"length": x_axis_description})
    card = ui.plot_card(
        box=histogram_box,
        title=title,
        data=data(
            fields=df_quantile.columns.tolist(),
            rows=df_quantile.values.tolist(),
            pack=True,
        ),
        plot=ui.plot(
            marks=[
                ui.mark(
                    type="area",
                    x=f"={x_axis_description}",
                    x_title=f"Total samples: {len(x)}",
                    y="=count",
                    y_title="Count",
                    # One coloured area per quantile label.
                    color="=data_type",
                    shape="circle",
                )
            ]
        ),
    )
    return card


def compute_quantile_df(x: List[int], a: float, b: float) -> pd.DataFrame:
    """
    Compute the quantiles based on the input list x.

    Returns a dataframe with the following columns:
    - length: length of the text
    - count: number of texts with this length
    - data_type: quantile type
        (first (a * 100)% quantile,
        (a * 100)%-(b * 100)% quantile,
        last (100 - b * 100)% quantile)

    Note that quantiles are overlapping on the edges: the boundary row of the
    first and of the last section is repeated in the middle section.

    Args:
        x: Non-empty list of values (text lengths).
        a: Lower quantile boundary, ``0 <= a <= b``.
        b: Upper quantile boundary, ``a <= b <= 1``.

    Raises:
        ValueError: If ``x`` is empty or ``a`` / ``b`` violate
            ``0 <= a <= b <= 1``.
    """
    if not x:
        raise ValueError("Input list x is empty")

    if not 0 <= a <= b <= 1:
        raise ValueError(
            "Values of a and b must be in [0, 1] "
            "and a should be less than or equal to b"
        )

    x_axis_description = "length"
    df = pd.DataFrame(x, columns=[x_axis_description])
    df["count"] = 1
    # One row per distinct value, ascending, with a fresh 0..k-1 index that the
    # label-based slice below relies on.
    df_quantile = (
        df.groupby([x_axis_description])
        .sum()
        .reset_index()
        .sort_values(by=x_axis_description)[[x_axis_description, "count"]]
    )

    sorted_data = sorted(x)
    n_samples = len(sorted_data)

    # Clamp so that a == 1 selects the largest value instead of indexing one
    # past the end of the list.
    first_index = min(int(n_samples * a), n_samples - 1)
    first_quantile = sorted_data[first_index]

    # Use a positive index: with a negative index a tail size of 0 (b == 1 or
    # a short input) would become ``sorted_data[-0]``, i.e. the *smallest*
    # value, and every row would be labelled as the last quantile.
    tail_size = int(n_samples * (1 - b))
    last_index = min(n_samples - tail_size, n_samples - 1)
    last_quantile = sorted_data[last_index]

    df_first = df_quantile.loc[df_quantile[x_axis_description] <= first_quantile].copy()
    df_first["data_type"] = f"first {int(a * 100)}% quantile"
    df_last = df_quantile.loc[df_quantile[x_axis_description] >= last_quantile].copy()
    df_last["data_type"] = f"last {100 - int(b * 100)}% quantile"
    # Label the full frame as "middle" only after the copies above were taken.
    df_quantile["data_type"] = f"{int(a * 100)}%-{int(b * 100)}% quantile"

    # ``.loc`` slices are inclusive on both ends: start on the last row of the
    # first section and stop on the first row of the last section, so the
    # sections overlap on the edges.
    middle_quantile_min = max(0, len(df_first) - 1)
    middle_quantile_max = len(df_quantile) - len(df_last)
    df_quantile = pd.concat(
        [
            df_first,
            df_quantile.loc[middle_quantile_min:middle_quantile_max],
            df_last,
        ]
    )
    return df_quantile