|
from typing import List |
|
|
|
import pandas as pd |
|
from h2o_wave import data, ui |
|
|
|
|
|
def histogram_card(
    x,
    a=0.1,
    b=0.9,
    x_axis_description="text_length",
    histogram_box="first",
    title="Text Length (split by whitespace)",
):
    """
    Build a plot card with an area chart of value counts, colored by quantile.

    The quantile table is produced by compute_quantile_df(x, a, b); its
    "length" column is renamed to x_axis_description before it is handed to
    the plot, so that name is what the x axis is bound to.

    Args:
        x: values to plot; len(x) is shown in the x axis title.
        a: forwarded to compute_quantile_df.
        b: forwarded to compute_quantile_df.
        x_axis_description: field name used for the x axis. Must not contain
            spaces (checked with an assert).
        histogram_box: passed as the `box` of the plot card.
        title: passed as the `title` of the plot card.

    Returns:
        The card created by ui.plot_card.
    """
    assert " " not in x_axis_description, (
        "x_axis_description in histogram card must not contain spaces, "
        "as the card would not be rendered."
    )

    # Quantile table with its value column exposed under the requested name.
    quantiles = compute_quantile_df(x, a, b).rename(
        columns={"length": x_axis_description}
    )

    plot_data = data(
        fields=quantiles.columns.tolist(),
        rows=quantiles.values.tolist(),
        pack=True,
    )

    # Single area mark; the "=" prefix binds an attribute to a data field.
    area_mark = ui.mark(
        type="area",
        x=f"={x_axis_description}",
        x_title=f"Total samples: {len(x)}",
        y="=count",
        y_title="Count",
        color="=data_type",
        shape="circle",
    )

    return ui.plot_card(
        box=histogram_box,
        title=title,
        data=plot_data,
        plot=ui.plot(marks=[area_mark]),
    )
|
|
|
|
|
def compute_quantile_df(x: List[int], a: float, b: float):
    """
    Count the values of x and label the counts with three quantile bands.

    Every distinct value of x becomes one row holding how often it occurs.
    Rows at or below the lower threshold form the "first" band, rows at or
    above the upper threshold form the "last" band, and the rows in between
    form the middle band.

    Note that quantiles are overlapping on the edges: the last row of the
    first band and the first row of the last band are also emitted as part
    of the middle band.

    Args:
        x: non-empty list of lengths.
        a: fraction in [0, 1]; position of the lower threshold in sorted x.
        b: fraction in [a, 1]; position of the upper threshold in sorted x.

    Returns:
        A dataframe with the following columns:
        - length: length of the text
        - count: number of texts with this length
        - data_type: quantile type
          (first (a * 100)% quantile, (a * 100)%-(b * 100)% quantile,
          last (100 * (1 - b))% quantile)

    Raises:
        ValueError: if x is empty, or if 0 <= a <= b <= 1 does not hold.
    """
    if not x:
        raise ValueError("Input list x is empty")

    if not 0 <= a <= b <= 1:
        raise ValueError(
            "Values of a and b must be in [0, 1] "
            "and a should be less than or equal to b"
        )

    x_axis_description = "length"
    df = pd.DataFrame(x, columns=[x_axis_description])
    df["count"] = 1
    # One row per distinct value, ascending, with a fresh 0..n-1 index.
    df_quantile = (
        df.groupby([x_axis_description])
        .sum()
        .reset_index()
        .sort_values(by=x_axis_description)[[x_axis_description, "count"]]
    )

    sorted_data = sorted(x)
    n_samples = len(sorted_data)
    max_index = n_samples - 1

    # Clamp so that a == 1 selects the largest value instead of indexing
    # one past the end.
    first_quantile = sorted_data[min(int(n_samples * a), max_index)]

    # Use a positive index for the upper threshold: when the upper band
    # rounds down to zero samples, a negative index of -0 would silently
    # select sorted_data[0] (the minimum) and mark everything as "last".
    upper_band_size = int(n_samples * (1 - b))
    last_quantile = sorted_data[min(n_samples - upper_band_size, max_index)]

    df_first = df_quantile.loc[df_quantile[x_axis_description] <= first_quantile].copy()
    df_first["data_type"] = f"first {int(a * 100)}% quantile"
    df_last = df_quantile.loc[df_quantile[x_axis_description] >= last_quantile].copy()
    df_last["data_type"] = f"last {100 - int(b * 100)}% quantile"
    df_quantile["data_type"] = f"{int(a * 100)}%-{int(b * 100)}% quantile"

    # The middle band runs from the last row of the first band up to and
    # including the first row of the last band (the documented overlap).
    middle_start = max(0, len(df_first) - 1)
    middle_end = len(df_quantile) - len(df_last)

    df_quantile = pd.concat(
        [
            df_first,
            df_quantile.iloc[middle_start : middle_end + 1],
            df_last,
        ]
    )
    return df_quantile
|
|