# NOTE(review): the original three lines here ("Spaces:", "Build error",
# "Build error") were build-log/extraction artifacts, not Python source.
# Converted to this comment so the module parses.
# Third-party dependencies (duplicate numpy import removed; imports grouped
# together ahead of the module-level model load).
import numpy as np
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import minmax_scale
from sentence_transformers import SentenceTransformer, util

# NOTE(review): wildcard import kept for backward compatibility — names from
# llm_query_api may be relied on elsewhere in the project; prefer explicit
# imports once its public API is confirmed.
from llm_query_api import *

# Shared SBERT encoder used by every embedding helper in this module.
# Loading the model is expensive, so it happens once at import time.
sbert = SentenceTransformer("all-MiniLM-L6-v2")
def get_row_embedding(html_table):
    """Embed every ``<tr>`` element of *html_table* with SBERT.

    Each row's text comes from its ``description`` attribute, stringified —
    a missing attribute therefore yields the literal text " None"
    (preserved for backward compatibility with existing embeddings).

    Parameters
    ----------
    html_table : str
        HTML markup containing ``<tr id=... description=...>`` elements.

    Returns
    -------
    tuple[numpy.ndarray, list[str]]
        (embedding matrix with one row per ``<tr>``, list of row ids in the
        same order).
    """
    def _row_elements(markup):
        # Collect id/text for every <tr>. Tag.get never raises, so the
        # original try/except-pass here was dead code and is removed.
        soup = BeautifulSoup(markup, 'html.parser')
        return [
            {'id': str(tag.get('id')), 'text': " " + str(tag.get('description'))}
            for tag in soup.find_all('tr')
        ]

    elements = _row_elements(html_table)
    sentences = [el['text'] for el in elements]
    element_ids = [el['id'] for el in elements]
    embeddings = sbert.encode(sentences, convert_to_tensor=True).cpu().numpy()
    return embeddings, element_ids
def get_col_embedding(html_table):
    """Embed every ``<th>`` (column header) element of *html_table* with SBERT.

    Each column's text comes from its ``description`` attribute, stringified —
    a missing attribute therefore yields the literal text " None"
    (preserved for backward compatibility with existing embeddings).

    Parameters
    ----------
    html_table : str
        HTML markup containing ``<th id=... description=...>`` elements.

    Returns
    -------
    tuple[numpy.ndarray, list[str]]
        (embedding matrix with one row per ``<th>``, list of column ids in
        the same order).
    """
    def _column_elements(markup):
        # Collect id/text for every <th>. Tag.get never raises, so the
        # original try/except-pass here was dead code and is removed.
        soup = BeautifulSoup(markup, 'html.parser')
        return [
            {'id': str(tag.get('id')), 'text': " " + str(tag.get('description'))}
            for tag in soup.find_all('th')
        ]

    elements = _column_elements(html_table)
    sentences = [el['text'] for el in elements]
    element_ids = [el['id'] for el in elements]
    embeddings = sbert.encode(sentences, convert_to_tensor=True).cpu().numpy()
    return embeddings, element_ids
def normalize_list_numpy(list_numpy):
    """Min-max scale *list_numpy* into the [0, 1] range.

    Pure-NumPy equivalent of ``sklearn.preprocessing.minmax_scale`` for 1-D
    input: ``(x - min) / (max - min)``, with a constant input mapped to all
    zeros (mirroring sklearn's zero-range handling) instead of dividing by
    zero, and an empty input returned unchanged.

    Parameters
    ----------
    list_numpy : array-like of numbers

    Returns
    -------
    numpy.ndarray
        Float array in [0, 1], same length as the input.
    """
    arr = np.asarray(list_numpy, dtype=float)
    if arr.size == 0:
        return arr
    lo = arr.min()
    span = arr.max() - lo
    if span == 0.0:
        # Constant input: sklearn treats a zero range as 1, yielding zeros.
        return np.zeros_like(arr)
    return (arr - lo) / span
def get_answer_embedding(answer):
    """Encode a single answer string into a (1, dim) SBERT embedding matrix."""
    encoded = sbert.encode([answer], convert_to_tensor=True)
    return encoded.cpu().numpy()
def row_attribution(answer, html_table, topk=5, threshold = 0.7):
    """Return ids of the table rows most similar to *answer*.

    Cosine similarities between the SBERT embedding of *answer* and each
    ``<tr>`` embedding are min-max normalized, then the ids of the top-k
    rows are returned in descending similarity order. k is
    ``max(topk, 30% of the row count)``, capped at the number of rows.

    Parameters
    ----------
    answer : str
    html_table : str
    topk : int, optional
        Minimum number of rows to return (when enough rows exist).
    threshold : float, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    list[str]
        Row ids, most similar first; empty when the table has no rows.
    """
    answer_embedding = get_answer_embedding(answer)
    row_embeddings, row_ids = get_row_embedding(html_table)
    sims = cosine_similarity(row_embeddings, answer_embedding.reshape(1, -1)).flatten()
    sims = normalize_list_numpy(sims)
    if len(sims) == 0:
        # Guard: np.argpartition would raise on an empty array.
        return []
    # Take at least `topk` rows, or 30% of all rows for large tables,
    # but never more rows than exist.
    k = min(max(topk, int(0.3 * len(sims))), len(sims))
    top_k_indices = np.argpartition(sims, -k)[-k:]
    sorted_indices = top_k_indices[np.argsort(sims[top_k_indices])][::-1]
    return [row_ids[idx] for idx in sorted_indices]
def col_attribution(answer, html_table, topk=5, threshold = 0.7):
    """Return ids of the table columns most similar to *answer*.

    Cosine similarities between the SBERT embedding of *answer* and each
    ``<th>`` embedding are min-max normalized, then the ids of the top-k
    columns are returned in descending similarity order. k is
    ``max(topk, 30% of the column count)``, capped at the number of columns.

    Parameters
    ----------
    answer : str
    html_table : str
    topk : int, optional
        Minimum number of columns to return (when enough columns exist).
    threshold : float, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    list[str]
        Column ids, most similar first; empty when the table has no columns.
    """
    answer_embedding = get_answer_embedding(answer)
    col_embeddings, col_ids = get_col_embedding(html_table)
    sims = cosine_similarity(col_embeddings, answer_embedding.reshape(1, -1)).flatten()
    sims = normalize_list_numpy(sims)
    if len(sims) == 0:
        # Guard: np.argpartition would raise on an empty array.
        return []
    # Take at least `topk` columns, or 30% of all columns for large tables,
    # but never more columns than exist.
    k = min(max(topk, int(0.3 * len(sims))), len(sims))
    top_k_indices = np.argpartition(sims, -k)[-k:]
    sorted_indices = top_k_indices[np.argsort(sims[top_k_indices])][::-1]
    return [col_ids[idx] for idx in sorted_indices]
def retain_rows_and_columns(augmented_html_table, row_ids, column_ids):
    """Prune an HTML table, keeping only rows/columns with the given ids.

    Fixes two defects in the earlier version: (1) the header row used for
    column positions was looked up *before* row pruning, so it could be an
    already-decomposed row; (2) cells were decomposed while indexing into a
    live cell list, so removing column i shifted the indices of later
    columns and caused wrong/skipped removals.

    Parameters
    ----------
    augmented_html_table : str
        HTML with ``<tr id=...>`` rows and ``<th id=...>`` header cells.
    row_ids, column_ids : iterable of str
        Ids of the rows / header columns to retain (duplicates ignored).

    Returns
    -------
    str
        Serialized HTML with all other rows and columns removed.
    """
    soup = BeautifulSoup(augmented_html_table, 'html.parser')
    keep_rows = set(row_ids)
    keep_cols = set(column_ids)

    # Drop every row whose id was not selected.
    for row in soup.find_all('tr'):
        if row.get('id') not in keep_rows:
            row.decompose()

    # Re-query rows so the header lookup sees only surviving rows.
    remaining_rows = soup.find_all('tr')
    if remaining_rows:
        header_cells = remaining_rows[0].find_all(['th'])
        # Header positions (original indices) of the columns to drop.
        drop_indices = [i for i, col in enumerate(header_cells)
                        if col.get('id') not in keep_cols]
        for row in remaining_rows:
            # Snapshot the cell list once per row, then decompose by original
            # index — decomposing one cell must not shift later indices.
            cells = row.find_all(['td', 'th'])
            for i in drop_indices:
                if len(cells) > i:
                    cells[i].decompose()
    return str(soup)
def get_embedding_attribution(augmented_html_table, decomposed_fact_list, topK=5, threshold = 0.7):
    """Attribute each decomposed fact to table rows and columns via embeddings.

    For every fact, the top-matching row ids and column ids are collected,
    and the table is pruned down to the union of those rows and columns.

    Parameters
    ----------
    augmented_html_table : str
        HTML table whose <tr>/<th> elements carry id/description attributes.
    decomposed_fact_list : list[str]
        Facts to attribute against the table.
    topK : int, optional
        Minimum rows/columns retained per fact.
    threshold : float, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    tuple[str, list[str], list[str]]
        (pruned HTML table, matched row ids, matched column ids).
    """
    row_attribution_ids = []
    col_attribution_ids = []
    for fact in decomposed_fact_list:
        row_attribution_ids.extend(row_attribution(fact, augmented_html_table, topK))
        col_attribution_ids.extend(col_attribution(fact, augmented_html_table, topK))
    attributed_html_table = retain_rows_and_columns(
        augmented_html_table, row_attribution_ids, col_attribution_ids
    )
    return attributed_html_table, row_attribution_ids, col_attribution_ids