Spaces:
Build error
Build error
File size: 5,139 Bytes
35d31f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# Third-party dependencies (duplicate numpy import removed, groups ordered).
from bs4 import BeautifulSoup
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import minmax_scale

# Project-local helpers (wildcard kept: names may be used by callers of this
# module -- TODO narrow to explicit imports once usage is confirmed).
from llm_query_api import *

# Shared sentence-embedding model, loaded once at import time.
sbert = SentenceTransformer("all-MiniLM-L6-v2")
def get_row_embedding(html_table):
    """Embed every ``<tr>`` element of *html_table* with the shared SBERT model.

    Each row's text is its ``description`` attribute passed through ``str()``,
    so a missing attribute contributes the literal text " None" (this matches
    the original behavior -- TODO confirm rows always carry id/description).

    Args:
        html_table: HTML string containing ``<tr id=... description=...>`` tags.

    Returns:
        tuple: ``(embeddings, element_ids)`` -- an ``(n_rows, dim)`` numpy
        array and the parallel list of row ``id`` attributes as strings.
    """
    soup = BeautifulSoup(html_table, 'html.parser')
    sentences = []
    element_ids = []
    # The original wrapped the append in a bare try/except, but building a
    # dict/list from .get() results cannot raise -- the handler was dead code
    # that would only have masked real bugs, so it is removed.
    for tr in soup.find_all('tr'):
        sentences.append(" " + str(tr.get('description')))
        element_ids.append(str(tr.get('id')))
    # Encode on whatever device the model uses, then move to numpy so the
    # result feeds directly into sklearn's cosine_similarity.
    embeddings = sbert.encode(sentences, convert_to_tensor=True).cpu().numpy()
    return embeddings, element_ids
def get_col_embedding(html_table):
    """Embed every ``<th>`` element of *html_table* with the shared SBERT model.

    Column text is the header's ``description`` attribute passed through
    ``str()``, so a missing attribute contributes " None" (matches the
    original behavior -- TODO confirm headers always carry id/description).

    Args:
        html_table: HTML string containing ``<th id=... description=...>`` tags.

    Returns:
        tuple: ``(embeddings, element_ids)`` -- an ``(n_cols, dim)`` numpy
        array and the parallel list of header ``id`` attributes as strings.
    """
    soup = BeautifulSoup(html_table, 'html.parser')
    sentences = []
    element_ids = []
    # As in get_row_embedding: the original bare try/except around the append
    # could never fire and only hid bugs; removed.
    for th in soup.find_all('th'):
        sentences.append(" " + str(th.get('description')))
        element_ids.append(str(th.get('id')))
    embeddings = sbert.encode(sentences, convert_to_tensor=True).cpu().numpy()
    return embeddings, element_ids
def normalize_list_numpy(list_numpy):
    """Min-max scale *list_numpy* into the [0, 1] range via sklearn."""
    return minmax_scale(list_numpy)
def get_answer_embedding(answer):
    """Encode a single answer string into a (1, dim) numpy embedding."""
    encoded = sbert.encode([answer], convert_to_tensor=True)
    return encoded.cpu().numpy()
def row_attribution(answer, html_table, topk=5, threshold = 0.7):
    """Return ids of the table rows most semantically similar to *answer*.

    Args:
        answer: Answer/fact text to attribute.
        html_table: HTML table whose ``<tr>`` elements are scored.
        topk: Minimum number of rows to return (table size permitting).
        threshold: Currently unused; kept for interface compatibility.

    Returns:
        list[str]: Row ids sorted by descending similarity; empty list when
        the table contains no ``<tr>`` elements.
    """
    answer_embedding = get_answer_embedding(answer)
    row_embeddings, row_ids = get_row_embedding(html_table)
    if len(row_ids) == 0:
        # Guard: np.argpartition would raise on an empty similarity array.
        return []
    similarities = cosine_similarity(row_embeddings, answer_embedding.reshape(1, -1))
    sims = normalize_list_numpy(similarities.flatten())
    # Keep at least `topk` rows, or 30% of the rows for large tables,
    # capped at the number of rows actually present.
    k = min(max(topk, int(0.3 * len(sims))), len(sims))
    top_k_indices = np.argpartition(sims, -k)[-k:]
    # argpartition gives an unordered top-k; sort it by descending similarity.
    sorted_indices = top_k_indices[np.argsort(sims[top_k_indices])][::-1]
    return [row_ids[idx] for idx in sorted_indices]
def col_attribution(answer, html_table, topk=5, threshold = 0.7):
    """Return ids of the table columns most semantically similar to *answer*.

    Args:
        answer: Answer/fact text to attribute.
        html_table: HTML table whose ``<th>`` elements are scored.
        topk: Minimum number of columns to return (table size permitting).
        threshold: Currently unused; kept for interface compatibility.

    Returns:
        list[str]: Column ids sorted by descending similarity; empty list
        when the table contains no ``<th>`` elements.
    """
    answer_embedding = get_answer_embedding(answer)
    col_embeddings, col_ids = get_col_embedding(html_table)
    if len(col_ids) == 0:
        # Guard: np.argpartition would raise on an empty similarity array.
        return []
    similarities = cosine_similarity(col_embeddings, answer_embedding.reshape(1, -1))
    sims = normalize_list_numpy(similarities.flatten())
    # Keep at least `topk` columns, or 30% of the columns for wide tables,
    # capped at the number of columns actually present.
    k = min(max(topk, int(0.3 * len(sims))), len(sims))
    top_k_indices = np.argpartition(sims, -k)[-k:]
    # argpartition gives an unordered top-k; sort it by descending similarity.
    sorted_indices = top_k_indices[np.argsort(sims[top_k_indices])][::-1]
    return [col_ids[idx] for idx in sorted_indices]
def retain_rows_and_columns(augmented_html_table, row_ids, column_ids):
    """Return *augmented_html_table* reduced to the given row and column ids.

    Rows are matched by their ``id`` attribute. Columns are matched by the
    ``id`` of the ``<th>`` cells in the FIRST row, and the corresponding cell
    position is removed from every row.

    Args:
        augmented_html_table: HTML table string with id-annotated rows/headers.
        row_ids: Iterable of row ids to keep.
        column_ids: Iterable of column (header) ids to keep.

    Returns:
        str: The pruned HTML table.
    """
    soup = BeautifulSoup(augmented_html_table, 'html.parser')
    row_ids = set(row_ids)
    column_ids = set(column_ids)
    all_rows = soup.find_all('tr')
    # Decide which column POSITIONS to drop before any row is decomposed:
    # the original read all_rows[0] after row removal, so when the header row
    # itself was not retained it operated on a destroyed tag.
    drop_positions = []
    if all_rows:
        for i, col in enumerate(all_rows[0].find_all('th')):
            if col.get('id') not in column_ids:
                drop_positions.append(i)
    # Remove rows that were not attributed.
    for row in all_rows:
        if row.get('id') not in row_ids:
            row.decompose()
    # Remove the dropped column positions from every surviving row. Capture
    # each row's cells ONCE: the original re-ran find_all after every
    # decompose, so removing an earlier column shifted the indices of later
    # columns and the wrong cells were deleted.
    for row in soup.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        for i in drop_positions:
            if len(cells) > i:
                cells[i].decompose()
    return str(soup)
def get_embedding_attribution(augmented_html_table, decomposed_fact_list, topK=5, threshold = 0.7):
    """Attribute each decomposed fact to table rows/columns, then prune.

    For every fact, collect its top-k row and column ids via embedding
    similarity, then keep only the union of attributed rows/columns in the
    table. Returns ``(pruned_table, row_ids, col_ids)``.
    """
    row_attribution_ids = []
    col_attribution_ids = []
    for fact in decomposed_fact_list:
        row_attribution_ids.extend(row_attribution(fact, augmented_html_table, topK))
        col_attribution_ids.extend(col_attribution(fact, augmented_html_table, topK))
    attributed_html_table = retain_rows_and_columns(
        augmented_html_table, row_attribution_ids, col_attribution_ids
    )
    return attributed_html_table, row_attribution_ids, col_attribution_ids
|