import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import minmax_scale
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Shared sentence-embedding model used for rows, columns, and answer strings.
sbert = SentenceTransformer("all-MiniLM-L6-v2")

def get_row_embedding(html_table):
    """Embed every <tr> element's 'description' attribute; return (embeddings, ids)."""

    def get_row_elements(html_table):
        tr_elements = []
        soup = BeautifulSoup(html_table, 'html.parser')
        for t in soup.find_all('tr'):
            # Skip rows that lack the id/description attributes the pipeline relies on,
            # rather than embedding the literal string "None".
            if t.get('id') is None or t.get('description') is None:
                continue
            tr_elements.append({'id': str(t.get('id')), 'text': " " + str(t.get('description'))})
        return tr_elements

    rows = get_row_elements(html_table)
    sentences = [row['text'] for row in rows]
    element_ids = [row['id'] for row in rows]

    embeddings = sbert.encode(sentences, convert_to_tensor=True).cpu().numpy()
    return embeddings, element_ids
    
def get_col_embedding(html_table):
    """Embed every <th> element's 'description' attribute; return (embeddings, ids)."""

    def get_column_elements(html_table):
        th_elements = []
        soup = BeautifulSoup(html_table, 'html.parser')
        for t in soup.find_all('th'):
            # Skip headers that lack the id/description attributes the pipeline relies on.
            if t.get('id') is None or t.get('description') is None:
                continue
            th_elements.append({'id': str(t.get('id')), 'text': " " + str(t.get('description'))})
        return th_elements

    cols = get_column_elements(html_table)
    sentences = [col['text'] for col in cols]
    element_ids = [col['id'] for col in cols]

    embeddings = sbert.encode(sentences, convert_to_tensor=True).cpu().numpy()
    return embeddings, element_ids

def normalize_list_numpy(list_numpy):
    # Min-max scale similarity scores into [0, 1] so they are comparable across queries.
    return minmax_scale(list_numpy)

def get_answer_embedding(answer):
    # Embed a single answer/fact string; returns a (1, dim) array.
    return sbert.encode([answer], convert_to_tensor=True).cpu().numpy()

def row_attribution(answer, html_table, topk=5, threshold=0.7):
    """Return the ids of the rows most similar to `answer`, best match first.

    `threshold` is accepted for API compatibility but is not applied here.
    """
    answer_embedding = get_answer_embedding(answer)
    row_embeddings, row_ids = get_row_embedding(html_table)

    sims = cosine_similarity(row_embeddings, answer_embedding).flatten()
    sims = normalize_list_numpy(sims)

    # Keep the larger of `topk` and 30% of the rows, capped at the row count.
    k = min(max(topk, int(0.3 * len(sims))), len(sims))
    top_k_indices = np.argpartition(sims, -k)[-k:]
    sorted_indices = top_k_indices[np.argsort(sims[top_k_indices])][::-1]
    return [row_ids[idx] for idx in sorted_indices]

def col_attribution(answer, html_table, topk=5, threshold=0.7):
    """Return the ids of the columns most similar to `answer`, best match first.

    `threshold` is accepted for API compatibility but is not applied here.
    """
    answer_embedding = get_answer_embedding(answer)
    col_embeddings, col_ids = get_col_embedding(html_table)

    sims = cosine_similarity(col_embeddings, answer_embedding).flatten()
    sims = normalize_list_numpy(sims)

    # Keep the larger of `topk` and 30% of the columns, capped at the column count.
    k = min(max(topk, int(0.3 * len(sims))), len(sims))
    top_k_indices = np.argpartition(sims, -k)[-k:]
    sorted_indices = top_k_indices[np.argsort(sims[top_k_indices])][::-1]
    return [col_ids[idx] for idx in sorted_indices]

def retain_rows_and_columns(augmented_html_table, row_ids, column_ids):
    """Return a copy of the table keeping only the rows and columns whose ids were attributed."""
    soup = BeautifulSoup(augmented_html_table, 'html.parser')

    row_ids = set(row_ids)
    column_ids = set(column_ids)

    all_rows = soup.find_all('tr')

    # Work out which column positions to drop from the header row *before* any
    # rows are removed, so the header cells are still available even if the
    # header row itself is not in row_ids.
    drop_positions = []
    if all_rows:
        header_cells = all_rows[0].find_all('th')
        drop_positions = [i for i, col in enumerate(header_cells)
                          if col.get('id') not in column_ids]

    # Retain specified rows and remove the others.
    for row in all_rows:
        if row.get('id') not in row_ids:
            row.decompose()

    # Remove unwanted columns. Capture the cell list once per row so earlier
    # deletions do not shift the positions of cells still to be removed.
    for row in soup.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        for i in drop_positions:
            if len(cells) > i:
                cells[i].decompose()

    return str(soup)
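
# Example usage (hypothetical markup, not pipeline data). Given a table whose
# <tr>/<th> elements carry `id` attributes:
#
#   <table>
#     <tr id="row_0"><th id="col_0">Name</th><th id="col_1">Score</th></tr>
#     <tr id="row_1"><td>Ada</td><td>9</td></tr>
#   </table>
#
# retain_rows_and_columns(table_html, ['row_0', 'row_1'], ['col_0']) keeps both
# rows but drops the second column ('col_1') from each of them.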

def get_embedding_attribution(augmented_html_table, decomposed_fact_list, topK=5, threshold=0.7):
    """Attribute each decomposed fact to table rows/columns, then prune the table.

    Returns the pruned HTML table plus the attributed row and column ids.
    `threshold` is accepted for API compatibility but is not applied here.
    """
    row_attribution_ids = []
    col_attribution_ids = []

    for answer in decomposed_fact_list:
        row_attribution_ids.extend(row_attribution(answer, augmented_html_table, topK))
        col_attribution_ids.extend(col_attribution(answer, augmented_html_table, topK))

    # Prune once, after every fact has contributed its attributed ids; pruning
    # inside the loop would redo the same work on each iteration.
    attributed_html_table = retain_rows_and_columns(
        augmented_html_table, row_attribution_ids, col_attribution_ids)

    return attributed_html_table, row_attribution_ids, col_attribution_ids
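
if __name__ == "__main__":
    # Minimal smoke test, a sketch rather than pipeline data. The table below is
    # hand-written for illustration: the `description` attributes on <tr>/<th>
    # are assumed to be attached by an upstream augmentation step, which is what
    # the parsers above expect.
    demo_table = """
    <table>
      <tr id="row_0">
        <th id="col_0" description="name of the country">Country</th>
        <th id="col_1" description="capital city of the country">Capital</th>
      </tr>
      <tr id="row_1" description="France, whose capital is Paris"><td>France</td><td>Paris</td></tr>
      <tr id="row_2" description="Japan, whose capital is Tokyo"><td>Japan</td><td>Tokyo</td></tr>
    </table>
    """
    facts = ["The capital of France is Paris"]
    pruned, row_ids, col_ids = get_embedding_attribution(demo_table, facts, topK=1)
    print("attributed rows:", row_ids)
    print("attributed cols:", col_ids)
    print(pruned)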