import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import minmax_scale
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Shared sentence-embedding model used for rows, columns, and answer strings.
sbert = SentenceTransformer("all-MiniLM-L6-v2")

def get_row_embedding(html_table):
    """Embed every <tr> element's 'description' attribute; return (embeddings, ids)."""

    def get_row_elements(html_table):
        tr_elements = []
        soup = BeautifulSoup(html_table, 'html.parser')
        for t in soup.find_all('tr'):
            # Skip rows that lack the id/description attributes the pipeline relies on,
            # rather than embedding the literal string "None".
            if t.get('id') is None or t.get('description') is None:
                continue
            tr_elements.append({'id': str(t.get('id')), 'text': " " + str(t.get('description'))})
        return tr_elements

    rows = get_row_elements(html_table)
    sentences = [row['text'] for row in rows]
    element_ids = [row['id'] for row in rows]

    embeddings = sbert.encode(sentences, convert_to_tensor=True).cpu().numpy()
    return embeddings, element_ids
    
def get_col_embedding(html_table):
    """Embed every <th> element's 'description' attribute; return (embeddings, ids)."""

    def get_column_elements(html_table):
        th_elements = []
        soup = BeautifulSoup(html_table, 'html.parser')
        for t in soup.find_all('th'):
            # Skip headers that lack the id/description attributes the pipeline relies on.
            if t.get('id') is None or t.get('description') is None:
                continue
            th_elements.append({'id': str(t.get('id')), 'text': " " + str(t.get('description'))})
        return th_elements

    cols = get_column_elements(html_table)
    sentences = [col['text'] for col in cols]
    element_ids = [col['id'] for col in cols]

    embeddings = sbert.encode(sentences, convert_to_tensor=True).cpu().numpy()
    return embeddings, element_ids

def normalize_list_numpy(list_numpy):
    # Min-max scale similarity scores into [0, 1] so they are comparable across queries.
    return minmax_scale(list_numpy)

def get_answer_embedding(answer):
    # Embed a single answer/fact string; returns a (1, dim) array.
    return sbert.encode([answer], convert_to_tensor=True).cpu().numpy()

def row_attribution(answer, html_table, topk=5, threshold=0.7):
    """Return the ids of the rows most similar to `answer`, best match first.

    `threshold` is accepted for API compatibility but is not applied here.
    """
    answer_embedding = get_answer_embedding(answer)
    row_embeddings, row_ids = get_row_embedding(html_table)

    sims = cosine_similarity(row_embeddings, answer_embedding).flatten()
    sims = normalize_list_numpy(sims)

    # Keep the larger of `topk` and 30% of the rows, capped at the row count.
    k = min(max(topk, int(0.3 * len(sims))), len(sims))
    top_k_indices = np.argpartition(sims, -k)[-k:]
    sorted_indices = top_k_indices[np.argsort(sims[top_k_indices])][::-1]
    return [row_ids[idx] for idx in sorted_indices]

def col_attribution(answer, html_table, topk=5, threshold=0.7):
    """Return the ids of the columns most similar to `answer`, best match first.

    `threshold` is accepted for API compatibility but is not applied here.
    """
    answer_embedding = get_answer_embedding(answer)
    col_embeddings, col_ids = get_col_embedding(html_table)

    sims = cosine_similarity(col_embeddings, answer_embedding).flatten()
    sims = normalize_list_numpy(sims)

    # Keep the larger of `topk` and 30% of the columns, capped at the column count.
    k = min(max(topk, int(0.3 * len(sims))), len(sims))
    top_k_indices = np.argpartition(sims, -k)[-k:]
    sorted_indices = top_k_indices[np.argsort(sims[top_k_indices])][::-1]
    return [col_ids[idx] for idx in sorted_indices]

def retain_rows_and_columns(augmented_html_table, row_ids, column_ids):
    """Return a copy of the table keeping only the rows and columns whose ids were attributed."""
    soup = BeautifulSoup(augmented_html_table, 'html.parser')

    row_ids = set(row_ids)
    column_ids = set(column_ids)

    all_rows = soup.find_all('tr')

    # Work out which column positions to drop from the header row *before* any
    # rows are removed, so the header cells are still available even if the
    # header row itself is not in row_ids.
    drop_positions = []
    if all_rows:
        header_cells = all_rows[0].find_all('th')
        drop_positions = [i for i, col in enumerate(header_cells)
                          if col.get('id') not in column_ids]

    # Retain specified rows and remove the others.
    for row in all_rows:
        if row.get('id') not in row_ids:
            row.decompose()

    # Remove unwanted columns. Capture the cell list once per row so earlier
    # deletions do not shift the positions of cells still to be removed.
    for row in soup.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        for i in drop_positions:
            if len(cells) > i:
                cells[i].decompose()

    return str(soup)
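
# Example usage (hypothetical markup, not pipeline data). Given a table whose
# <tr>/<th> elements carry `id` attributes:
#
#   <table>
#     <tr id="row_0"><th id="col_0">Name</th><th id="col_1">Score</th></tr>
#     <tr id="row_1"><td>Ada</td><td>9</td></tr>
#   </table>
#
# retain_rows_and_columns(table_html, ['row_0', 'row_1'], ['col_0']) keeps both
# rows but drops the second column ('col_1') from each of them.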

def get_embedding_attribution(augmented_html_table, decomposed_fact_list, topK=5, threshold=0.7):
    """Attribute each decomposed fact to table rows/columns, then prune the table.

    Returns the pruned HTML table plus the attributed row and column ids.
    `threshold` is accepted for API compatibility but is not applied here.
    """
    row_attribution_ids = []
    col_attribution_ids = []

    for answer in decomposed_fact_list:
        row_attribution_ids.extend(row_attribution(answer, augmented_html_table, topK))
        col_attribution_ids.extend(col_attribution(answer, augmented_html_table, topK))

    # Prune once, after every fact has contributed its attributed ids; pruning
    # inside the loop would redo the same work on each iteration.
    attributed_html_table = retain_rows_and_columns(
        augmented_html_table, row_attribution_ids, col_attribution_ids)

    return attributed_html_table, row_attribution_ids, col_attribution_ids
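
if __name__ == "__main__":
    # Minimal smoke test, a sketch rather than pipeline data. The table below is
    # hand-written for illustration: the `description` attributes on <tr>/<th>
    # are assumed to be attached by an upstream augmentation step, which is what
    # the parsers above expect.
    demo_table = """
    <table>
      <tr id="row_0">
        <th id="col_0" description="name of the country">Country</th>
        <th id="col_1" description="capital city of the country">Capital</th>
      </tr>
      <tr id="row_1" description="France, whose capital is Paris"><td>France</td><td>Paris</td></tr>
      <tr id="row_2" description="Japan, whose capital is Tokyo"><td>Japan</td><td>Tokyo</td></tr>
    </table>
    """
    facts = ["The capital of France is Paris"]
    pruned, row_ids, col_ids = get_embedding_attribution(demo_table, facts, topK=1)
    print("attributed rows:", row_ids)
    print("attributed cols:", col_ids)
    print(pruned)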