"""Agents that attribute a natural-language answer to rows and columns of an HTML table."""

# NOTE: the import order below is kept as in the original file. The wildcard
# imports can shadow earlier names, so reordering them could change which
# object a name resolves to.
import json
from prompts import *
import ast
from bs4 import BeautifulSoup
from semantic_retrieval import *
from llm_query_api import *
import base64
from mimetypes import guess_type


class InputInstance:
    """Plain container for one example: an id, an HTML table, a question and an answer."""

    def __init__(self, id=None, html_table=None, question=None, answer=None):
        self.id = id
        self.html_table = html_table
        self.question = question
        self.answer = answer


class MATSA:
    """Bundle of LLM-backed agents operating on an HTML table and an answer string.

    Attributes:
        llm: Model identifier passed to ``get_llm_response`` by most agents.
        llm_query_api: ``LLMQueryAPI`` client used for every LLM call.
    """

    def __init__(self, llm="gpt-4"):
        self.llm = llm
        # The original source carried a commented-out alternative here:
        # LLMProxyQueryAPI()
        self.llm_query_api = LLMQueryAPI()

    def table_formatting_agent(self, html_table=None, table_image_path=None):
        """Return the table HTML with ``id`` attributes on rows and header cells.

        When ``table_image_path`` is given, the table is first extracted from
        that file with OCR, and the extracted HTML together with the image
        (as a data URL) is sent to the "gpt-4V" model; the model's response
        replaces ``html_table``.

        Every ``<tr>`` receives ``id="row-N"`` (1-based, document order) and
        every ``<th>`` of the first ``<tr>`` receives ``id="col-N"``.

        Args:
            html_table: HTML markup of the table. Ignored (overwritten) when
                ``table_image_path`` is provided.
            table_image_path: Optional path to a file containing the table.

        Returns:
            The annotated markup as a string.
        """

        def local_image_to_data_url(image_path):
            """Encode the file at ``image_path`` as a base64 ``data:`` URL."""
            mime_type, _ = guess_type(image_path)
            if mime_type is None:
                # Generic fallback when the extension gives no hint.
                mime_type = 'application/octet-stream'
            with open(image_path, "rb") as image_file:
                base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')
            return f"data:{mime_type};base64,{base64_encoded_data}"

        if table_image_path is not None:
            # NOTE(review): TesseractOCR and PDF are not imported explicitly in
            # this file; presumably one of the wildcard imports provides them.
            # Confirm, otherwise this branch raises NameError.
            tesseract = TesseractOCR()
            pdf = PDF(src=table_image_path, pages=[0, 0])
            extracted_tables = pdf.extract_tables(
                ocr=tesseract,
                implicit_rows=True,
                borderless_tables=True,
            )
            html_table = extracted_tables[0][0].html_repr()
            table_image_data_url = local_image_to_data_url(table_image_path)
            query = table_image_to_html_prompt.replace("{{html_table}}", html_table)
            # Use the instance's client, like every other agent in this class;
            # the bare name ``llm_query_api`` is not bound by the wildcard import.
            html_table = self.llm_query_api.get_llm_response("gpt-4V", query, table_image_data_url)

        soup = BeautifulSoup(html_table, 'html.parser')
        for row_number, tr_tag in enumerate(soup.find_all('tr'), start=1):
            tr_tag['id'] = f"row-{row_number}"
            if row_number == 1:
                # Only the first row's header cells get column IDs.
                for col_number, th_tag in enumerate(tr_tag.find_all('th'), start=1):
                    th_tag['id'] = f"col-{col_number}"
        return str(soup)

    def description_augmentation_agent(self, html_table):
        """Run the column, row and trend description prompts in sequence.

        Each prompt has its ``{{html_table}}`` placeholder filled with the
        output of the previous step (starting from ``html_table``) and is sent
        to ``self.llm``.

        Args:
            html_table: Table markup to start from; converted with ``str``.

        Returns:
            The LLM response to the final (trend) prompt.
        """
        augmented_table = html_table
        for prompt in (col_description_prompt, row_description_prompt, trend_description_prompt):
            query = prompt.replace("{{html_table}}", str(augmented_table))
            augmented_table = self.llm_query_api.get_llm_response(self.llm, query)
        return augmented_table

    def answer_decomposition_agent(self, answer):
        """Ask the LLM to decompose ``answer`` and parse the reply as a Python literal.

        Args:
            answer: Answer text substituted for ``{{answer}}`` in
                ``answer_decomposition_prompt``.

        Returns:
            The parsed value when it is a list, otherwise ``None``.

        Raises:
            ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when
                the LLM response is not a valid Python literal.
        """
        query = answer_decomposition_prompt.replace("{{answer}}", answer)
        response = self.llm_query_api.get_llm_response(self.llm, query)
        parsed = ast.literal_eval(response)
        return parsed if isinstance(parsed, list) else None

    def semantic_retreival_agent(self, html_table, fact_list, topK=5):
        """Delegate to ``get_embedding_attribution`` and return its three results.

        Args:
            html_table: Table markup forwarded unchanged.
            fact_list: Facts forwarded unchanged.
            topK: Forwarded unchanged as the third positional argument.

        Returns:
            Tuple ``(attributed_html_table, row_attribution_ids, col_attribution_ids)``.
        """
        attributed_html_table, row_attribution_ids, col_attribution_ids = get_embedding_attribution(
            html_table, fact_list, topK
        )
        return attributed_html_table, row_attribution_ids, col_attribution_ids

    def sufficiency_attribution_agent(self, fact_list, attributed_html_table):
        """Ask the LLM to fill in a citation template for the given facts.

        A JSON template listing the facts, with placeholder values for
        "Row Citations", "Column Citations" and "Explanation", is substituted
        into ``functional_attribution_prompt`` together with the table. The
        response has Markdown code fences removed and is parsed as JSON.

        Args:
            fact_list: Facts to list in the template; each is converted with ``str``.
            attributed_html_table: Table substituted for ``{{attributed_html_table}}``.

        Returns:
            The parsed response when it is a dict, otherwise ``None``.

        Raises:
            json.JSONDecodeError: Propagated from ``json.loads`` when the
                cleaned response is not valid JSON.
        """
        fact_verification_function = {
            "List of Fact": [
                {f"Fact {fact_number}:": str(fact)}
                for fact_number, fact in enumerate(fact_list, start=1)
            ],
            "Row Citations": "[..., ..., ...]",
            "Column Citations": "[..., ..., ...]",
            "Explanation": "...",
        }
        fact_verification_function_string = json.dumps(fact_verification_function)

        query = functional_attribution_prompt.replace(
            "{{attributed_html_table}}", str(attributed_html_table)
        ).replace("{{fact_verification_function}}", fact_verification_function_string)

        attribution_fxn = self.llm_query_api.get_llm_response(self.llm, query)
        # Strip Markdown fences so the payload can be handed to json.loads.
        attribution_fxn = attribution_fxn.replace("```json", "").replace("```", "")
        print(attribution_fxn)
        attribution_fxn = json.loads(attribution_fxn)
        return attribution_fxn if isinstance(attribution_fxn, dict) else None


if __name__ == '__main__':
    # NOTE(review): this sample contains no <tr>/<th> markup, so
    # table_formatting_agent will not assign any IDs to it. It looks like the
    # HTML tags were stripped at some point; confirm against the intended sample.
    html_table = """
Sr. Number Types Remark
A B C
1 Mitten Kity Teddy Names of cats
1 Tommy Rudolph Jerry Names of dogs
"""
    answer = "Tommy is a dog but Mitten is a cat."

    x = InputInstance(html_table=html_table, answer=answer)
    matsa_agent = MATSA()

    x_reformulated = matsa_agent.table_formatting_agent(x.html_table)
    print(x_reformulated)

    x_descriptions = matsa_agent.description_augmentation_agent(x_reformulated)
    print(x_descriptions)

    fact_list = matsa_agent.answer_decomposition_agent(x.answer)
    print(fact_list)

    attributed_html_table, row_attribution_ids, col_attribution_ids = matsa_agent.semantic_retreival_agent(
        x_descriptions, fact_list
    )
    print(attributed_html_table)

    attribution_fxn = matsa_agent.sufficiency_attribution_agent(fact_list, attributed_html_table)
    print(attribution_fxn)

    # sufficiency_attribution_agent returns None for non-dict JSON; fail with a
    # clear message instead of an opaque TypeError on the subscripts below.
    if attribution_fxn is None:
        raise SystemExit("Attribution response was not a JSON object; no citations available.")

    row_attribution_set = attribution_fxn["Row Citations"]
    col_attribution_set = attribution_fxn["Column Citations"]
    print(row_attribution_set)
    print(col_attribution_set)