Spaces:

KalbeDigitalLab
/

nutrigenme-paper-extractor

Running

App Files Files Community

fadliaulawi commited on May 24

Commit

63bec36

•

1 Parent(s): 8503206

Remove unnecessary function

Browse files

Files changed (1) hide show

process.py +4 -166

process.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from datetime import datetime
 from dotenv import load_dotenv
 from img2table.document import Image
 from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
@@ -9,7 +8,7 @@ from langchain.prompts import PromptTemplate
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_openai import ChatOpenAI
 from pdf2image import convert_from_path
-from prompt import prompt_entity_gsd_chunk, prompt_entity_gsd_combine, prompt_entity_summ_chunk, prompt_entity_summ_combine, prompt_entities_chunk, prompt_entities_combine, prompt_entity_one_chunk, prompt_table, prompt_validation
 from table_detector import detection_transform, device, model, ocr, outputs_to_objects
 import io
@@ -17,8 +16,6 @@ import json
 import os
 import pandas as pd
 import re
-import requests
-import time
 import torch
 load_dotenv()
@@ -31,7 +28,7 @@ prompts = {
 class Process():
-    def __init__(self, llm, llm_val):
         if llm.startswith('gpt'):
             self.llm = ChatOpenAI(temperature=0, model_name=llm)
@@ -40,13 +37,6 @@ class Process():
         else:
             self.llm = ChatOpenAI(temperature=0, model_name=llm, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
-        if llm_val.startswith('gpt'):
-            self.llm_val = ChatOpenAI(temperature=0, model_name=llm_val)
-        elif llm_val.startswith('gemini'):
-            self.llm_val = ChatGoogleGenerativeAI(temperature=0, model=llm_val)
-        else:
-            self.llm_val = ChatOpenAI(temperature=0, model_name=llm_val, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
     def get_entity(self, data):
         chunks, types = data
@@ -97,9 +87,7 @@ class Process():
     def get_table(self, path):
-        start_time = datetime.now()
         images = convert_from_path(path)
-        print('PDF to Image', round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
         tables = []
         # Loop pages
@@ -124,7 +112,6 @@ class Process():
                     print(detected_tables[idx])
                     tables.append(cropped_table)
-        print('Detect table from image', round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
         genes = []
         snps = []
         diseases = []
@@ -147,29 +134,10 @@ class Process():
             for extracted_table in extracted_tables[1:]:
                 df_table = pd.concat([df_table, extracted_table.df]).reset_index(drop=True)
-            df_table.loc[0] = df_table.loc[0].fillna('')
-            # Identify multiple rows (in dataframe) as one row (in image)
-            rows = []
-            indexes = []
-            for i in df_table.index:
-                if not df_table.loc[i].isna().any():
-                    if len(indexes) > 0:
-                        rows.append(indexes)
-                    indexes = []
-                indexes.append(i)
-            rows.append(indexes)
-            df_table_cleaned = pd.DataFrame(columns=df_table.columns)
-            for row in rows:
-                row_str = df_table.loc[row[0]]
-                for idx in row[1:]:
-                    row_str += ' ' + df_table.loc[idx].fillna('')
-                row_str = row_str.str.strip()
-                df_table_cleaned.loc[len(df_table_cleaned)] = row_str
             # Ask LLM with JSON data
-            json_table = df_table_cleaned.to_json(orient='records')
             str_json_table = json.dumps(json.loads(json_table), indent=2)
             result = self.llm.invoke(prompt_table.format(str_json_table)).content
@@ -191,135 +159,5 @@ class Process():
                     snps.append(snp)
                     diseases.append(res_disease)
-        print('OCR table to extract', round((datetime.now().timestamp() - start_time.timestamp()) / 60, 2), "minutes")
         print(genes, snps, diseases)
         return genes, snps, diseases
-    def validate(self, df):
-        df = df.fillna('')
-        df['Genes'] = df['Genes'].str.replace(' ', '').str.upper()
-        df['SNPs'] = df['SNPs'].str.lower()
-        # Check if there is two gene names
-        sym = [',', '/', '|']
-        for i in df.index:
-            gene = df.loc[i, 'Genes']
-            for s in sym:
-                if s in gene:
-                    genes = gene.split(s)
-                    df.loc[i + 0.5] = df.loc[i]
-                    df = df.sort_index().reset_index(drop=True)
-                    df.loc[i, 'Genes'], df.loc[i + 1, 'Genes'] = genes[0], s.join(genes[1:])
-                    break
-        # Check if there is SNPs without 'rs'
-        for i in df.index:
-            safe = True
-            snp = df.loc[i, 'SNPs']
-            snp = snp.replace('l', '1')
-            if re.fullmatch('rs(\d)+|', snp):
-                pass
-            elif re.fullmatch('ts(\d)+', snp):
-                snp = 'r' + snp[1:]
-            elif re.fullmatch('s(\d)+', snp):
-                snp = 'r' + snp
-            elif re.fullmatch('(\d)+', snp):
-                snp = 'rs' + snp
-            else:
-                safe = False
-                df = df.drop(i)
-            if safe:
-                df.loc[i, 'SNPs'] = snp
-        df.reset_index(drop=True, inplace=True)
-        df_clean = df.copy()
-        # # Validate genes and SNPs with APIs
-        def permutate(word):
-            if len(word) == 0:
-                return ['']
-            change = []
-            res = permutate(word[1:])
-            if word[0] in mistakes:
-                change = [mistakes[word[0]] + r for r in res]
-            return [word[0] + r for r in res] + change
-        def call(url):
-            while True:
-                try:
-                    res = requests.get(url)
-                    time.sleep(1)
-                    break
-                except Exception as e:
-                    print(e)
-            return res
-        mistakes = {'I': '1', 'O': '0'} # Common mistakes need to be maintained
-        dbsnp = {}
-        for i in df.index:
-            snp = df.loc[i, 'SNPs']
-            gene = df.loc[i, 'Genes']
-            if snp not in dbsnp:
-                res = call(f'https://www.ebi.ac.uk/gwas/rest/api/singleNucleotidePolymorphisms/{snp}/')
-                try:
-                    res = res.json()
-                    dbsnp[snp] = [r['gene']['geneName'] for r in res['genomicContexts']]
-                except:
-                    dbsnp[snp] = []
-                res = call(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&retmode=json&id={snp[2:]}').json()['result'][snp[2:]]
-                if 'error' not in res:
-                    dbsnp[snp].extend([r['name'] for r in res['genes']])
-                dbsnp[snp] = list(set(dbsnp[snp]))
-            if gene not in dbsnp[snp]:
-                for other in permutate(gene):
-                    if other in dbsnp[snp]:
-                        df.loc[i, 'Genes'] = other
-                        print(f'{gene} corrected to {other}')
-                        break
-                else:
-                    df = df.drop(i)
-        # df.reset_index(drop=True, inplace=True)
-        df_no_llm = df.copy()
-        # Validate genes and diseases with LLM (for each 50 rows)
-        idx = 0
-        results = []
-        while True:
-            json_table = df[['Genes', 'SNPs', 'Diseases']][idx:idx+50].to_json(orient='records')
-            str_json_table = json.dumps(json.loads(json_table), indent=2)
-            result = self.llm_val.invoke(input=prompt_validation.format(str_json_table)).content
-            print('val', idx)
-            print(result)
-            result = result[result.find('['):result.rfind(']')+1]
-            try:
-                result = eval(result)
-            except SyntaxError:
-                result = []
-            results.extend(result)
-            idx += 50
-            if idx not in df.index:
-                break
-        df = pd.DataFrame(results)
-        df = df.merge(df_no_llm.head(1).drop(['Genes', 'SNPs', 'Diseases'], axis=1), 'cross')
-        return df, df_no_llm, df_clean

 from dotenv import load_dotenv
 from img2table.document import Image
 from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_openai import ChatOpenAI
 from pdf2image import convert_from_path
+from prompt import *
 from table_detector import detection_transform, device, model, ocr, outputs_to_objects
 import io
 import os
 import pandas as pd
 import re
 import torch
 load_dotenv()
 class Process():
+    def __init__(self, llm):
         if llm.startswith('gpt'):
             self.llm = ChatOpenAI(temperature=0, model_name=llm)
         else:
             self.llm = ChatOpenAI(temperature=0, model_name=llm, api_key=os.environ['PERPLEXITY_API_KEY'], base_url="https://api.perplexity.ai")
     def get_entity(self, data):
         chunks, types = data
     def get_table(self, path):
         images = convert_from_path(path)
         tables = []
         # Loop pages
                     print(detected_tables[idx])
                     tables.append(cropped_table)
         genes = []
         snps = []
         diseases = []
             for extracted_table in extracted_tables[1:]:
                 df_table = pd.concat([df_table, extracted_table.df]).reset_index(drop=True)
+            df_table = df_table.fillna('')
             # Ask LLM with JSON data
+            json_table = df_table.to_json(orient='records')
             str_json_table = json.dumps(json.loads(json_table), indent=2)
             result = self.llm.invoke(prompt_table.format(str_json_table)).content
                     snps.append(snp)
                     diseases.append(res_disease)
         print(genes, snps, diseases)
         return genes, snps, diseases