import io
import os

import pandas as pd
import streamlit as st

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from tempfile import NamedTemporaryFile

from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents.base import Document
from langchain_text_splitters import TokenTextSplitter
from stqdm import stqdm

from process import Process
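
# In-memory buffer that will hold the Excel export served by the download button.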
buffer = io.BytesIO()

st.set_page_config(page_title="NutriGenMe Paper Extractor")
st.title("NutriGenMe - Paper Extraction")
st.markdown("<div style='text-align: left; color: white; font-size: 16px'>In its latest version, the app can extract essential information from papers, including tables in both horizontal and vertical orientations, images, and plain text.</div><br>", unsafe_allow_html=True)

uploaded_files = st.file_uploader("Upload Paper(s) here:", type="pdf", accept_multiple_files=True)
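
# Three side-by-side options: the extraction model, the token amount per chunk, and the validator model.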
col1, col2, col3 = st.columns(3)

with col1:
    models = (
        'gpt-4-turbo',
        'gemini-1.5-pro-latest',
    )
    model = st.selectbox(
        'Model selection:', models, key='model'
    )

with col2:
    tokens = (
        8000,
        16000,
        24000
    )
    chunk_option = st.selectbox(
        'Token amounts per process:', tokens, key='token'
    )
    chunk_overlap = 0

with col3:
    models_val = (
        'gemini-1.5-pro-latest',
        'gpt-4-turbo',
        'mixtral-8x7b-instruct',
    )
    model_val = st.selectbox(
        'Model validator selection:', models_val, key='model_val'
    )
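
# Extraction pipeline: runs once at least one PDF has been uploaded and the button is pressed.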
if uploaded_files:
    journals = []
    parseButtonHV = st.button("Get Result", key='table_HV')

    if parseButtonHV:
        with st.status("Extraction in progress ...", expanded=True) as status:
            start_time = datetime.now()
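
            # Write each uploaded PDF to a temporary file so PyPDFLoader and the table extractor can read it from disk.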
            for uploaded_file in stqdm(uploaded_files):
                with NamedTemporaryFile(dir='.', suffix=".pdf", delete=eval(os.getenv('DELETE_TEMP_PDF', 'True'))) as pdf:
                    pdf.write(uploaded_file.getbuffer())

                    loader = PyPDFLoader(pdf.name)
                    pages = loader.load()
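
                    # Default chunking parameters, overridden below when a token amount is selected.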
                    chunk_size = 120000
                    chunk_overlap = 0
                    docs = pages
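
                    # Merge all pages into a single document and chunk it by the selected token amount with 25% overlap.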
                    if chunk_option:
                        docs = [Document('\n'.join([page.page_content for page in pages]))]
                        docs[0].metadata = {'source': pages[0].metadata['source']}

                        chunk_size = chunk_option
                        chunk_overlap = int(0.25 * chunk_size)
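
                    # Split the document(s) into token-based chunks for the extraction calls.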
                    text_splitter = TokenTextSplitter.from_tiktoken_encoder(
                        chunk_size=chunk_size, chunk_overlap=chunk_overlap
                    )
                    chunks = text_splitter.split_documents(docs)
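
                    # Run the entity extraction tasks and the PDF table parsing concurrently.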
                    process = Process(model, model_val)
                    with ThreadPoolExecutor() as executor:
                        result_gsd = executor.submit(process.get_entity, (chunks, 'gsd'))
                        result_summ = executor.submit(process.get_entity, (chunks, 'summ'))
                        result = executor.submit(process.get_entity, (chunks, 'all'))
                        result_one = executor.submit(process.get_entity_one, [c.page_content for c in chunks[:1]])
                        result_table = executor.submit(process.get_table, pdf.name)

                        result_gsd = result_gsd.result()
                        result_summ = result_summ.result()
                        result = result.result()
                        result_one = result_one.result()
                        res_gene, res_snp, res_dis = result_table.result()
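
                    # Combine the table-based and LLM-based extractions into a single record.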
                    result['Genes'] = res_gene + result_gsd['Genes']
                    result['SNPs'] = res_snp + result_gsd['SNPs']
                    result['Diseases'] = res_dis + result_gsd['Diseases']
                    result['Conclusion'] = result_summ
                    for k in result_one.keys():
                        result[k] = result_one[k]

                    if len(result['Genes']) == 0:
                        result['Genes'] = ['']
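
                    # Pad the Genes/SNPs/Diseases lists to the same length so they can form DataFrame columns.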
                    num_rows = max(len(result['Genes']), len(result['SNPs']), len(result['Diseases']))

                    for k in ['Genes', 'SNPs', 'Diseases']:
                        while len(result[k]) < num_rows:
                            result[k].append('')

                        result[k] = result[k][:num_rows]

                    result = {key: value if isinstance(value, list) else [value] * num_rows for key, value in result.items()}
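
                    # Assemble the rows into a DataFrame, keep only rows with a gene, and drop duplicate gene/SNP pairs.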
                    dataframe = pd.DataFrame(result)
                    dataframe = dataframe[['Genes', 'SNPs', 'Diseases', 'Title', 'Authors', 'Publisher Name', 'Publication Year', 'Population', 'Sample Size', 'Study Methodology', 'Study Level', 'Conclusion']]
                    dataframe = dataframe[dataframe['Genes'].astype(bool)].reset_index(drop=True)
                    dataframe.drop_duplicates(['Genes', 'SNPs'], inplace=True)
                    dataframe.reset_index(drop=True, inplace=True)
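
                    # Validate and clean the extracted table via Process.validate.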
                    df, df_no_llm, df_clean = process.validate(dataframe)

                    end_time = datetime.now()
                    st.write("Success in ", round((end_time - start_time).total_seconds() / 60, 2), "minutes")
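
                    # Display the validated result and write every variant to the in-memory Excel buffer.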
                    st.dataframe(df)
                    with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                        df.to_excel(writer, sheet_name='Result Cleaned API LLM')
                        df_no_llm.to_excel(writer, sheet_name='Result Cleaned API')
                        df_clean.to_excel(writer, sheet_name='Result Cleaned')
                        dataframe.to_excel(writer, sheet_name='Original')
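
                    # Offer the workbook for download, named after the paper, the token setting, and the models used.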
                    st.download_button(
                        label="Save Result",
                        data=buffer,
                        file_name=f"{uploaded_file.name.replace('.pdf', '')}_{chunk_option}_{model.split('-')[0]}_{model_val.split('-')[0]}.xlsx",
                        mime='application/vnd.ms-excel'
                    )