import streamlit as st
import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np
from pymongo import MongoClient
import urllib.parse
import requests
from bertopic import BERTopic
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import deepcut
import unicodedata
from pythainlp.util import normalize
import torch
import csv

# initial state: seed each session-state key once per session
defaults = {
    'state': 0, 'age': 0, 'weight': 0, 'height': 0, 'gender': 0,
    'food_allergy': 0, 'drug_allergy': 0,
    'congentital_disease': 0,  # key name used as-is elsewhere in the app
    'optional_keyword': 0, 'all_recommend': None, 'true_check': None,
    'queries': None, 'string_contain': False,
    'sbert_searched_df': None, 'string_contain_df': None,
}
for key, value in defaults.items():
    if key not in st.session_state:
        st.session_state[key] = value
for i in range(10):
    if f'score_{i + 1}' not in st.session_state:
        st.session_state[f'score_{i + 1}'] = 'NA'

def set_state(state):
    st.session_state.state = state

def split_text(text):
    return text.split(',')

# load models and data
sbert_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

with open('corpus_embeddings.pickle', 'rb') as file:
    corpus_embeddings = pickle.load(file)
corpus_embeddings = pd.DataFrame(corpus_embeddings)

topic_model = BERTopic.load("topic_model.pickle")

data = pd.read_csv('articles_data.csv')
data['child_topic'] = topic_model.topics_[:]  # per-article topic assignments

with open('sensitive_words.txt', 'r', encoding='utf-8') as file:
    sensitive_words = file.read()
sensitive_words = list(set(sensitive_words.lower().replace('\n', '').split(' ')))

# local functions
def save_session_state_data(session_state_data, filename):
    """Append one row of session-state data to a CSV log, writing the header on first use."""
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=session_state_data.keys())
        if file.tell() == 0:
            writer.writeheader()
        writer.writerow(session_state_data)

def deepcut_tokenizer(text, sensitive_words=sensitive_words):
    """Clean Thai text, segment it with deepcut, and return the tokens comma-joined."""
    cleanedText = "".join([i for i in text if i not in string.punctuation]).lower()
    cleanedText = normalize(unicodedata.normalize('NFKD', cleanedText)
                            .replace('\n', '').replace('\r', '').replace('\t', '')
                            .replace('“', '').replace('”', '').replace('.', '')
                            .replace('–', '').replace('‘', '').replace('’', '')
                            .replace('ํา', 'ำ').replace('...', '').replace(',', '')
                            .replace('ี', 'ี'))
    #cleanedText = re.sub(r'\d+', '', cleanedText)
    cleanedText = deepcut.tokenize(cleanedText, custom_dict=sensitive_words)
    #stopwords = list(thai_stopwords()) + 'EMagazine GJ international bangkok hospital'.lower().split(' ')
    stopwords = 'EMagazine GJ international bangkok hospital'.lower().split(' ')
    cleanedText = [i for i in cleanedText if i not in stopwords]
    cleanedText = [i.replace(' ', '') for i in cleanedText if len(i) != 1 and len(i) != 0]
    cleanedText = ','.join(cleanedText)
    return cleanedText
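# Illustrative check (not part of the original app): deepcut_tokenizer should strip
# punctuation and stopwords and return comma-joined Thai tokens. The query below is
# hypothetical, and the exact segmentation depends on the deepcut model and the
# sensitive_words custom dictionary.
#
#   >>> deepcut_tokenizer('อาหารเพื่อสุขภาพสำหรับผู้สูงอายุ')  # "healthy food for the elderly"
#   'อาหาร,เพื่อ,สุขภาพ,สำหรับ,ผู้สูงอายุ'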
def personal_check(age, weight, height, gender):
    """Map age, gender, and BMI to the Thai keyword groups used to build search queries."""
    # age check
    if age >= 60:
        age = 'ผู้สูงอายุ'  # elderly
    else:
        age = 'ทำงาน'  # working age
    # gender check
    if gender == 'หญิง':
        gender = 'ผู้หญิง'  # female
    else:
        gender = 'ผู้ชาย'  # male
    # BMI check
    height_meters = height / 100
    bmi = weight / (height_meters ** 2)
    if bmi >= 30:
        bmi = 'อ้วนมาก'  # severely obese
    elif 23 <= bmi < 30:
        bmi = 'อ้วน'  # obese
    elif 18.5 <= bmi < 23:
        bmi = ''  # healthy range: no keyword added
    else:
        bmi = 'ผอม'  # underweight
    return age, gender, bmi

def sbert_search(queries, data, embedding, sbert_model=sbert_model):
    """Rank rows of `data` by SBERT semantic similarity between the query and `embedding`."""
    index_lst = []
    score_lst = []
    query_embedding = sbert_model.encode(queries, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, embedding, top_k=15)
    hits = hits[0]  # hits for the first (only) query
    for hit in hits:
        index_lst.append(hit['corpus_id'])
        score_lst.append(hit['score'])
    sbert_searched = data.iloc[index_lst].copy()
    sbert_searched['score'] = score_lst
    return sbert_searched

def sbert_tfidf_search(queries, head, topic_model=topic_model, data=data, corpus_embeddings=corpus_embeddings):
    """Hybrid search: BERTopic topic filtering plus TF-IDF ranking, merged with a full-corpus SBERT search."""
    text_to_predict_token = deepcut_tokenizer(queries)
    # find the closest topic; fall back to the raw query if the tokenized text fails
    try:
        similar_topics, similarity = topic_model.find_topics(text_to_predict_token, top_n=1)
    except Exception:
        similar_topics, similarity = topic_model.find_topics(queries, top_n=1)
    # articles assigned to the predicted topic
    similar_df = data[data['child_topic'] == similar_topics[0]].copy()
    # TF-IDF over the pre-tokenized text; 'text_token' holds comma-joined tokens
    # from deepcut_tokenizer, so split them back into token lists
    vectorizer = TfidfVectorizer(tokenizer=split_text, lowercase=False, token_pattern=None)
    tfidf_matrix = vectorizer.fit_transform(similar_df['text_token'])
    # TF-IDF vector for the input text
    text_tfidf = vectorizer.transform([text_to_predict_token])
    # cosine similarity between each article and the query
    similarity_scores = cosine_similarity(tfidf_matrix, text_tfidf)
    similar_df['score'] = similarity_scores.ravel()
    similar_df = similar_df.sort_values('score', ascending=False).head(15)
    # SBERT re-ranking within the topic (embedding rows kept aligned with similar_df rows)
    select_corpus = corpus_embeddings.iloc[similar_df.index]
    similar_embedding = torch.tensor(select_corpus.values)
    similar_searched = sbert_search(queries, similar_df, similar_embedding)
    # SBERT search over the whole corpus, then merge and keep the top results
    sbert_searched = sbert_search(queries, data, torch.tensor(corpus_embeddings.values))
    combined_searched = pd.concat([similar_searched, sbert_searched])
    output = combined_searched.sort_values('score', ascending=False).head(head)
    return output

def string_contain_search(queries, sample, data=data):
    """Return a random sample of articles whose title or content contains the query string."""
    data['all_content'] = data['title'] + data['content']
    return data[data['all_content'].str.contains(queries, na=False)].sample(sample)

# main
def main():
    # header
    st.markdown("