wangchanberta-th-qa

Sleeping

App Files Files Community

SirinootKK committed on Feb 15, 2024

Commit

5f7b796

1 Parent(s): 4b72dd5

init

Browse files

Files changed (2) hide show

app.py +224 -0
requirements.txt +311 -0

app.py ADDED Viewed

	@@ -0,0 +1,224 @@

+# -*- coding: utf-8 -*-
+"""gradio_wangchanberta
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/1Kw2k1oymhq4ZAcy4oBYOlIg4bBU-HlVr
+"""
+#@title scripts
+import time
+import numpy as np
+import pandas as pd
+import torch
+import faiss
+from sklearn.preprocessing import normalize
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+from sentence_transformers import SentenceTransformer,util
+from pythainlp import Tokenizer
+import pickle
+import evaluate
+from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
+# Print at import time whether PyTorch can see a CUDA device.
+print(torch.cuda.is_available())
+# NOTE(review): these entries are model keys, not names defined in this module,
+# so `from <module> import *` would fail with AttributeError. Confirm whether
+# `__all__` is really the intended name for this list.
+__all__ = [
+    "mdeberta",
+    "wangchanberta-hyp", # Best model
+]
+# Names of the available prediction strategies.
+predict_method = [
+    "faiss",
+    "faissWithModel",
+    "cosineWithModel",
+    "semanticSearchWithModel",
+]
+# Default key into MODEL_DICT and default sentence-embedding checkpoint.
+DEFAULT_MODEL='wangchanberta-hyp'
+DEFAULT_SENTENCE_EMBEDDING_MODEL='intfloat/multilingual-e5-base'
+# Maps a short model key to the checkpoint identifier passed to `from_pretrained`.
+MODEL_DICT = {
+    'wangchanberta': 'Chananchida/wangchanberta-th-wiki-qa_ref-params',
+    'wangchanberta-hyp': 'Chananchida/wangchanberta-th-wiki-qa_hyp-params',
+    'mdeberta': 'Chananchida/mdeberta-v3-th-wiki-qa_ref-params',
+    'mdeberta-hyp': 'Chananchida/mdeberta-v3-th-wiki-qa_hyp-params',
+}
+# Excel workbook with the question/answer data and the pickled question embeddings.
+DATA_PATH='models/dataset.xlsx'
+EMBEDDINGS_PATH='models/embeddings.pkl'
+class ChatbotModel:
+    """Facade that builds a fully initialised ``Chatbot`` and forwards calls to it."""
+    def __init__(self, model=DEFAULT_MODEL):
+        """Load the data, QA model, embedding model, stored vectors and search index.
+
+        Args:
+            model: Key into ``MODEL_DICT`` selecting the QA checkpoint.
+        """
+        self._chatbot = Chatbot()
+        self._chatbot.load_data()
+        self._chatbot.load_model(model)
+        self._chatbot.load_embedding_model(DEFAULT_SENTENCE_EMBEDDING_MODEL)
+        # The index is built from the vectors, so set_vectors() has to run first.
+        self._chatbot.set_vectors()
+        self._chatbot.set_index()
+    def chat(self, question):
+        """Forward ``question`` to ``Chatbot.answer_question`` and return its result."""
+        # NOTE(review): the Chatbot class in this file defines no `answer_question`
+        # method, so this call raises AttributeError as written. Confirm which
+        # predict_* method was intended.
+        return self._chatbot.answer_question(question)
+    def eval(self,model,predict_method):
+        """Forward an evaluation request to ``Chatbot.eval`` and return its result."""
+        # NOTE(review): the Chatbot class in this file defines no `eval` method
+        # either; this raises AttributeError unless it is added elsewhere.
+        return self._chatbot.eval(model_name=model,predict_method=predict_method)
+class Chatbot:
+    def __init__(self):
+        # Initialize variables
+        self.df = None
+        self.test_df = None
+        self.model = None
+        self.model_name = None
+        self.tokenizer = None
+        self.embedding_model = None
+        self.vectors = None
+        self.index = None
+        self.k = 1  # top k most similar
+    def load_data(self, path: str = DATA_PATH):
+        self.df = pd.read_excel(path, sheet_name='Default')
+        self.df['Context'] = pd.read_excel(path, sheet_name='mdeberta')['Context']
+        print('Load data done')
+    def load_model(self, model_name: str = DEFAULT_MODEL):
+        self.model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DICT[model_name])
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DICT[model_name])
+        self.model_name = model_name
+        print('Load model done')
+    def load_embedding_model(self, model_name: str = DEFAULT_SENTENCE_EMBEDDING_MODEL):
+        if torch.cuda.is_available():  # Check if GPU is available
+            self.embedding_model = SentenceTransformer(model_name, device='cpu')
+        else: self.embedding_model = SentenceTransformer(model_name)
+        print('Load sentence embedding model done')
+    def set_vectors(self):
+        self.vectors = self.prepare_sentences_vector(self.load_embeddings(EMBEDDINGS_PATH))
+    def set_index(self):
+        if torch.cuda.is_available():  # Check if GPU is available
+            res = faiss.StandardGpuResources()
+            self.index = faiss.IndexFlatL2(self.vectors.shape[1])
+            gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, self.index)
+            gpu_index_flat.add(self.vectors)
+            self.index = gpu_index_flat
+        else:  # If GPU is not available, use CPU-based Faiss index
+            self.index = faiss.IndexFlatL2(self.vectors.shape[1])
+            self.index.add(self.vectors)
+    def get_embeddings(self, text_list):
+        return self.embedding_model.encode(text_list)
+    def prepare_sentences_vector(self, encoded_list):
+        encoded_list = [i.reshape(1, -1) for i in encoded_list]
+        encoded_list = np.vstack(encoded_list).astype('float32')
+        encoded_list = normalize(encoded_list)
+        return encoded_list
+    def store_embeddings(self, embeddings):
+        with open('models/embeddings.pkl', "wb") as fOut:
+            pickle.dump({'sentences': self.df['Question'], 'embeddings': embeddings}, fOut, protocol=pickle.HIGHEST_PROTOCOL)
+        print('Store embeddings done')
+    def load_embeddings(self, file_path):
+        with open(file_path, "rb") as fIn:
+            stored_data = pickle.load(fIn)
+            stored_sentences = stored_data['sentences']
+            stored_embeddings = stored_data['embeddings']
+        print('Load (questions) embeddings done')
+        return stored_embeddings
+    def model_pipeline(self, question, similar_context):
+        inputs = self.tokenizer(question, similar_context, return_tensors="pt")
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+        answer_start_index = outputs.start_logits.argmax()
+        answer_end_index = outputs.end_logits.argmax()
+        predict_answer_tokens = inputs.input_ids[0, answer_start_index: answer_end_index + 1]
+        Answer = self.tokenizer.decode(predict_answer_tokens)
+        return Answer
+    def faiss_search(self, question_vector):
+        distances, indices = self.index.search(question_vector, self.k)
+        similar_questions = [self.df['Question'][indices[0][i]] for i in range(self.k)]
+        similar_contexts = [self.df['Context'][indices[0][i]] for i in range(self.k)]
+        return similar_questions, similar_contexts, distances, indices
+    def predict_faiss(self, message):
+        message = message.strip()
+        question_vector = self.get_embeddings(message)
+        question_vector = self.prepare_sentences_vector([question_vector])
+        similar_questions, similar_contexts, distances, indices = self.faiss_search(question_vector)
+        Answers = [self.df['Answer'][i] for i in indices[0]]
+        Answer = Answers[0]
+        return Answer
+    # Function to predict using BERT embedding
+    def predict_bert_embedding(self,message):
+        message = message.strip()
+        question_vector = self.get_embeddings(message)
+        question_vector=self.prepare_sentences_vector([question_vector])
+        similar_questions, similar_contexts, distances,indices = self.faiss_search(question_vector)
+        Answer = self.model_pipeline(similar_questions, similar_contexts)
+        return Answer
+    # def predict_semantic_search(self,message,corpus_embeddings):
+    #     message = message.strip()
+    #     query_embedding = self.embedding_model.encode(message, convert_to_tensor=True)
+    #     query_embedding = query_embedding.to('cpu')
+    #     hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=1)
+    #     hit = hits[0][0]
+    #     context=self.df['Context'][hit['corpus_id']]
+    #     score="{:.4f})".format(hit['score'])
+    #     Answer = self.model_pipeline(message, context)
+    #     return Answer
+    def predict_semantic_search(self, message):
+        message = message.strip()
+        query_embedding = self.embedding_model.encode([message], convert_to_tensor=True)[0]  # Fix here
+        query_embedding = query_embedding.to('cpu')
+        corpus_embeddings = self.embedding_model.encode(self.df['Question'].tolist(), convert_to_tensor=True)  # Fix here
+        hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=1)
+        hit = hits[0][0]
+        context = self.df['Context'][hit['corpus_id']]
+        score = "{:.4f})".format(hit['score'])
+        Answer = self.model_pipeline(message, context)
+        return Answer
+    def predict_without_faiss(self,message):
+        MostSimilarContext = ""
+        min_distance = 1000
+        message = message.strip(' \t\n')
+        question_vector = self.get_embeddings([message])
+        question_vector=self.prepare_sentences_vector(question_vector)
+        for j, _question_vector in enumerate(self.vectors):
+            distance = euclidean_distances(question_vector, _question_vector.reshape(1, -1))[0][0]
+            if distance < min_distance:
+                min_distance = distance
+                MostSimilarContext = self.df['Context'][j]
+                similar_question = self.df['Question'][j]
+            if distance <= 0.02469331026:
+                break
+        predict_answer = self.model_pipeline(message, MostSimilarContext)
+        Answer = predict_answer.strip().replace("<unk>","@")
+        return Answer
+# Build the chatbot once at import time; this loads the data, both models,
+# the stored vectors and the search index before the UI is created.
+bot = ChatbotModel()
+"""#Gradio"""
+import gradio as gr
+# Example questions (Thai) offered in every tab; despite the name these are
+# question strings, not file paths.
+EXAMPLE_PATH = ["หลิน ไห่เฟิง มีชื่อเรียกอีกชื่อว่าอะไร" , "ใครเป็นผู้ตั้งสภาเศรษฐกิจโลกขึ้นในปี พ.ศ. 2514 โดยทุกปีจะมีการประชุมที่ประเทศสวิตเซอร์แลนด์", "โปรดิวเซอร์ของอัลบั้มตลอดกาล ของวงคีรีบูนคือใคร", "สกุลเดิมของหม่อมครูนุ่ม นวรัตน ณ อยุธยา คืออะไร"]
+# One text-in/text-out interface per prediction strategy of the underlying Chatbot.
+demoFaiss = gr.Interface(fn=bot._chatbot.predict_faiss, inputs="text", outputs="text", examples=EXAMPLE_PATH, title="TH wiki (just Faiss)")
+demoBert = gr.Interface(fn=bot._chatbot.predict_bert_embedding, inputs="text", outputs="text",examples=EXAMPLE_PATH, title="TH wiki (Faiss & Model)")
+demoSemantic = gr.Interface(fn=bot._chatbot.predict_semantic_search, inputs="text", outputs="text",examples=EXAMPLE_PATH, title="TH wiki (Semantic Search & Model)")
+demoWithoutFiss = gr.Interface(fn=bot._chatbot.predict_without_faiss, inputs="text", outputs="text",examples=EXAMPLE_PATH, title="TH wiki (just Model)")
+# The tab labels are matched to the interfaces by position in the two lists.
+demo = gr.TabbedInterface([demoFaiss, demoWithoutFiss, demoBert, demoSemantic], ["Faiss", "Model", "Faiss & Model", "Semantic Search & Model"])
+# Start the web server as a module-level side effect.
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,311 @@

+absl-py==2.1.0
+accelerate==0.26.1
+aiohttp==3.9.1
+aiosignal==1.3.1
+async-timeout==4.0.3
+attrs==23.2.0
+bert-score==0.3.13
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+cycler==0.12.1
+datasets==2.16.1
+dill==0.3.7
+evaluate==0.4.1
+faiss-cpu==1.7.4
+filelock==3.13.1
+fonttools==4.47.2
+frozenlist==1.4.1
+fsspec==2023.10.0
+huggingface-hub==0.20.2
+idna==3.6
+Jinja2==3.1.3
+joblib==1.3.2
+kiwisolver==1.4.5
+MarkupSafe==2.1.4
+matplotlib==3.8.2
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.15
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.3
+packaging==23.2
+pandas==2.2.0
+pillow==10.2.0
+psutil==5.9.8
+pyarrow==14.0.2
+pyarrow-hotfix==0.6
+pyparsing==3.1.1
+pythainlp==4.0.2
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+responses==0.18.0
+rouge_score==0.1.2
+safetensors==0.4.1
+scikit-learn==1.4.0
+scipy==1.11.4
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+six==1.16.0
+sympy==1.12
+threadpoolctl==3.2.0
+tokenizers==0.15.0
+torch==2.1.2
+torchvision==0.16.2
+tqdm==4.66.1
+transformers==4.36.2
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.1.0
+xxhash==3.4.1
+yarl==1.9.4
+absl-py==2.1.0
+accelerate==0.26.1
+aiohttp==3.9.1
+aiosignal==1.3.1
+async-timeout==4.0.3
+attrs==23.2.0
+bert-score==0.3.13
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+cycler==0.12.1
+datasets==2.16.1
+dill==0.3.7
+evaluate==0.4.1
+faiss-cpu==1.7.4
+filelock==3.13.1
+fonttools==4.47.2
+frozenlist==1.4.1
+fsspec==2023.10.0
+huggingface-cli==0.1
+huggingface-hub==0.20.2
+idna==3.6
+Jinja2==3.1.3
+joblib==1.3.2
+kiwisolver==1.4.5
+MarkupSafe==2.1.4
+matplotlib==3.8.2
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.15
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.3
+packaging==23.2
+pandas==2.2.0
+pillow==10.2.0
+psutil==5.9.8
+pyarrow==14.0.2
+pyarrow-hotfix==0.6
+pyparsing==3.1.1
+pythainlp==4.0.2
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+responses==0.18.0
+rouge_score==0.1.2
+safetensors==0.4.1
+scikit-learn==1.4.0
+scipy==1.11.4
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+six==1.16.0
+sympy==1.12
+threadpoolctl==3.2.0
+tokenizers==0.15.0
+torch==2.1.2
+torchvision==0.16.2
+tqdm==4.66.1
+transformers==4.36.2
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.1.0
+xxhash==3.4.1
+yarl==1.9.4
+absl-py==2.1.0
+accelerate==0.26.1
+aiohttp==3.9.1
+aiosignal==1.3.1
+async-timeout==4.0.3
+attrs==23.2.0
+bert-score==0.3.13
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+cycler==0.12.1
+datasets==2.16.1
+dill==0.3.7
+et-xmlfile==1.1.0
+evaluate==0.4.1
+faiss-cpu==1.7.4
+filelock==3.13.1
+fonttools==4.47.2
+frozenlist==1.4.1
+fsspec==2023.10.0
+huggingface-cli==0.1
+huggingface-hub==0.20.2
+idna==3.6
+Jinja2==3.1.3
+joblib==1.3.2
+kiwisolver==1.4.5
+MarkupSafe==2.1.4
+matplotlib==3.8.2
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.15
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.3
+openpyxl==3.1.2
+packaging==23.2
+pandas==2.2.0
+pillow==10.2.0
+psutil==5.9.8
+pyarrow==14.0.2
+pyarrow-hotfix==0.6
+pyparsing==3.1.1
+pythainlp==4.0.2
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+responses==0.18.0
+rouge_score==0.1.2
+safetensors==0.4.1
+scikit-learn==1.4.0
+scipy==1.11.4
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+six==1.16.0
+sympy==1.12
+threadpoolctl==3.2.0
+tokenizers==0.15.0
+torch==2.1.2
+torchvision==0.16.2
+tqdm==4.66.1
+transformers==4.36.2
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.1.0
+xxhash==3.4.1
+yarl==1.9.4
+absl-py==2.1.0
+accelerate==0.26.1
+aiofiles==23.2.1
+aiohttp==3.9.1
+aiosignal==1.3.1
+altair==5.2.0
+annotated-types==0.6.0
+anyio==4.2.0
+async-timeout==4.0.3
+attrs==23.2.0
+bert-score==0.3.13
+certifi==2023.11.17
+charset-normalizer==3.3.2
+click==8.1.7
+colorama==0.4.6
+contourpy==1.2.0
+cycler==0.12.1
+datasets==2.16.1
+dill==0.3.7
+et-xmlfile==1.1.0
+evaluate==0.4.1
+exceptiongroup==1.2.0
+faiss-cpu==1.7.4
+fastapi==0.109.0
+ffmpy==0.3.1
+filelock==3.13.1
+fonttools==4.47.2
+frozenlist==1.4.1
+fsspec==2023.10.0
+gradio==4.15.0
+gradio_client==0.8.1
+h11==0.14.0
+httpcore==1.0.2
+httpx==0.26.0
+huggingface-cli==0.1
+huggingface-hub==0.20.2
+idna==3.6
+importlib-resources==6.1.1
+Jinja2==3.1.3
+joblib==1.3.2
+jsonschema==4.21.1
+jsonschema-specifications==2023.12.1
+kiwisolver==1.4.5
+markdown-it-py==3.0.0
+MarkupSafe==2.1.4
+matplotlib==3.8.2
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.15
+networkx==3.2.1
+nltk==3.8.1
+numpy==1.26.3
+openpyxl==3.1.2
+orjson==3.9.12
+packaging==23.2
+pandas==2.2.0
+pillow==10.2.0
+psutil==5.9.8
+pyarrow==14.0.2
+pyarrow-hotfix==0.6
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydub==0.25.1
+Pygments==2.17.2
+pyparsing==3.1.1
+pythainlp==4.0.2
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3.post1
+PyYAML==6.0.1
+referencing==0.32.1
+regex==2023.12.25
+requests==2.31.0
+responses==0.18.0
+rich==13.7.0
+rouge_score==0.1.2
+rpds-py==0.17.1
+ruff==0.1.14
+safetensors==0.4.1
+scikit-learn==1.4.0
+scipy==1.11.4
+semantic-version==2.10.0
+sentence-transformers==2.2.2
+sentencepiece==0.1.99
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.0
+starlette==0.35.1
+sympy==1.12
+threadpoolctl==3.2.0
+tokenizers==0.15.0
+tomlkit==0.12.0
+toolz==0.12.0
+torch==2.1.2
+torchvision==0.16.2
+tqdm==4.66.1
+transformers==4.36.2
+typer==0.9.0
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.1.0
+uvicorn==0.26.0
+websockets==11.0.3
+xxhash==3.4.1
+yarl==1.9.4