Spaces:

Shakhovak
/

Sheldon_Retrieval_chat_bot

Sleeping

App Files Files Community

Shakhovak committed on Feb 11

Commit

3fb88a6

•

1 Parent(s): 7042ec5

Upload 9 files

Browse files

Adding main files

Files changed (9) hide show

Dockerfile +22 -0
app.py +29 -0
data/scripts.pkl +3 -0
data/scripts_vectors.pkl +3 -0
requirements.txt +7 -0
retrieve_bot.py +72 -0
static/style.css +223 -0
templates/chat.html +80 -0
utils.py +166 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,22 @@

+# Base image: pinned CPython release used for both dependency install and runtime.
+FROM python:3.9.13
+# Install dependencies before copying the sources so this layer is only
+# rebuilt when requirements.txt changes.
+WORKDIR /code
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir -r /code/requirements.txt
+# Run the application as an unprivileged user (UID 1000).
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+	PATH=/home/user/.local/bin:$PATH
+# Copy the application exactly once, owned by that user. The earlier
+# `COPY . /code` was never used at runtime and only duplicated the sources
+# (including the data files) in the image.
+WORKDIR $HOME/app
+COPY --chown=user . $HOME/app
+# Serve the Flask object `app` from app.py with gunicorn on port 7860.
+CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]

app.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from flask import Flask, render_template, request
+from retrieve_bot import ChatBot
+app = Flask(__name__)
+chatSheldon = ChatBot()
+chatSheldon.load()
+# this script is running flask application
+@app.route("/")
+def index():
+    return render_template("chat.html")
+@app.route("/get", methods=["GET", "POST"])
+def chat():
+    msg = request.form["msg"]
+    input = msg
+    return get_Chat_response(input)
+def get_Chat_response(text):
+    answer = chatSheldon.generate_response(text)
+    return answer
+if __name__ == "__main__":
+    app.run(debug=True, port=7860)

data/scripts.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fd8ded525a9faf9031e899ba75c5b7f91fdc4052619a43ca1ff608a7cce73b42
+size 2127113

data/scripts_vectors.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba242c25adc032bcf265fa1c805bf1f506150f181a6fc13f6753088af79cd9c7
+size 71223174

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+sentence-transformers==2.2.2
+flask==2.2.5
+pandas==1.3.5
+gunicorn==20.1.0
+requests==2.27.*
+datasets==2.13.2
+transformers==4.37.2

retrieve_bot.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import pandas as pd
+import pickle
+from sentence_transformers import SentenceTransformer
+from utils import encode, cosine_sim, top_candidates, candidates_reranking
+from collections import deque
+from transformers import pipeline
+import torch
+from transformers import AutoTokenizer
+# this class representes main functions of retrieve bot
+class ChatBot:
+    def __init__(self):
+        self.vect_data = []
+        self.scripts = []
+        self.conversation_history = deque([], maxlen=5)
+        self.ranking_model = None
+        self.reranking_model = None
+        self.device = None
+        self.tokenizer = None
+    def load(self):
+        """ "This method is called first to load all datasets and
+        model used by the chat bot; all the data to be saved in
+        tha data folder, models to be loaded from hugging face"""
+        with open("data/scripts_vectors.pkl", "rb") as fp:
+            self.vect_data = pickle.load(fp)
+            self.scripts = pd.read_pickle("data/scripts.pkl")
+        self.ranking_model = SentenceTransformer("sentence-transformers/LaBSE")
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+        self.reranking_model = pipeline(
+            model="Shakhovak/RerankerModel_chat_bot",
+            device=self.device,
+            tokenizer=self.tokenizer,
+        )
+    def generate_response(self, utterance: str) -> str:
+        """this functions identifies potential
+        candidates for answer and ranks them"""
+        query_encoding = encode(
+            utterance, self.ranking_model, contexts=self.conversation_history
+        )
+        bot_cosine_scores = cosine_sim(self.vect_data, query_encoding)
+        top_scores, top_indexes = top_candidates(bot_cosine_scores, top=20)
+        # test candidates and collects them with label 0 to dictionary
+        reranked_dict = candidates_reranking(
+            top_indexes,
+            self.conversation_history,
+            utterance,
+            self.scripts,
+            self.reranking_model,
+        )
+        # if any candidates were selected, range them and pick up the top
+        # else keep up the initial top 1
+        if len(reranked_dict) >= 1:
+            updated_top_candidates = dict(
+                sorted(reranked_dict.items(), key=lambda item: item[1])
+            )
+            answer = self.scripts.iloc[list(updated_top_candidates.keys())[0]]["answer"]
+        else:
+            answer = self.scripts.iloc[top_indexes[0]]["answer"]
+        self.conversation_history.append(utterance)
+        self.conversation_history.append(answer)
+        return answer

static/style.css ADDED Viewed

	@@ -0,0 +1,223 @@

+/* ---------- Page background ---------- */
+body,html{
+	height: 100%;
+	margin: 0;
+	/* Solid fallback, then the legacy prefixed gradient, then the standard one.
+	   The prefixed syntax takes a start side ("left"), not "to right", and now
+	   uses the same colours as the standard declaration. */
+	background: rgb(44, 47, 59);
+	background: -webkit-linear-gradient(left, rgb(38, 51, 61), rgb(50, 55, 65), rgb(33, 33, 78));
+	background: linear-gradient(to right, rgb(38, 51, 61), rgb(50, 55, 65), rgb(33, 33, 78));
+}
+/* ---------- Chat card layout ---------- */
+.chat{
+	margin-top: auto;
+	margin-bottom: auto;
+}
+.card{
+	height: 500px;
+	border-radius: 15px !important;
+	background-color: rgba(0,0,0,0.4) !important;
+}
+.contacts_body{
+	padding:  0.75rem 0 !important;
+	overflow-y: auto;
+	white-space: nowrap;
+}
+.msg_card_body{
+	overflow-y: auto;
+}
+.card-header{
+	border-radius: 15px 15px 0 0 !important;
+	border-bottom: 0 !important;
+}
+.card-footer{
+	border-radius: 0 0 15px 15px !important;
+	border-top: 0 !important;
+}
+.container{
+	align-content: center;
+}
+/* ---------- Inputs and buttons ---------- */
+.search{
+	border-radius: 15px 0 0 15px !important;
+	background-color: rgba(0,0,0,0.3) !important;
+	border:0 !important;
+	color:white !important;
+}
+.search:focus{
+	box-shadow:none !important;
+	outline:0px !important;
+}
+.type_msg{
+	background-color: rgba(0,0,0,0.3) !important;
+	border:0 !important;
+	color:white !important;
+	height: 60px !important;
+	overflow-y: auto;
+}
+.type_msg:focus{
+	box-shadow:none !important;
+	outline:0px !important;
+}
+.attach_btn{
+	border-radius: 15px 0 0 15px !important;
+	background-color: rgba(0,0,0,0.3) !important;
+	border:0 !important;
+	color: white !important;
+	cursor: pointer;
+}
+.send_btn{
+	border-radius: 0 15px 15px 0 !important;
+	background-color: rgba(0,0,0,0.3) !important;
+	border:0 !important;
+	color: white !important;
+	cursor: pointer;
+}
+.search_btn{
+	border-radius: 0 15px 15px 0 !important;
+	background-color: rgba(0,0,0,0.3) !important;
+	border:0 !important;
+	color: white !important;
+	cursor: pointer;
+}
+/* ---------- Contact list ---------- */
+.contacts{
+	list-style: none;
+	padding: 0;
+}
+.contacts li{
+	width: 100% !important;
+	padding: 5px 10px;
+	margin-bottom: 15px !important;
+}
+.active{
+	background-color: rgba(0,0,0,0.3);
+}
+/* ---------- Avatars and status ---------- */
+.user_img{
+	height: 70px;
+	width: 70px;
+	border:1.5px solid #f5f6fa;
+}
+.user_img_msg{
+	height: 40px;
+	width: 40px;
+	border:1.5px solid #f5f6fa;
+}
+.img_cont{
+	position: relative;
+	height: 70px;
+	width: 70px;
+}
+.img_cont_msg{
+	height: 40px;
+	width: 40px;
+}
+.online_icon{
+	position: absolute;
+	height: 15px;
+	width:15px;
+	background-color: #4cd137;
+	border-radius: 50%;
+	bottom: 0.2em;
+	right: 0.4em;
+	border:1.5px solid white;
+}
+.offline{
+	background-color: #c23616 !important;
+}
+.user_info{
+	margin-top: auto;
+	margin-bottom: auto;
+	margin-left: 15px;
+}
+.user_info span{
+	font-size: 20px;
+	color: white;
+}
+.user_info p{
+	font-size: 10px;
+	color: rgba(255,255,255,0.6);
+}
+.video_cam{
+	margin-left: 50px;
+	margin-top: 5px;
+}
+.video_cam span{
+	color: white;
+	font-size: 20px;
+	cursor: pointer;
+	margin-right: 20px;
+}
+/* ---------- Message bubbles (class names are spelled as the template uses them) ---------- */
+.msg_cotainer{
+	margin-top: auto;
+	margin-bottom: auto;
+	margin-left: 10px;
+	border-radius: 25px;
+	background-color: rgb(82, 172, 255);
+	padding: 10px;
+	position: relative;
+}
+.msg_cotainer_send{
+	margin-top: auto;
+	margin-bottom: auto;
+	margin-right: 10px;
+	border-radius: 25px;
+	background-color: #58cc71;
+	padding: 10px;
+	position: relative;
+}
+.msg_time{
+	position: absolute;
+	left: 0;
+	bottom: -15px;
+	color: rgba(255,255,255,0.5);
+	font-size: 10px;
+}
+.msg_time_send{
+	position: absolute;
+	right:0;
+	bottom: -15px;
+	color: rgba(255,255,255,0.5);
+	font-size: 10px;
+}
+.msg_head{
+	position: relative;
+}
+/* ---------- Action menu (hidden by default) ---------- */
+#action_menu_btn{
+	position: absolute;
+	right: 10px;
+	top: 10px;
+	color: white;
+	cursor: pointer;
+	font-size: 20px;
+}
+.action_menu{
+	z-index: 1;
+	position: absolute;
+	padding: 15px 0;
+	background-color: rgba(0,0,0,0.5);
+	color: white;
+	border-radius: 15px;
+	top: 30px;
+	right: 15px;
+	display: none;
+}
+.action_menu ul{
+	list-style: none;
+	padding: 0;
+	margin: 0;
+}
+.action_menu ul li{
+	width: 100%;
+	padding: 10px 15px;
+	margin-bottom: 5px;
+}
+.action_menu ul li i{
+	padding-right: 10px;
+}
+.action_menu ul li:hover{
+	cursor: pointer;
+	background-color: rgba(0,0,0,0.2);
+}
+/* ---------- Small screens ---------- */
+@media(max-width: 576px){
+	.contacts_card{
+		margin-bottom: 15px !important;
+	}
+}

templates/chat.html ADDED Viewed

	@@ -0,0 +1,80 @@

+<!DOCTYPE html>
+<html>
+	<head>
+		<title>Chatbot</title>
+		<!-- Styles: Bootstrap 4.1.3, Font Awesome icons, then the local overrides. -->
+		<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
+		<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.5.0/css/all.css" integrity="sha384-B4dIYHKNBt8Bc12p+WXckhzcICo0wtJAoU8YZTY5qE0Id1GSseTk6S+L3BlXeVIU" crossorigin="anonymous">
+		<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='style.css')}}"/>
+		<!-- Scripts: a single copy of jQuery, loaded before Bootstrap's JS, which depends on it. -->
+		<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
+		<script src="//maxcdn.bootstrapcdn.com/bootstrap/4.1.1/js/bootstrap.min.js"></script>
+	</head>
+	<body>
+		<div class="container-fluid h-100">
+			<div class="row justify-content-center h-100">
+				<div class="col-md-8 col-xl-6 chat">
+					<div class="card">
+						<div class="card-header msg_head">
+							<div class="d-flex bd-highlight">
+								<div class="img_cont">
+									<img src="https://stickerpacks.ru/wp-content/uploads/2023/04/nabor-stikerov-teorija-bolshogo-vzryva-5-dlja-telegram-3.webp" class="rounded-circle user_img">
+									<span class="online_icon"></span>
+								</div>
+								<div class="user_info">
+									<span>ChatBot</span>
+									<p>Ask me anything!</p>
+								</div>
+							</div>
+						</div>
+						<!-- Message rows are appended here by the script below. -->
+						<div id="messageFormeight" class="card-body msg_card_body">
+						</div>
+						<div class="card-footer">
+							<form id="messageArea" class="input-group">
+								<input type="text" id="text" name="msg" placeholder="Type your message..." autocomplete="off" class="form-control type_msg" required/>
+								<div class="input-group-append">
+									<button type="submit" id="send" class="input-group-text send_btn"><i class="fas fa-location-arrow"></i></button>
+								</div>
+							</form>
+						</div>
+					</div>
+				</div>
+			</div>
+		</div>
+		<script>
+			$(document).ready(function() {
+				var BOT_AVATAR = "https://stickerpacks.ru/wp-content/uploads/2023/04/nabor-stikerov-teorija-bolshogo-vzryva-5-dlja-telegram-3.webp";
+				var USER_AVATAR = "https://i.ibb.co/d5b84Xw/Untitled-design.png";
+
+				// Current local time as H:MM, with the minutes zero-padded.
+				function currentTime() {
+					var now = new Date();
+					var minutes = now.getMinutes();
+					return now.getHours() + ":" + (minutes < 10 ? "0" : "") + minutes;
+				}
+
+				// Build one chat row. The message is inserted with .text(), so any
+				// markup typed by the user or returned by the server is displayed
+				// literally instead of being parsed as HTML.
+				function buildMessage(text, time, fromUser) {
+					var avatar = $("<div>").addClass("img_cont_msg").append(
+						$("<img>")
+							.addClass("rounded-circle user_img_msg")
+							.attr("src", fromUser ? USER_AVATAR : BOT_AVATAR)
+					);
+					var bubble = $("<div>")
+						.addClass(fromUser ? "msg_cotainer_send" : "msg_cotainer")
+						.text(text)
+						.append($("<span>").addClass(fromUser ? "msg_time_send" : "msg_time").text(time));
+					var row = $("<div>")
+						.addClass("d-flex mb-4")
+						.addClass(fromUser ? "justify-content-end" : "justify-content-start");
+					// User rows show the bubble first, bot rows the avatar first.
+					return fromUser ? row.append(bubble, avatar) : row.append(avatar, bubble);
+				}
+
+				$("#messageArea").on("submit", function(event) {
+					// Cancel the native submit first, so an error below cannot reload the page.
+					event.preventDefault();
+					var str_time = currentTime();
+					var rawText = $("#text").val();
+					$("#text").val("");
+					$("#messageFormeight").append(buildMessage(rawText, str_time, true));
+					$.ajax({
+						data: {
+							msg: rawText,
+						},
+						type: "POST",
+						url: "/get",
+					}).done(function(data) {
+						$("#messageFormeight").append(buildMessage(data, str_time, false));
+					});
+				});
+			});
+		</script>
+	</body>
+</html>

utils.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from scipy import sparse
+import pandas as pd
+import pickle
+import random
+def encode(texts, model, contexts=None, do_norm=True):
+    """function to encode texts for cosine similarity search"""
+    question_vectors = model.encode(texts)
+    context_vectors = model.encode("".join(contexts))
+    return np.concatenate(
+        [np.asarray(question_vectors), np.asarray(context_vectors)], axis=-1
+    )
+def cosine_sim(data_vectors, query_vectors) -> list:
+    """returns list of tuples with similarity score and
+    script index in initial dataframe"""
+    data_emb = sparse.csr_matrix(data_vectors)
+    query_emb = sparse.csr_matrix(query_vectors)
+    similarity = cosine_similarity(query_emb, data_emb).flatten()
+    ind = np.argwhere(similarity)
+    match = sorted(zip(similarity, ind.tolist()), reverse=True)
+    return match
+def scripts_rework(path, character):
+    """this functions split scripts for queation, answer, context,
+    picks up the cahracter and saves data in pickle format"""
+    df = pd.read_csv(path)
+    # split data for scenes
+    count = 0
+    df["scene_count"] = ""
+    for index, row in df.iterrows():
+        if index == 0:
+            df.iloc[index]["scene_count"] = count
+        elif row["person_scene"] == "Scene":
+            count += 1
+            df.iloc[index]["scene_count"] = count
+        else:
+            df.iloc[index]["scene_count"] = count
+    df = df.dropna().reset_index()
+    # rework scripts to filer by caracter utterances and related context
+    scripts = pd.DataFrame()
+    for index, row in df.iterrows():
+        if (row["person_scene"] == character) & (
+            df.iloc[index - 1]["person_scene"] != "Scene"
+        ):
+            context = []
+            for i in reversed(range(2, 5)):
+                if (df.iloc[index - i]["person_scene"] != "Scene") & (index - i >= 0):
+                    context.append(df.iloc[index - i]["dialogue"])
+                else:
+                    break
+            new_row = {
+                "answer": row["dialogue"],
+                "question": df.iloc[index - 1]["dialogue"],
+                "context": context,
+            }
+            scripts = scripts.append(new_row, ignore_index=True)
+        elif (row["person_scene"] == character) & (
+            df.iloc[index - 1]["person_scene"] == "Scene"
+        ):
+            context = []
+            new_row = {"answer": row["dialogue"], "question": "", "context": context}
+            scripts = scripts.append(new_row, ignore_index=True)
+    # load reworked data to pkl
+    scripts.to_pickle("data/scripts.pkl")
+def encode_df_save(model):
+    """this functions vectorizes reworked scripts and loads them to
+    pickle file to be used as retrieval base for ranking script"""
+    scripts_reopened = pd.read_pickle("data/scripts.pkl")
+    vect_data = []
+    for index, row in scripts_reopened.iterrows():
+        vect = encode(row["question"], model, row["context"])
+        vect_data.append(vect)
+    with open("data/scripts_vectors.pkl", "wb") as f:
+        pickle.dump(vect_data, f)
+def top_candidates(score_lst_sorted, top=1):
+    """this functions receives results of the cousine similarity ranking and
+    returns top items' scores and their indices"""
+    scores = [item[0] for item in score_lst_sorted]
+    candidates_indexes = [item[1][0] for item in score_lst_sorted]
+    return scores[0:top], candidates_indexes[0:top]
+def candidates_reranking(
+    top_candidates_idx_lst, conversational_history, utterance, initial_df, pipeline
+):
+    """this function applies trained bert classifier to identified candidates and
+    returns their updated rank"""
+    reranked_idx = {}
+    for idx in top_candidates_idx_lst:
+        combined_text = (
+            " ".join(conversational_history)
+            + " [SEP] "
+            + utterance
+            + " [SEP] "
+            + initial_df.iloc[idx]["answer"]
+        )
+        prediction = pipeline(combined_text)
+        if prediction[0]["label"] == "LABEL_0":
+            reranked_idx[idx] = prediction[0]["score"]
+    return reranked_idx
+def read_files_negative(path1, path2):
+    """this functions creates training dataset for classifier incl negative
+    examples and saves it to the pickle file"""
+    star_wars = []
+    for file in path1:
+        star_wars.append(pd.read_csv(file, sep='"', on_bad_lines="warn"))
+    total = pd.concat(star_wars, ignore_index=True)
+    rick_and_morty = pd.read_csv(path2)
+    negative_lines_to_add = list(rick_and_morty["line"])
+    negative_lines_to_add.extend(list(total["dialogue"]))
+    scripts_reopened = pd.read_pickle("data/scripts.pkl")
+    scripts_reopened["label"] = 0
+    source = random.sample(
+        list(scripts_reopened[scripts_reopened["question"] != ""]["question"]), 7062
+    )
+    negative_lines_to_add.extend(source)
+    random.shuffle(negative_lines_to_add)
+    scripts_negative = scripts_reopened[["question", "context"]]
+    scripts_negative["label"] = 1
+    scripts_negative["answer"] = negative_lines_to_add[0 : len(scripts_negative)]
+    fin_scripts = pd.concat([scripts_negative, scripts_reopened])
+    fin_scripts = fin_scripts.sample(frac=1).reset_index(drop=True)
+    fin_scripts["context"] = fin_scripts["context"].apply(lambda x: "".join(x))
+    fin_scripts = fin_scripts[fin_scripts["question"] != ""]
+    fin_scripts = fin_scripts[fin_scripts["answer"] != ""]
+    fin_scripts["combined"] = (
+        fin_scripts["context"]
+        + "[SEP]"
+        + fin_scripts["question"]
+        + "[SEP]"
+        + fin_scripts["answer"]
+    )
+    # fin_scripts = fin_scripts.dropna(how='any')
+    fin_scripts.to_pickle("data/scripts_for_reranker.pkl")