Spaces:
Runtime error
Runtime error
pr/1 (#1)
Browse files- removed streamlit (63c9094e90c4a6e1b2a4dfa7a25d90ed0028856e)
- chatbot widget with flask, socket, html, js, css (a2b71cf094e99b79543d5cfd30c6a45cdb608609)
- git ignore (07de00b304e66f34dfe7a47ad7e6ce8b711b588f)
- added index file (964b90ef783e63834ccd6bfc17087dd74d3177b2)
- .gitattributes +1 -0
- .gitignore +2 -0
- README.md +1 -12
- __pycache__/app.cpython-39.pyc +0 -0
- __pycache__/main.cpython-39.pyc +0 -0
- __pycache__/utils.cpython-39.pyc +0 -0
- app.py +24 -0
- main.py +22 -0
- open_ai.index +3 -0
- open_ai.pkl +3 -0
- requirements.txt +7 -0
- static/chatbot.js +72 -0
- static/style.css +152 -0
- templates/index.html +27 -0
- utils.py +136 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.index filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
env/
|
2 |
+
.idea
|
README.md
CHANGED
@@ -1,12 +1 @@
|
|
1 |
-
|
2 |
-
title: Makerlab Bot
|
3 |
-
emoji: π
|
4 |
-
colorFrom: purple
|
5 |
-
colorTo: green
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.17.0
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# makerlab-bot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__pycache__/app.cpython-39.pyc
ADDED
Binary file (776 Bytes). View file
|
|
__pycache__/main.cpython-39.pyc
ADDED
Binary file (1.14 kB). View file
|
|
__pycache__/utils.cpython-39.pyc
ADDED
Binary file (4.14 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from flask import Flask, render_template
|
2 |
+
from flask_socketio import SocketIO, emit
|
3 |
+
from main import run
|
4 |
+
|
5 |
+
import os

# Flask application and its Socket.IO wrapper, shared by the handlers below.
app = Flask(__name__)
# Read the signing key from the environment so a real secret never has to be
# committed. The literal fallback keeps local runs working when SECRET_KEY is
# unset; it must not be relied on in a deployed instance.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'secret!')
socketio = SocketIO(app)
|
8 |
+
|
9 |
+
|
10 |
+
@app.route('/')
def index():
    """Serve the chat widget page by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
|
13 |
+
|
14 |
+
|
15 |
+
@socketio.on('message')
def handle_message(data):
    """Answer one chat question received over the socket.

    Args:
        data: Event payload; expected to be a mapping with a string
            ``'question'`` entry, as sent by the browser client.

    Emits:
        A ``'response'`` event whose payload is ``{'response': <text>}``.
        Malformed or empty payloads get a short prompt instead of raising
        inside the event handler.
    """
    question = data.get('question') if isinstance(data, dict) else None
    if not isinstance(question, str) or not question.strip():
        # Reject bad input explicitly; the original indexing raised
        # KeyError/TypeError here and the client never got a reply.
        emit('response', {'response': 'Please enter a question.'})
        return

    print("question: " + question)
    response = run(question)
    emit('response', {'response': response})
|
21 |
+
|
22 |
+
|
23 |
+
# Start the Socket.IO-enabled server only when this file is run as a script,
# not when it is imported.
if __name__ == '__main__':
    socketio.run(app)
|
main.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain
|
2 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
3 |
+
from langchain.llms import OpenAI
|
4 |
+
|
5 |
+
from utils import generate_answer
|
6 |
+
from utils import get_search_index
|
7 |
+
|
8 |
+
# File names handed to get_search_index() for the persisted search index.
open_ai_pkl = "open_ai.pkl"
open_ai_index = "open_ai.index"

# Language model used to build the question-answering chain.
gpt_3_5 = OpenAI(model_name='gpt-3.5-turbo', temperature=0)

# Embedding client handed to get_search_index().
open_ai_embeddings = OpenAIEmbeddings()
|
14 |
+
|
15 |
+
# Lazily created on the first question and reused afterwards, so each request
# does not re-read the pickled index from disk or rebuild the chain.
_search_index = None
_qa_chain = None


def run(question):
    """Answer ``question`` using the persisted search index and the QA chain.

    Args:
        question: The user's question text.

    Returns:
        Whatever ``generate_answer`` returns for the question.
    """
    global _search_index, _qa_chain

    if _search_index is None:
        _search_index = get_search_index(open_ai_pkl, open_ai_index, open_ai_embeddings)

    if _qa_chain is None:
        _qa_chain = load_qa_with_sources_chain(gpt_3_5, chain_type="stuff", verbose=True)

    return generate_answer(_qa_chain, _search_index, question)
|
open_ai.index
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1d25013b3bf7b7195d01ec1cc9ac1527638d8db68d94556b3dcc69b7dd8ff704
|
3 |
+
size 3016749
|
open_ai.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9c903018b7b3ad6f89b802f0e36fc92c88fb793c4f6e2499687b8823050a4df0
|
3 |
+
size 3373815
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
faiss-cpu
|
2 |
+
langchain
|
3 |
+
beautifulsoup4
|
4 |
+
PyPDF2
|
5 |
+
openai
|
6 |
+
flask
|
7 |
+
flask-socketio
|
static/chatbot.js
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
$(document).ready(function() {
    // Cache the widget's DOM nodes once.
    var $messages = $('.chat-messages');
    var $input = $('.chat-input input');
    var $submit = $('.chat_submit');
    // Capturing group so String.split() keeps the matched URLs in its result.
    var URL_PATTERN = /(https?:\/\/[^\s]+)/g;

    $messages.children().each(function() {
        $(this).addClass('chat-message');
    });

    // io() with no URL connects to the origin that served this page, so the
    // widget also works over https instead of forcing a hard-coded http:// URL.
    var socket = io();

    // Send one question to the Flask-SocketIO backend.
    function sendMessage(message) {
        console.log("message: " + message);
        socket.emit('message', {'question': message});
    }

    // Append a chat bubble. URLs become links; everything else is inserted as
    // plain text so neither user input nor bot output can inject markup.
    function displayMessage(message, isUser) {
        var text = (message === undefined || message === null) ? '' : String(message);
        var $message = $('<div>').addClass('chat-message round');
        var $messageText = $('<p>');

        // With a capturing group, odd positions of the split are the URLs.
        text.split(URL_PATTERN).forEach(function(part, position) {
            if (position % 2 === 1) {
                $messageText.append($('<a>').attr('href', part).text(part));
            } else if (part !== '') {
                $messageText.append(document.createTextNode(part));
            }
        });

        $message.append($messageText);
        $message.addClass(isUser ? 'user' : 'bot');

        // A jQuery object is always truthy; test .length to know whether the
        // message list actually exists before falling back to the container.
        if ($messages.length) {
            $messages.append($message);
            $messages.animate({scrollTop: $messages[0].scrollHeight}, 300);
        } else {
            $('.chat-container').append($message);
            $('.chat-container').animate({scrollTop: 0}, 300);
        }
    }

    // Show each answer pushed back by the server.
    socket.on('response', function(data) {
        console.log("Received response: " + data.response);
        displayMessage(data.response, false);
    });

    // Send message on submit
    $submit.click(function(event) {
        event.preventDefault();
        var message = $input.val().trim();
        console.log("Submit clicked: " + message);
        if (message !== '') {
            displayMessage(message, true);
            sendMessage(message);
            $input.val('');
        }
    });

    // Send message on enter key press
    $input.keydown(function(event) {
        if (event.key === 'Enter' || event.keyCode === 13) {
            event.preventDefault();
            $submit.click();
        }
    });

    // Initial message
    displayMessage('Ask me anything');
});
|
static/style.css
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.chat-container {
|
2 |
+
position: fixed;
|
3 |
+
bottom: 30px;
|
4 |
+
right: 30px;
|
5 |
+
z-index: 999;
|
6 |
+
background-color: #fff;
|
7 |
+
border-radius: 10px;
|
8 |
+
box-shadow: 0px 0px 20px rgba(0, 0, 0, 0.2);
|
9 |
+
max-width: 400px;
|
10 |
+
min-width: 300px;
|
11 |
+
}
|
12 |
+
|
13 |
+
.round {
|
14 |
+
border-radius: 10px;
|
15 |
+
-webkit-border-radius: 10px;
|
16 |
+
-moz-border-radius: 30px;
|
17 |
+
|
18 |
+
}
|
19 |
+
|
20 |
+
.chat-header {
|
21 |
+
display: flex;
|
22 |
+
align-items: center;
|
23 |
+
justify-content: space-between;
|
24 |
+
padding: 10px;
|
25 |
+
background-color: rgb(113, 239, 234);
|
26 |
+
color: #fff;
|
27 |
+
border-top-left-radius: 10px;
|
28 |
+
border-top-right-radius: 10px;
|
29 |
+
}
|
30 |
+
|
31 |
+
.chat-header h4 {
|
32 |
+
margin: 0;
|
33 |
+
}
|
34 |
+
|
35 |
+
.chat-close {
|
36 |
+
cursor: pointer;
|
37 |
+
}
|
38 |
+
|
39 |
+
.chat-body {
|
40 |
+
height: 300px;
|
41 |
+
overflow-y: scroll;
|
42 |
+
padding: 10px;
|
43 |
+
word-wrap: break-word;
|
44 |
+
display:flex;
|
45 |
+
flex-direction: column;
|
46 |
+
}
|
47 |
+
|
48 |
+
.chat-message {
|
49 |
+
margin: 10px;
|
50 |
+
}
|
51 |
+
|
52 |
+
.chat-message p {
|
53 |
+
margin: 0;
|
54 |
+
padding: 10px;
|
55 |
+
font-size: 16px;
|
56 |
+
line-height: 1.4;
|
57 |
+
position: relative;
|
58 |
+
word-wrap: break-word;
|
59 |
+
border-radius: 10px;
|
60 |
+
}
|
61 |
+
|
62 |
+
.chat-message.user {
|
63 |
+
display: flex;
|
64 |
+
align-self: flex-end;
|
65 |
+
justify-content: flex-end;
|
66 |
+
text-align: right;
|
67 |
+
align-items: center;
|
68 |
+
background-color: rgba(113, 239, 234, 0.75);
|
69 |
+
border-top-right-radius: 0px;
|
70 |
+
border-bottom-right-radius: 0px;
|
71 |
+
border-bottom-left-radius: 10px;
|
72 |
+
word-wrap: break-word;
|
73 |
+
color: #000;
|
74 |
+
}
|
75 |
+
|
76 |
+
|
77 |
+
.chat-message.bot {
|
78 |
+
display: flex;
|
79 |
+
align-self: flex-start;
|
80 |
+
justify-content: flex-start;
|
81 |
+
text-align: left;
|
82 |
+
align-items: center;
|
83 |
+
background-color: rgba(113, 239, 234, 0.75);
|
84 |
+
border-top-left-radius: 0px;
|
85 |
+
border-bottom-right-radius: 10px;
|
86 |
+
border-bottom-left-radius: 0px;
|
87 |
+
word-wrap: break-word;
|
88 |
+
}
|
89 |
+
|
90 |
+
.chat-message.bot p {
|
91 |
+
margin: 0;
|
92 |
+
padding: 10px;
|
93 |
+
font-size: 16px;
|
94 |
+
line-height: 1.4;
|
95 |
+
position: relative;
|
96 |
+
word-wrap: break-word;
|
97 |
+
border-radius: 10px;
|
98 |
+
overflow-wrap: anywhere;
|
99 |
+
}
|
100 |
+
|
101 |
+
.chat-message.user:after {
|
102 |
+
content: "";
|
103 |
+
position: relative;
|
104 |
+
top: 0;
|
105 |
+
right: -15px;
|
106 |
+
width: 0;
|
107 |
+
height: 0;
|
108 |
+
border-top: 15px solid transparent;
|
109 |
+
border-bottom: 15px solid transparent;
|
110 |
+
border-left: 15px solid #71EFEABF;
|
111 |
+
border-top-right-radius: 10px;
|
112 |
+
}
|
113 |
+
|
114 |
+
.chat-message.bot:before {
|
115 |
+
content: "";
|
116 |
+
position: relative;
|
117 |
+
top: 0;
|
118 |
+
left: -15px;
|
119 |
+
width: 0;
|
120 |
+
height: 0;
|
121 |
+
border-top: 15px solid transparent;
|
122 |
+
border-bottom: 15px solid transparent;
|
123 |
+
border-right: 15px solid #71EFEABF;
|
124 |
+
border-top-left-radius: 10px;
|
125 |
+
}
|
126 |
+
|
127 |
+
|
128 |
+
.chat-input {
|
129 |
+
display: flex;
|
130 |
+
margin-top: 10px;
|
131 |
+
}
|
132 |
+
|
133 |
+
.chat-input input {
|
134 |
+
flex-grow: 1;
|
135 |
+
border: none;
|
136 |
+
border-radius: 5px;
|
137 |
+
padding: 8px 10px;
|
138 |
+
font-size: 16px;
|
139 |
+
margin-right: 10px;
|
140 |
+
box-shadow: 0px 0px 5px rgba(0, 0, 0, 0.1);
|
141 |
+
}
|
142 |
+
|
143 |
+
.chat-input button {
|
144 |
+
background-color: #FFA500;
|
145 |
+
color: #fff;
|
146 |
+
border: none;
|
147 |
+
border-radius: 5px;
|
148 |
+
padding: 8px 10px;
|
149 |
+
font-size: 16px;
|
150 |
+
cursor: pointer;
|
151 |
+
box-shadow: 0px 0px 5px rgba(0, 0, 0, 0.1);
|
152 |
+
}
|
templates/index.html
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html>
|
3 |
+
<head>
|
4 |
+
<meta charset="utf-8">
|
5 |
+
<title>MakerlabX3DPrinting QA</title>
|
6 |
+
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css">
|
7 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
|
8 |
+
<link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
|
9 |
+
</head>
|
10 |
+
<body>
|
11 |
+
<div class="chat-container">
|
12 |
+
<div class="chat-header">
|
13 |
+
<h4>Makerlab Q&A Bot</h4>
|
14 |
+
<i class="fa fa-close chat-close"></i>
|
15 |
+
</div>
|
16 |
+
<div class="chat-body chat-messages round"></div>
|
17 |
+
<div class="chat-input">
|
18 |
+
<input type="text" placeholder="Type your message">
|
19 |
+
<button class="chat_submit">Send</button>
|
20 |
+
</div>
|
21 |
+
</div>
|
22 |
+
<!--<script src="https://cdnjs.cloudflare.com/ajax/libs/socket.io/4.5.1/socket.io.js" integrity="sha512-sY2t8W1xNQ2yB+1RFXJv+wwhdN7CHX9Z+fhM7JH/3B3q1x7VJBOwKe+zb7VW0EC8XG5M5rjBQd7+47F5fQlhKQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>-->
|
23 |
+
<script src="https://cdn.socket.io/4.4.1/socket.io.min.js"></script>
|
24 |
+
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
|
25 |
+
<script src="{{ url_for('static', filename='chatbot.js') }}"></script>
|
26 |
+
</body>
|
27 |
+
</html>
|
utils.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pickle
|
3 |
+
import time
|
4 |
+
from urllib.parse import urlparse, urljoin
|
5 |
+
|
6 |
+
import faiss
|
7 |
+
import requests
|
8 |
+
from PyPDF2 import PdfReader
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
from langchain.docstore.document import Document
|
11 |
+
from langchain.text_splitter import CharacterTextSplitter
|
12 |
+
from langchain.vectorstores.faiss import FAISS
|
13 |
+
|
14 |
+
# Source label stored in the metadata of every page extracted from book_file.
book_url = 'https://g.co/kgs/2VFC7u'
# PDF file opened by get_document_data().
book_file = "Book.pdf"
# Index page whose links are crawled by get_website_data().
url = 'https://makerlab.illinois.edu/'
|
17 |
+
def get_search_index(pickle_file, index_file, embeddings):
    """Return the search index, loading it from disk when a cached copy exists.

    The cached copy is used when both files exist and the pickle file is
    non-empty. Otherwise the index is rebuilt from the source documents,
    written to ``index_file`` and pickled to ``pickle_file``.

    Args:
        pickle_file: Path of the pickled search index.
        index_file: Path the raw FAISS index is written to on rebuild.
        embeddings: Embedding object forwarded to ``search_index_from_docs``.

    Returns:
        The loaded or freshly built search index.
    """
    cache_ready = (
        os.path.isfile(pickle_file)
        and os.path.isfile(index_file)
        and os.path.getsize(pickle_file) > 0
    )
    if cache_ready:
        # NOTE(review): unpickling can execute arbitrary code; this is only
        # safe while pickle_file is produced by this application itself.
        with open(pickle_file, "rb") as cached:
            return pickle.load(cached)

    chunks = create_chunk_documents()
    built_index = search_index_from_docs(chunks, embeddings=embeddings)

    faiss.write_index(built_index.index, index_file)
    with open(pickle_file, "wb") as out:
        pickle.dump(built_index, out)

    return built_index
|
35 |
+
|
36 |
+
|
37 |
+
def create_chunk_documents():
    """Fetch all source documents and split them into chunks for embedding.

    Empty chunks are dropped. A chunk of 1000 characters or more is split once
    more and replaced by its non-empty pieces.

    The result is built in a new list. The previous version removed from and
    extended the list it was iterating, which skipped elements, kept the
    oversized chunk alongside its pieces, and could loop forever on a chunk
    the splitter cannot shorten.

    Returns:
        A list of chunk documents.
    """
    sources = fetch_data_for_embeddings(url, book_file, book_url)

    splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)

    source_chunks = []
    for chunk in splitter.split_documents(sources):
        if not chunk.page_content:
            # Covers both None and '' without concatenating None to a string.
            print("removing chunk: " + (chunk.page_content or ""))
            continue

        print("Size of chunk: " + str(len(chunk.page_content) + len(chunk.metadata)))

        if len(chunk.page_content) >= 1000:
            print("splitting document")
            # Split once; pieces are appended as-is and never re-examined, so
            # an unsplittable chunk cannot cause unbounded growth.
            source_chunks.extend(
                piece
                for piece in splitter.split_documents([chunk])
                if piece.page_content
            )
        else:
            source_chunks.append(chunk)

    return source_chunks
|
55 |
+
|
56 |
+
|
57 |
+
def fetch_data_for_embeddings(url, book_file, book_url):
    """Collect the documents to embed: website pages first, then book pages.

    Args:
        url: Index URL passed to ``get_website_data``.
        book_file: PDF path passed to ``get_document_data``.
        book_url: Source label passed to ``get_document_data``.

    Returns:
        The list returned by ``get_website_data`` with the book documents
        appended to it.
    """
    documents = get_website_data(url)
    documents += get_document_data(book_file, book_url)
    return documents
|
61 |
+
|
62 |
+
def get_website_data(index_url):
    """Crawl the pages linked from ``index_url`` and return their documents.

    Reads the hrefs found on the index page, resolves and filters them into
    absolute links, then downloads the content behind those links.
    """
    hrefs = get_paths(index_url)
    absolute_links = get_links(index_url, hrefs)
    return get_content_from_links(absolute_links, index_url)
|
70 |
+
|
71 |
+
|
72 |
+
def get_content_from_links(links, index_url, timeout=10):
    """Download each in-site link and wrap its text in a ``Document``.

    Args:
        links: Iterable of absolute URLs; duplicates are fetched once.
        index_url: Only links starting with this prefix are fetched.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``Document`` objects, one per successfully fetched page,
        each with ``{"source": <link>}`` metadata.
    """
    content_list = []
    # sorted() makes the crawl order reproducible; bare set order is not.
    for link in sorted(set(links)):
        if not link.startswith(index_url):
            continue

        try:
            # Without a timeout a stalled server would hang the index build.
            response = requests.get(link, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # One broken page should not abort the crawl, and an error page
            # must not be indexed as content.
            print("skipping " + link + ": " + str(exc))
        else:
            soup = BeautifulSoup(response.content, "html.parser")
            content = soup.get_text(separator="\n")
            content_list.append(Document(page_content=content, metadata={"source": link}))

        # Pause between requests to avoid hammering the site.
        time.sleep(1)

    return content_list
|
90 |
+
|
91 |
+
|
92 |
+
def get_paths(index_url, timeout=10):
    """Return the set of ``href`` values of all anchors on the index page.

    Args:
        index_url: URL of the page to scan for links.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A set of raw href strings (relative or absolute, unfiltered).

    Raises:
        requests.RequestException: If the page cannot be fetched or the server
            answers with an error status, so an error page is never mistaken
            for the index.
    """
    response = requests.get(index_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    return {anchor.get('href') for anchor in soup.find_all('a', href=True)}
|
97 |
+
|
98 |
+
|
99 |
+
def get_links(index_url, paths):
    """Resolve hrefs against ``index_url`` and keep the crawlable ones.

    A link is kept when its scheme is http or https and its host does not
    contain "squarespace". Fragments (``#section``) are stripped and duplicates
    dropped, so in-page anchors do not cause the same page to be fetched and
    embedded more than once.

    Args:
        index_url: Base URL used to resolve relative hrefs.
        paths: Iterable of href strings.

    Returns:
        A list of unique absolute URLs, in first-seen order.
    """
    from urllib.parse import urldefrag

    links = []
    seen = set()
    for path in paths:
        absolute = urldefrag(urljoin(index_url, path)).url
        parsed_url = urlparse(absolute)
        if parsed_url.scheme not in ("http", "https"):
            continue
        if "squarespace" in parsed_url.netloc:
            continue
        if absolute in seen:
            continue
        seen.add(absolute)
        links.append(absolute)
    return links
|
107 |
+
|
108 |
+
|
109 |
+
def get_document_data(book_file, book_url):
    """Read a PDF and return one ``Document`` per page.

    Args:
        book_file: Path of the PDF file to read.
        book_url: Value stored as the ``"source"`` metadata of every page.

    Returns:
        A list of ``Document`` objects holding the extracted page text.
    """
    with open(book_file, 'rb') as pdf_handle:
        reader = PdfReader(pdf_handle)
        # Extract while the file is still open.
        document_list = [
            Document(page_content=page.extract_text(), metadata={"source": book_url})
            for page in reader.pages
        ]

    return document_list
|
120 |
+
|
121 |
+
def search_index_from_docs(source_chunks, embeddings):
    """Build a FAISS search index from chunk documents.

    Args:
        source_chunks: Documents exposing ``page_content`` and ``metadata``.
        embeddings: Embedding object passed to ``FAISS.from_texts``.

    Returns:
        The index created by ``FAISS.from_texts``.
    """
    texts = [doc.page_content for doc in source_chunks]
    metadatas = [doc.metadata for doc in source_chunks]
    return FAISS.from_texts(texts, embeddings, metadatas=metadatas)
|
126 |
+
def generate_answer(chain, index, question):
    """Run the QA chain over the documents most similar to ``question``.

    Args:
        chain: Callable invoked with the input mapping and
            ``return_only_outputs=True``; its result must contain
            ``"output_text"``.
        index: Object whose ``similarity_search(question, k=4)`` supplies the
            input documents.
        question: The question text.

    Returns:
        The ``"output_text"`` entry of the chain's result.
    """
    relevant_docs = index.similarity_search(question, k=4)
    chain_inputs = {
        "input_documents": relevant_docs,
        "question": question,
    }
    result = chain(chain_inputs, return_only_outputs=True)
    return result["output_text"]
|