gkrishnan committed on
Commit
dd738af
1 Parent(s): b9578f7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -0
app.py CHANGED
@@ -6,8 +6,20 @@ from langchain.chains import RetrievalQA
6
  from transformers import AutoTokenizer
7
  import pickle
8
  import os
 
9
  from langchain.document_loaders import BSHTMLLoader, DirectoryLoader
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  bshtml_dir_loader = DirectoryLoader('./data/', loader_cls = BSHTMLLoader)
12
 
13
  data = bshtml_dir_loader.load()
 
6
  from transformers import AutoTokenizer
7
  import pickle
8
  import os
9
+ import shutil
10
  from langchain.document_loaders import BSHTMLLoader, DirectoryLoader
11
 
12
+ os.system("git clone https://github.com/TheMITTech/shakespeare")  # "!cmd" is IPython-only syntax and a SyntaxError in a plain .py file; shell out explicitly
13
+
14
+ from glob import glob
15
+ files = glob("./shakespeare/**/*.html")  # NOTE(review): without recursive=True, "**" behaves like "*" (exactly one directory level) - confirm this is intended
16
+
17
+ os.makedirs('./data', exist_ok=True)  # tolerate an existing ./data so a rerun does not raise FileExistsError
18
+ destination_folder = './data/'
19
+
20
+ for html_file in files:
21
+ shutil.move(html_file, destination_folder + html_file.split("/")[-1])  # index the split result (the file's basename), not the "/" literal
22
+
23
  bshtml_dir_loader = DirectoryLoader('./data/', loader_cls = BSHTMLLoader)
24
 
25
  data = bshtml_dir_loader.load()