charlesfrye committed on
Commit
407c075
1 Parent(s): a08f3cd

rough and ready ask-fsdl backend

Browse files
app.py CHANGED
@@ -1,23 +1,24 @@
1
  import logging
2
  import os
 
 
 
 
3
 
4
  import gradio as gr
5
  import openai
6
 
 
 
 
 
7
 
8
- def greet(name):
9
 
10
- openai.api_key = os.getenv("OPENAI_API_KEY")
11
- result = openai.Completion.create(
12
- model="text-davinci-003",
13
- prompt="Say this is a test",
14
- max_tokens=7,
15
- temperature=0
16
- )
17
- logging.info(result)
18
- model_response = result["choices"][0]["text"]
19
- return "Hello " + name + "!!" + "\n\n" + model_response
20
 
21
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
22
 
23
  iface.launch()
 
1
import logging
import os
import sys

# Ensure the repo root is importable so `ask_fsdl` resolves when the app
# is launched from a different working directory setup.
if "." not in sys.path:
    sys.path.append(".")

import gradio as gr
import openai

import ask_fsdl

# One-time setup at import: configure OpenAI auth from the environment
# and build the retrieval-backed query runner.
openai.api_key = os.getenv("OPENAI_API_KEY")
runner = ask_fsdl.get_runner()


def go(query):
    """Answer `query` with the ask-fsdl runner, logging the response."""
    model_response = runner(query)
    logging.info(model_response)
    return model_response


iface = gr.Interface(fn=go, inputs="text", outputs="text")

iface.launch()
ask_fsdl/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dotenv import load_dotenv

# Load secrets (e.g. OPENAI_API_KEY) from a local .env file before any
# module that reads environment variables is imported.
load_dotenv()

import os

from . import make_docs
from .chainrunner import *
from .docstore import *


def get_runner(regenerate=False):
    """Build a query runner backed by a FAISS document index.

    Loads the index from the pickle at ``FaissDocumentStore.filename``
    when it exists; otherwise (or when ``regenerate`` is True) rebuilds
    it from the documents on disk and re-pickles it.

    Parameters
    ----------
    regenerate : bool
        Force rebuilding the index even if a pickle already exists.
    """
    from pathlib import Path

    index_path = Path(FaissDocumentStore.filename).resolve()

    if regenerate or not index_path.exists():
        texts, metadatas = make_docs.produce_documents()
        docsearch = FaissDocumentStore.from_texts(texts, metadatas)
        docsearch.to_pickle()
    else:
        docsearch = FaissDocumentStore.from_pickle()

    chain = StuffChain({"model_name": "text-davinci-003", "temperature": 0.0})

    return StuffChainRunner(chain, docsearch)
ask_fsdl/chainrunner.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+
3
+ load_dotenv()
4
+
5
+
6
class StuffChain:
    """Wraps a LangChain "stuff"-type question-answering-with-sources chain.

    Calling an instance delegates directly to the wrapped chain.
    """

    def __init__(self, model_kwargs):
        # Build the LLM first, then the QA chain on top of it.
        self.llm = self.make_llm(**model_kwargs)
        self.chain = self.make_chain(self.llm)

    def make_llm(self, model_name="text-davinci-003", temperature=0.):
        """Construct the OpenAI completion model the chain will use."""
        from langchain.llms import OpenAI

        return OpenAI(temperature=temperature, model_name=model_name)

    def make_chain(self, llm):
        """Load a QA-with-sources chain of type "stuff" around `llm`."""
        from langchain.chains.qa_with_sources import load_qa_with_sources_chain

        return load_qa_with_sources_chain(llm, chain_type="stuff")

    def __call__(self, *args, **kwargs):
        # Pure pass-through to the wrapped chain.
        return self.chain(*args, **kwargs)
28
+
29
+
30
class Runner:
    """Pairs a chain with a document store; subclasses define `__call__`."""

    def __init__(self, chain, docstore):
        self.chain = chain
        self.docstore = docstore

    def __call__(self, query):
        raise NotImplementedError


class StuffChainRunner(Runner):
    """Runner that "stuffs" all retrieved documents into a single prompt."""

    def __call__(self, query):
        # Retrieve candidate documents, then let the chain answer from them.
        candidates = self.docstore[query]
        response = self.chain(
            {"input_documents": candidates, "question": query},
            return_only_outputs=True,
        )

        return response["output_text"]
ask_fsdl/docstore.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.embeddings.openai import OpenAIEmbeddings
2
+ from langchain.vectorstores.faiss import FAISS
3
+
4
+
5
class DocumentStore:
    """Interface: index with a query string, get back relevant documents."""

    def __getitem__(self, query):
        raise NotImplementedError


class FaissDocumentStore(DocumentStore):
    """Document store backed by a FAISS similarity-search index."""

    # default on-disk location for the pickled index
    filename = "documents/stored.pkl"

    def __init__(self, store):
        self.store = store  # the vector store doing the actual search

    @classmethod
    def from_texts(cls, texts, metadatas):
        """Embed `texts` with OpenAI embeddings and index them in FAISS."""
        embeddings = OpenAIEmbeddings()  # uses ada-002 by default
        docsearch = FAISS.from_texts(texts, embeddings, metadatas=metadatas)

        return cls(docsearch)

    @classmethod
    def from_pickle(cls, filename=None):
        """Load a store previously saved with `to_pickle`.

        NOTE(review): pickle.load executes arbitrary code — only load
        pickles this application wrote itself.
        """
        import pickle

        if filename is None:
            filename = cls.filename

        with open(filename, "rb") as f:
            store = pickle.load(f)

        return cls(store)

    def to_pickle(self, filename=None):
        """Save the underlying store to disk with pickle.

        Fix: the destination directory is created if missing, so saving
        on a fresh checkout no longer raises FileNotFoundError.
        """
        import os
        import pickle

        if filename is None:
            filename = self.filename

        os.makedirs(os.path.dirname(os.path.abspath(filename)), exist_ok=True)

        with open(filename, "wb") as f:
            pickle.dump(self.store, f)

    def __getitem__(self, query):
        """Return the documents most similar to `query`."""
        return self.store.similarity_search(query)
ask_fsdl/main.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys

# Make the repo root importable when this module is run as a script.
sys.path.append(".")

import ask_fsdl


if __name__ == "__main__":
    # CLI entry point: download sources, build the runner, answer argv[1].
    # Fix: removed the redundant `import sys` inside this guard — the
    # module already imports sys at the top.
    ask_fsdl.make_docs.download_lectures()

    runner = ask_fsdl.get_runner()

    print(runner(sys.argv[1]))
ask_fsdl/make_docs.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path

# Where downloaded course materials live on disk.
DOCS_FOLDER = Path("documents")


def download_lectures(docs_folder=DOCS_FOLDER):
    """Fetch the 2022 FSDL lecture notes as Markdown into `docs_folder`.

    Already-downloaded lectures are skipped, so repeat calls are cheap.
    """
    import os
    import subprocess

    if not os.path.exists(docs_folder):
        os.makedirs(docs_folder, exist_ok=True)

    lecture_titles = get_lecture_titles()
    lecture_md_urls = list_lecture_md_urls(lecture_titles)

    for idx, url in lecture_md_urls.items():
        # Fix: build the destination under docs_folder rather than the
        # hard-coded "documents/" prefix, so the parameter is honored.
        filename = os.path.join(docs_folder, "lecture-{}.md".format(str(idx).zfill(2)))
        if not os.path.exists(filename):
            # quiet, shell-free download via wget
            subprocess.run(
                ["wget", "-O", filename, url],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )


def list_lecture_md_urls(lecture_titles):
    """Map lecture number -> raw-GitHub URL of that lecture's index.md."""
    lecture_md_url_base = "https://raw.githubusercontent.com/full-stack-deep-learning/website/main/docs/course/2022/"

    return {idx: lecture_md_url_base + title + "/index.md" for idx, title in lecture_titles.items()}


def get_lecture_titles():
    """Return the URL slug for each numbered lecture on the course site."""
    return {
        1: "lecture-1-course-vision-and-when-to-use-ml",
        2: "lecture-2-development-infrastructure-and-tooling",
        3: "lecture-3-troubleshooting-and-testing",
        4: "lecture-4-data-management",
        5: "lecture-5-deployment",
        6: "lecture-6-continual-learning",
        7: "lecture-7-foundation-models",
        8: "lecture-8-teams-and-pm",
        9: "lecture-9-ethics",
    }
44
+
45
+
46
def produce_documents(docs_folder=DOCS_FOLDER):
    """Build parallel (texts, metadatas) lists for indexing from disk.

    Reads lecture Markdown files (any filename containing "lecture") and
    subtitle .srt files out of `docs_folder`. Each text chunk is paired
    with a {"source": url} dict pointing at the course website (lectures)
    or a timestamped YouTube link (subtitles).

    Assumes the documents are on disk already (see download_lectures).

    Fix: removed unused imports (shutil, pathlib.Path) and the dead
    `source_urls_split` assignment whose value was never read.
    """
    import os
    import string

    import srt

    if not os.path.exists(docs_folder):
        os.makedirs(docs_folder, exist_ok=True)

    # NOTE(review): this substring match would also pick up an .srt file
    # with "lecture" in its name — confirm subtitles are named otherwise.
    lecture_md_filenames = [elem for elem in os.listdir(docs_folder) if "lecture" in elem]

    lecture_titles = get_lecture_titles()
    lecture_texts = {}

    for fn in lecture_md_filenames:
        # the lecture number is recovered from the digits in the filename
        idx = int("".join(elem for elem in fn if elem in string.digits))
        lecture_md_path = docs_folder / fn
        with open(lecture_md_path) as f:
            lecture_texts[idx] = f.read()

    from langchain.text_splitter import CharacterTextSplitter

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    lecture_texts_split = {idx: text_splitter.split_text(lecture_text) for idx, lecture_text in lecture_texts.items()}

    website_url_base = "https://fullstackdeeplearning.com/course/2022/"
    source_urls = {idx: website_url_base + title for idx, title in lecture_titles.items()}

    # flatten chunks, pairing each chunk with its lecture's website URL
    lecture_texts_flat = [split for lecture_text in lecture_texts_split.values() for split in lecture_text]
    source_urls_flat = [{"source": source_urls[idx]} for idx, lecture_text in lecture_texts_split.items() for split in lecture_text]

    srt_filenames = list(sorted([elem for elem in os.listdir(docs_folder) if elem.endswith(".srt")]))
    srt_urls = get_srt_urls()
    srt_texts_flat, srt_metadatas_flat = [], []

    for fn in srt_filenames:
        idx = int("".join(elem for elem in fn if elem in string.digits))
        srt_url = srt_urls[idx]

        srt_text_path = docs_folder / fn
        # NOTE(review): joining readlines() with "\n" doubles every
        # newline (readlines keeps the trailing "\n"); srt.parse appears
        # to tolerate it, but f.read() would be the faithful read — confirm.
        with open(srt_text_path) as f:
            srt_text = "\n".join(f.readlines())

        subtitles = list(srt.parse(srt_text))

        texts, metadatas = create_srt_texts_and_metadatas(subtitles, srt_url)
        srt_texts_flat += texts
        srt_metadatas_flat += metadatas

    texts_flat = lecture_texts_flat + srt_texts_flat
    metadatas_flat = source_urls_flat + srt_metadatas_flat

    return texts_flat, metadatas_flat
106
+
107
+
108
def create_srt_texts_and_metadatas(subtitles, base_url):
    """Turn parsed srt subtitles into (texts, metadatas) for indexing.

    Each subtitle's text is stripped of surrounding whitespace and paired
    with a metadata dict whose "source" is `base_url` plus a `&t=<secs>s`
    query parameter linking to the moment the subtitle starts.

    Fix: removed the dead `raw_text` assignment — only the stripped
    content was ever used.
    """
    query_params_format = "&t={start}s"
    texts, metadatas = [], []

    for subtitle in subtitles:
        text = subtitle.content.strip()
        start = timestamp_from_timedelta(subtitle.start)
        url = base_url + query_params_format.format(start=start)

        texts.append(text)
        metadatas.append({"source": url})

    return texts, metadatas


def timestamp_from_timedelta(timedelta):
    """Convert a timedelta to whole seconds (truncated toward zero)."""
    return int(timedelta.total_seconds())
126
+
127
+
128
def get_srt_urls():
    """Map lecture number to the YouTube video its subtitles came from."""
    video_ids = {
        1: "-Iob-FW5jVM",
        2: "BPYOsDCZbno",
        3: "RLemHNAO5Lw",
        4: "Jlm4oqW41vY",
        5: "W3hKjXg7fXM",
        6: "nra0Tt3a-Oc",
        7: "Rm11UeGwGgk",
        8: "a54xH6nT4Sw",
        9: "7FQpbYTqjAA",
    }

    watch_url = "https://www.youtube.com/watch?v={}"
    return {idx: watch_url.format(video_id) for idx, video_id in video_ids.items()}
140
+
141
+
142
if __name__ == "__main__":
    # Smoke test: fetch the sources, then show the last chunk + metadata.
    download_lectures()

    docs, metas = produce_documents()

    print(docs[-1])
    print(metas[-1])