"""Download the FSDL 2022 lecture notes and chunk them, with subtitles, into texts."""
import string
import subprocess
from pathlib import Path

DOCS_FOLDER = Path("documents")


def download_lectures(docs_folder=DOCS_FOLDER):
    """Download the markdown source of every lecture into ``docs_folder``.

    Each lecture is saved as ``lecture-NN.md`` (zero-padded index). Lectures
    whose file already exists are not downloaded again. Downloads shell out to
    ``wget`` and are best-effort: a failed download does not raise.

    Args:
        docs_folder: Target directory (``str`` or ``Path``); created if missing.
    """
    docs_folder = Path(docs_folder)
    docs_folder.mkdir(parents=True, exist_ok=True)

    lecture_md_urls = list_lecture_md_urls(get_lecture_titles())
    for idx, url in lecture_md_urls.items():
        target = docs_folder / "lecture-{}.md".format(str(idx).zfill(2))
        if target.exists():
            continue
        result = subprocess.run(
            ["wget", "-O", str(target), url],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
        # A failed wget may leave an empty or partial file behind; remove it so
        # the existence check above does not skip this lecture on the next run.
        if result.returncode != 0 and target.exists():
            target.unlink()


def list_lecture_md_urls(lecture_titles):
    """Map each lecture index to the raw GitHub URL of its ``index.md``.

    Args:
        lecture_titles: Mapping of lecture index to the lecture's URL slug.

    Returns:
        A dict with the same keys whose values are the markdown URLs.
    """
    lecture_md_url_base = (
        "https://raw.githubusercontent.com/full-stack-deep-learning/website/main/docs/course/2022/"
    )
    return {
        idx: lecture_md_url_base + title + "/index.md"
        for idx, title in lecture_titles.items()
    }


def get_lecture_titles():
    """Return the mapping of lecture index to lecture URL slug."""
    return {
        1: "lecture-1-course-vision-and-when-to-use-ml",
        2: "lecture-2-development-infrastructure-and-tooling",
        3: "lecture-3-troubleshooting-and-testing",
        4: "lecture-4-data-management",
        5: "lecture-5-deployment",
        6: "lecture-6-continual-learning",
        7: "lecture-7-foundation-models",
        8: "lecture-8-teams-and-pm",
        9: "lecture-9-ethics",
    }


def produce_documents(docs_folder=DOCS_FOLDER):
    """Build text chunks and their source metadata from the files in ``docs_folder``.

    Assumes the documents are on disk already.

    Args:
        docs_folder: Directory (``str`` or ``Path``) holding the lecture
            ``.md`` files and the ``.srt`` subtitle files; created if missing.

    Returns:
        A ``(texts, metadatas)`` tuple of equally long lists. Lecture chunks
        come first, followed by one entry per subtitle. Every metadata entry
        is a dict with a single ``"source"`` URL.
    """
    docs_folder = Path(docs_folder)
    docs_folder.mkdir(parents=True, exist_ok=True)

    lecture_texts, lecture_metadatas = _load_lecture_chunks(docs_folder)
    srt_texts, srt_metadatas = _load_srt_chunks(docs_folder)
    return lecture_texts + srt_texts, lecture_metadatas + srt_metadatas


def _index_from_filename(filename):
    """Return the integer formed by all digits in ``filename`` (``lecture-02.md`` -> 2)."""
    return int("".join(char for char in filename if char in string.digits))


def _load_lecture_chunks(docs_folder):
    """Split every lecture markdown file into chunks tagged with the lecture's website URL."""
    # Third-party import kept local so the download helpers work without langchain.
    from langchain.text_splitter import CharacterTextSplitter

    website_url_base = "https://fullstackdeeplearning.com/course/2022/"
    lecture_titles = get_lecture_titles()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    # Only markdown files count as lecture notes (a subtitle file may also have
    # "lecture" in its name); sorting makes the output order deterministic.
    filenames = sorted(
        path.name
        for path in docs_folder.iterdir()
        if "lecture" in path.name and path.name.endswith(".md")
    )

    # Keyed by index, so two files resolving to the same index keep only the last.
    lecture_texts = {}
    for filename in filenames:
        idx = _index_from_filename(filename)
        lecture_texts[idx] = (docs_folder / filename).read_text(encoding="utf-8")

    texts, metadatas = [], []
    for idx, lecture_text in lecture_texts.items():
        source_url = website_url_base + lecture_titles[idx]
        for chunk in text_splitter.split_text(lecture_text):
            texts.append(chunk)
            metadatas.append({"source": source_url})
    return texts, metadatas


def _load_srt_chunks(docs_folder):
    """Turn every ``.srt`` file into one text per subtitle, tagged with a timestamped video URL."""
    # Third-party import kept local so the download helpers work without srt.
    import srt

    srt_urls = get_srt_urls()
    filenames = sorted(
        path.name for path in docs_folder.iterdir() if path.name.endswith(".srt")
    )

    texts, metadatas = [], []
    for filename in filenames:
        idx = _index_from_filename(filename)
        # Read the file verbatim: re-joining readlines() output with "\n"
        # would double every line break before the text reaches the parser.
        srt_text = (docs_folder / filename).read_text(encoding="utf-8")
        subtitles = list(srt.parse(srt_text))
        file_texts, file_metadatas = create_srt_texts_and_metadatas(
            subtitles, srt_urls[idx]
        )
        texts += file_texts
        metadatas += file_metadatas
    return texts, metadatas


def create_srt_texts_and_metadatas(subtitles, base_url):
    """Convert subtitles into stripped texts and timestamped source URLs.

    Args:
        subtitles: Iterable of objects with a ``content`` string and a
            ``start`` timedelta.
        base_url: Video URL to which ``&t=<seconds>s`` is appended.

    Returns:
        A ``(texts, metadatas)`` tuple of equally long lists; each metadata
        entry is ``{"source": <url>}``.
    """
    query_params_format = "&t={start}s"

    texts, metadatas = [], []
    for subtitle in subtitles:
        start = timestamp_from_timedelta(subtitle.start)
        url = base_url + query_params_format.format(start=start)
        texts.append(subtitle.content.strip())
        metadatas.append({"source": url})
    return texts, metadatas


def timestamp_from_timedelta(timedelta):
    """Return the duration of ``timedelta`` in whole seconds, truncated toward zero."""
    return int(timedelta.total_seconds())


def get_srt_urls():
    """Return the mapping of lecture index to the lecture's YouTube URL."""
    return {
        1: "https://www.youtube.com/watch?v=-Iob-FW5jVM",
        2: "https://www.youtube.com/watch?v=BPYOsDCZbno",
        3: "https://www.youtube.com/watch?v=RLemHNAO5Lw",
        4: "https://www.youtube.com/watch?v=Jlm4oqW41vY",
        5: "https://www.youtube.com/watch?v=W3hKjXg7fXM",
        6: "https://www.youtube.com/watch?v=nra0Tt3a-Oc",
        7: "https://www.youtube.com/watch?v=Rm11UeGwGgk",
        8: "https://www.youtube.com/watch?v=a54xH6nT4Sw",
        9: "https://www.youtube.com/watch?v=7FQpbYTqjAA",
    }


def main():
    """Download the lectures, build the documents, and print the last text and metadata."""
    download_lectures()
    texts, metadatas = produce_documents()
    # Nothing to show when no documents were found on disk.
    if texts:
        print(texts[-1])
        print(metadatas[-1])


if __name__ == "__main__":
    main()