"""Download a YouTube video's audio, transcribe it, and index the text in Chroma."""

import logging
import os
from pathlib import Path

import tiktoken
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pytube import YouTube
from transformers import pipeline

from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE

logger = logging.getLogger(__name__)

# Load environment variables from a ".env" file in the current directory.
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

tokenizer = tiktoken.get_encoding('cl100k_base')

# Working files shared by the three pipeline steps below (relative to the CWD).
_TRANSCRIPT_PATH = "yt_transcription.txt"
_AUDIO_FILENAME = "yt_audio.mp4"
# Stream selected for download.
# NOTE(review): presumably an audio-only stream; confirm against the itag table.
_AUDIO_ITAG = 139


def tiktoken_len(text):
    """Return the number of ``cl100k_base`` tokens in *text*.

    Used as the length function of the text splitter so chunk sizes are
    measured in tokens rather than characters.
    """
    # disallowed_special=() turns off tiktoken's special-token check so that
    # arbitrary transcript text never raises during encoding.
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)


def save_audio_file(url):
    """Write the video's metadata to the transcript file and download its audio.

    The metadata is written first (overwriting any previous transcript file),
    then the stream with itag ``_AUDIO_ITAG`` is downloaded as
    ``_AUDIO_FILENAME`` in the current working directory.

    This is best-effort: any ordinary failure is logged with its traceback and
    reported as "Connection Error" on stdout; nothing is raised to the caller.

    Args:
        url: URL of the YouTube video.
    """
    try:
        yt_obj = YouTube(url)
        # NOTE(review): there is no separator between the Rating value and
        # "Description:"; the text is kept exactly as it was.
        metadata = (
            f"Title: {yt_obj.title}, Total Time: {yt_obj.length} seconds, "
            f"Number of views: {yt_obj.views}, Rating: {yt_obj.rating}"
            f"Description: {yt_obj.description} "
            f"Uploader: {yt_obj.author} "
            f"Upload Date: {yt_obj.publish_date} "
            f"Thumbnail URL: {yt_obj.thumbnail_url} "
            f"Channel URL: {yt_obj.channel_url} "
            f"Age Restricted: {yt_obj.age_restricted} "
        )
        # Explicit UTF-8: titles/descriptions routinely contain non-ASCII
        # characters that a platform-default codec may not be able to encode.
        with open(_TRANSCRIPT_PATH, "w", encoding="utf-8") as f:
            f.write(metadata)

        yt_audio_stream = yt_obj.streams.get_by_itag(_AUDIO_ITAG)
        if yt_audio_stream is None:
            # Surface the real cause instead of an AttributeError on None.
            raise LookupError(
                f"stream with itag {_AUDIO_ITAG} is not available for {url!r}"
            )
        yt_audio_stream.download("", _AUDIO_FILENAME)
    except Exception:
        # Keep the original user-facing message, but record what actually
        # went wrong; a bare ``except`` would also trap KeyboardInterrupt.
        logger.exception("Failed to fetch audio for %r", url)
        print("Connection Error")


def get_audio_transcription():
    """Transcribe the downloaded audio and append the text to the transcript file.

    Runs the ``openai/whisper-tiny.en`` speech-recognition pipeline over
    ``_AUDIO_FILENAME`` in 30-second chunks with a 5-second stride and appends
    the resulting text after whatever the transcript file already contains.
    """
    whisper = pipeline("automatic-speech-recognition", "openai/whisper-tiny.en")
    transcription = whisper(
        _AUDIO_FILENAME,
        chunk_length_s=30,
        stride_length_s=5,
        batch_size=8,
    )
    with open(_TRANSCRIPT_PATH, "a", encoding="utf-8") as f:
        f.write(transcription['text'])


def get_vectorstore():
    """Split the transcript file into token-bounded chunks and embed them.

    Returns:
        A ``(vectorstore, docs)`` tuple: a Chroma store built from the chunk
        texts with ``BAAI/bge-small-en`` embeddings (CPU, normalized), and a
        list of ``Document`` objects wrapping the same chunks in order.
    """
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

    # Context manager so the file handle is released even if reading fails.
    with open(_TRANSCRIPT_PATH, "r", encoding="utf-8") as f:
        data = f.read()

    # Chunk size and overlap are measured in tokens via tiktoken_len.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )
    all_splits = text_splitter.split_text(data)
    docs = [Document(page_content=t) for t in all_splits]
    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore, docs