"""Download a YouTube video's audio, transcribe it with Whisper, and index the transcript in a Chroma vectorstore."""
# Standard library
import os
from pathlib import Path

# Third-party
import tiktoken
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pytube import YouTube
from transformers import pipeline

# Local
from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE

# Load environment variables from a .env file in the working directory.
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

# Shared tokenizer used to measure text length in tokens (cl100k_base encoding).
tokenizer = tiktoken.get_encoding('cl100k_base')
def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    Passed to the text splitter as its length function so that chunk
    sizes are measured in tokens rather than characters.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
def save_audio_file(url):
    """Fetch a YouTube video's metadata and download its audio stream.

    Writes the video metadata to ``yt_transcription.txt`` (overwriting any
    previous contents; the transcription step appends to this file later)
    and downloads the audio-only stream (itag 139) to ``yt_audio.mp4`` in
    the current directory.

    Parameters
    ----------
    url : str
        Full URL of the YouTube video.
    """
    try:
        yt_obj = YouTube(url)
        metadata = f"Title: {yt_obj.title}, Total Time: {yt_obj.length} seconds, Number of views: {yt_obj.views}, Rating: {yt_obj.rating}"
        metadata += f"""Description: {yt_obj.description} Uploader: {yt_obj.author} Upload Date: {yt_obj.publish_date}
Thumbnail URL: {yt_obj.thumbnail_url}
Channel URL: {yt_obj.channel_url}
Age Restricted: {yt_obj.age_restricted}
"""
        # "w" mode deliberately overwrites previous runs; explicit utf-8
        # because titles/descriptions are frequently non-ASCII.
        with open("yt_transcription.txt", "w", encoding="utf-8") as f:
            f.write(metadata)
        # itag 139 selects an audio-only stream — TODO confirm availability
        # for all videos; get_by_itag returns None when the itag is absent.
        yt_audio_stream = yt_obj.streams.get_by_itag(139)
        yt_audio_stream.download("", "yt_audio.mp4")
    except Exception as e:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are not swallowed, and the actual failure is surfaced.
        print(f"Connection Error: {e}")
def get_audio_transcription():
    """Transcribe ``yt_audio.mp4`` and append the text to the transcript file.

    Runs the ``openai/whisper-tiny.en`` automatic-speech-recognition
    pipeline over the downloaded audio and appends the resulting text to
    ``yt_transcription.txt`` (created beforehand by ``save_audio_file``).
    """
    whisper = pipeline("automatic-speech-recognition",
                       "openai/whisper-tiny.en")
    transcription = whisper("yt_audio.mp4",
                            chunk_length_s=30,  # process audio in 30 s windows
                            stride_length_s=5,
                            batch_size=8)
    # Append ("a") so the metadata written earlier is preserved; explicit
    # utf-8 avoids UnicodeEncodeError on platforms with a narrow default codec.
    with open("yt_transcription.txt", "a", encoding="utf-8") as f:
        f.write(transcription['text'])
def get_vectorstore():
    """Build an in-memory Chroma vectorstore from ``yt_transcription.txt``.

    Splits the transcript into token-bounded chunks (sized with
    ``tiktoken_len``), embeds them with the BAAI/bge-small-en model on
    CPU with normalized embeddings, and indexes the chunks in Chroma.

    Returns
    -------
    tuple
        ``(vectorstore, docs)`` — the Chroma vectorstore and the same
        chunks wrapped as ``Document`` objects.
    """
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )
    # Context manager fixes the leaked file handle (the original opened the
    # file and never closed it); explicit utf-8 matches how it was written.
    with open("yt_transcription.txt", "r", encoding="utf-8") as f:
        data = f.read()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,             # measured in tokens via tiktoken_len
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )
    all_splits = text_splitter.split_text(data)
    docs = [Document(page_content=t) for t in all_splits]
    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore, docs