import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from transformers import pipeline
from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE
from langchain.docstore.document import Document
from pytube import YouTube
from dotenv import load_dotenv
from pathlib import Path
import os
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)
tokenizer = tiktoken.get_encoding('cl100k_base')
# Token-based length function used by the text splitter below.
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)
def save_audio_file(url):
    try:
        yt_obj = YouTube(url)
        # Collect video metadata so it is stored alongside the transcript.
        metadata = f"Title: {yt_obj.title}, Total Time: {yt_obj.length} seconds, Number of views: {yt_obj.views}, Rating: {yt_obj.rating}"
        metadata += (
            f"\nDescription: {yt_obj.description}"
            f"\nUploader: {yt_obj.author}"
            f"\nUpload Date: {yt_obj.publish_date}"
            f"\nThumbnail URL: {yt_obj.thumbnail_url}"
            f"\nChannel URL: {yt_obj.channel_url}"
            f"\nAge Restricted: {yt_obj.age_restricted}\n"
        )
        with open("yt_transcription.txt", "w") as f:
            f.write(metadata)
        # itag 139 is YouTube's low-bitrate audio-only AAC stream.
        yt_audio_stream = yt_obj.streams.get_by_itag(139)
        yt_audio_stream.download("", "yt_audio.mp4")
    except Exception as e:
        print(f"Connection Error: {e}")
def get_audio_transcription():
    # Transcribe the downloaded audio with a Whisper pipeline; long audio is
    # processed in 30-second chunks with a 5-second stride between chunks.
    whisper = pipeline("automatic-speech-recognition",
                       "openai/whisper-tiny.en")
    transcription = whisper("yt_audio.mp4",
                            chunk_length_s=30,
                            stride_length_s=5,
                            batch_size=8)
    # Append the transcript after the metadata written by save_audio_file().
    with open("yt_transcription.txt", "a") as f:
        f.write(transcription["text"])
def get_vectorstore():
    # BGE embeddings on CPU, normalized so similarity scores behave as expected.
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )
    with open("yt_transcription.txt", "r") as f:
        data = f.read()
    # Split by token count (via tiktoken_len) so chunks fit the embedding model.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""]
    )
    all_splits = text_splitter.split_text(data)
    docs = [Document(page_content=t) for t in all_splits]
    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore, docs
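
# A minimal end-to-end usage sketch. This __main__ block is an illustrative
# assumption, not part of the original app; the URL and query are placeholders.
# Flow: download audio, transcribe it, then build the vector store for retrieval.
if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=<VIDEO_ID>"  # hypothetical URL
    save_audio_file(video_url)             # writes yt_transcription.txt + yt_audio.mp4
    get_audio_transcription()              # appends the Whisper transcript
    vectorstore, docs = get_vectorstore()  # Chroma index + Document chunks
    # Example retrieval over the indexed transcript (query is illustrative).
    results = vectorstore.similarity_search("What is the video about?", k=2)
    for doc in results:
        print(doc.page_content[:200])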