import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from transformers import pipeline
from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE
from langchain.docstore.document import Document
from pytube import YouTube
from dotenv import load_dotenv
from pathlib import Path
import os
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

tokenizer = tiktoken.get_encoding('cl100k_base')

# Token-count length function used by the text splitter.
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

def save_audio_file(url):
    """Download the audio stream of a YouTube video and write its metadata to yt_transcription.txt."""
    try:
        yt_obj = YouTube(url)
        metadata = f"Title: {yt_obj.title}, Total Time: {yt_obj.length} seconds, Number of views: {yt_obj.views}, Rating: {yt_obj.rating}"

        metadata += f"""Description: {yt_obj.description} Uploader: {yt_obj.author} Upload Date: {yt_obj.publish_date}
                        Thumbnail URL: {yt_obj.thumbnail_url}
                        Channel URL: {yt_obj.channel_url}
                        Age Restricted: {yt_obj.age_restricted}

"""
        # Start the transcription file with the video metadata.
        with open("yt_transcription.txt", "w") as f:
            f.write(metadata)

        # itag 139 is a low-bitrate, audio-only (m4a) stream.
        yt_audio_stream = yt_obj.streams.get_by_itag(139)
        yt_audio_stream.download(output_path=".", filename="yt_audio.mp4")
    except Exception as e:
        print(f"Connection Error: {e}")

def get_audio_transcription():
    """Transcribe yt_audio.mp4 with Whisper and append the text to yt_transcription.txt."""
    whisper = pipeline("automatic-speech-recognition",
                       "openai/whisper-tiny.en")

    # Chunked long-form transcription: 30 s windows with a 5 s stride on each side.
    transcription = whisper("yt_audio.mp4",
                            chunk_length_s=30,
                            stride_length_s=5,
                            batch_size=8)

    with open("yt_transcription.txt", "a") as f:
        f.write(transcription["text"])

def get_vectorstore():
    """Embed the transcription with BGE embeddings and index it in a Chroma vector store."""
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    with open("yt_transcription.txt", "r") as f:
        data = f.read()

    # Token-based splitting so each chunk stays within the configured token budget.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""]
    )

    all_splits = text_splitter.split_text(data)
    docs = [Document(page_content=t) for t in all_splits]
    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore, docs
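

# Usage sketch (assumption: the calling app runs these helpers in this order;
# the "__main__" guard and the example URL below are illustrative only and not
# part of the original application flow).
if __name__ == "__main__":
    example_url = "https://www.youtube.com/watch?v=<video_id>"  # hypothetical placeholder URL
    save_audio_file(example_url)           # download audio + write metadata
    get_audio_transcription()              # append the Whisper transcription
    vectorstore, docs = get_vectorstore()  # build the Chroma index over the transcript
    print(f"Indexed {len(docs)} chunks")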