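# Voice-chat pipeline for this Space: Whisper transcribes speech, a LangChain
# RetrievalQA chain over the local 'profiles' documents answers questions, and
# AWS Polly renders the answer back to speech as an embeddable <audio> element.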
import os
import whisper
from io import BytesIO # BytesIO is a class in the io module that implements an in-memory file-like object.
import base64
import boto3 # AWS Polly
from pydub import AudioSegment # AudioSegment is a class in the pydub module that can be used to manipulate audio files.
from pydub.playback import play # play is a function in the pydub.playback module that can be used to play audio files.
import logging
import numpy as np
import openai
from langchain import OpenAI
from langchain.chains import RetrievalQA # RetrievalQA is a class in the langchain.chains module that can be used to build a retrieval-based question answering system.
from langchain.vectorstores import Chroma # Chroma is a class in the langchain.vectorstores module that can be used to store vectors.
from langchain.document_loaders import DirectoryLoader # DirectoryLoader loads every document under a directory.
from langchain.embeddings.openai import OpenAIEmbeddings # OpenAIEmbeddings wraps the OpenAI embedding API.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter # CharacterTextSplitter is a class in the langchain.text_splitter module that can be used to split text into chunks.
#import streamlit as st
from langchain.indexes import VectorstoreIndexCreator # VectorstoreIndexCreator builds a vector-store index from document loaders.
#from langchain.vectorstores import DocArrayInMemorySearch # in-memory vector store
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)
# FUNCTIONS
# get embeddings
@retry(wait=wait_random_exponential(min=21, max=60), stop=stop_after_attempt(100))
#@st.cache_data
def embedding_from_string(input: str, model: str) -> list:
    response = openai.Embedding.create(input=input, model=model)
    embedding = response["data"][0]["embedding"]
    return embedding
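# Illustrative usage of embedding_from_string (the input text below is a
# hypothetical example, not data used elsewhere in this file):
#   vector = embedding_from_string("Hello, world", "text-embedding-ada-002")
#   # text-embedding-ada-002 returns a 1536-dimensional embedding vector.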
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
AWS_ACCESS_KEY_ID = os.getenv('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.getenv('AWS_SECRET_ACCESS_KEY')
AWS_REGION_NAME = 'ap-south-1'
logging.basicConfig(level="INFO",
                    filename='conversations.log',
                    filemode='a',
                    format='%(asctime)s %(message)s',
                    datefmt='%H:%M:%S')
def buzz_user():
    input_prompt = AudioSegment.from_mp3('assets/timeout_audio.mp3')
    play(input_prompt)
def initialize_knowledge_base():
    loader = DirectoryLoader('profiles', glob='**/*.txt') # load every .txt file under the profiles directory
    docs = loader.load()

    #index = VectorstoreIndexCreator(
    #    vectorstore_cls=DocArrayInMemorySearch
    #).from_loaders([loader])

    char_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) # split text into 1000-character chunks with no overlap
    doc_texts = char_text_splitter.split_documents(docs) # split the loaded documents into smaller chunks

    # Embed each chunk of text
    #embeddings = []
    #openAI_embeddings = OpenAIEmbeddings()
    #for doc in doc_texts:
    #    text = str(doc)
    #    embedding = openAI_embeddings.embed_documents(text)
    #    embeddings.append(embedding)
    #    embedding = embedding_from_string(text, "text-embedding-ada-002")
    #    embeddings.append(embedding)
    #vStore = np.concatenate(embeddings, axis=0)

    embedding = HuggingFaceEmbeddings(model_name='shibing624/text2vec-base-chinese')
    #openAI_embeddings = OpenAIEmbeddings()
    vStore = Chroma.from_documents(doc_texts, embedding) # build a Chroma vector store from the document chunks and their embeddings

    conv_model = RetrievalQA.from_chain_type(
        llm=OpenAI(model_name="gpt-3.5-turbo-16k"),
        chain_type="stuff",
        retriever=vStore.as_retriever(
            search_kwargs={"k": 1}
        )
    )
    voice_model = whisper.load_model("tiny") # load Whisper's "tiny" model, the smallest and fastest speech-recognition checkpoint
    return conv_model, voice_model
def text_to_speech_gen(answer): # convert the answer text to speech with AWS Polly
    polly = boto3.client('polly',
                         aws_access_key_id=AWS_ACCESS_KEY_ID,
                         aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                         region_name=AWS_REGION_NAME)
    response = polly.synthesize_speech(
        Text=answer,
        #VoiceId='Matthew',
        VoiceId='Zhiyu',
        OutputFormat='mp3',
        #Engine="neural"
        Engine="standard")
    audio_stream = response['AudioStream'].read()
    audio_html = audio_to_html(audio_stream)
    return audio_html
def audio_to_html(audio_bytes): # embed raw MP3 bytes in an HTML <audio> element
    audio_io = BytesIO(audio_bytes)
    audio_io.seek(0)
    audio_base64 = base64.b64encode(audio_io.read()).decode("utf-8")
    audio_html = f'<audio src="data:audio/mpeg;base64,{audio_base64}" controls autoplay></audio>'
    return audio_html

def get_chat_history(user_message, history):
    return "", history + [[user_message, None]]