import os

import gradio as gr
from openai import OpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain_core.prompts import PromptTemplate

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""

TOKEN = os.getenv("HF_TOKEN")


def load_embedding_model():
    """Load the BAAI/bge-m3 embedding model on CPU, without normalising the embeddings."""
    encode_kwargs = {"normalize_embeddings": False}
    model_kwargs = {"device": "cpu"}
    return HuggingFaceEmbeddings(
        model_name="BAAI/bge-m3",
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )


# OpenAI-compatible client pointed at the Hugging Face Inference API.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=TOKEN,
)


def qwen_api(user_message, top_p=0.9, temperature=0.7, system_message="",
             max_tokens=1024, gradio_history=[]):
    """Send the Gradio chat history plus the new user message to the model and
    return the full response text."""
    history = []
    # System message (if any) goes first, then the past user/assistant turns.
    if system_message != "":
        history.append({"role": "system", "content": system_message})
    for past in gradio_history:
        if past:
            history.append({"role": "user", "content": past[0]})
            history.append({"role": "assistant", "content": past[1]})
    history.append({"role": "user", "content": user_message})

    response = ""
    for chunk in client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        messages=history,
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
    return response


# Route LangChain's ChatOpenAI through the same Hugging Face endpoint.
os.environ["OPENAI_API_BASE"] = "https://api-inference.huggingface.co/v1/"
os.environ["OPENAI_API_KEY"] = TOKEN

llm = ChatOpenAI(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    temperature=0.8,
)
embedding = load_embedding_model()
db = Chroma(
    persist_directory="./VecterStore2_512_txt/VecterStore2_512_txt",
    embedding_function=embedding,
)

prompt_template = """
{context}
The above content is biological background knowledge. Please answer the question strictly according to this background knowledge, and attach the DOI number of the information source in your answer.
Question: {question}
Answer in English:"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)
chain_type_kwargs = {"prompt": PROMPT}
retriever = db.as_retriever()

# "stuff" chain: the retrieved chunks are inserted into {context} of the prompt above.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)


def chat(message, history: list[tuple[str, str]], system_message, max_tokens,
         temperature, top_p):
    """First turn goes through the RetrievalQA chain; follow-up turns go
    straight to the chat model together with the accumulated history."""
    if len(history) == 0:
        response = qa.invoke(message)["result"]
    else:
        response = qwen_api(
            message,
            top_p=top_p,
            temperature=temperature,
            system_message=system_message,
            max_tokens=max_tokens,
            gradio_history=history,
        )
    print(response)
    yield response


def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
            temperature, top_p):
    """Plain streaming chat handler (no retrieval); not wired to the interface below."""
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})

    response = ""
    for chunk in client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        messages=messages,
    ):
        token = chunk.choices[0].delta.content
        if token:
            response += token
            yield response


chatbot = gr.Chatbot(height=600)

demo = gr.ChatInterface(
    fn=chat,
    fill_height=True,
    chatbot=chatbot,
    additional_inputs=[
        gr.Textbox(label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
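
# ---------------------------------------------------------------------------
# Note: this script assumes the Chroma store under
# './VecterStore2_512_txt/VecterStore2_512_txt' has already been built and
# persisted. Below is a minimal sketch of how such a store could be created,
# using the DirectoryLoader and CharacterTextSplitter imports from the original
# script. The source directory './papers_txt' and the 512-character chunk size
# (suggested by the directory name) are assumptions; adjust them to the actual
# corpus and splitter settings.
#
# from langchain_community.document_loaders import DirectoryLoader
# from langchain.text_splitter import CharacterTextSplitter
#
# def build_vector_store(source_dir="./papers_txt",
#                        persist_directory="./VecterStore2_512_txt/VecterStore2_512_txt"):
#     # Load every .txt file under source_dir, split it into ~512-character
#     # chunks, embed the chunks with the same BGE-M3 model, and persist them.
#     docs = DirectoryLoader(source_dir, glob="**/*.txt").load()
#     chunks = CharacterTextSplitter(chunk_size=512, chunk_overlap=0).split_documents(docs)
#     return Chroma.from_documents(
#         chunks,
#         embedding=load_embedding_model(),
#         persist_directory=persist_directory,
#     )
# ---------------------------------------------------------------------------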