# Source: Hugging Face Space by JoPmt -- commit "Update app.py" (d9eb9f8, verified)
from huggingface_hub import login, InferenceClient
import os, gc, time, random, datetime, json, re
# Credentials are read from the process environment.
HF_TOKEN = os.getenv('HF_TOKEN')
SERP_API_KEY = os.getenv('SERP_KEY')
# login(token=None) falls back to an interactive token prompt, which cannot be
# answered in a headless deployment; only authenticate when a token is set.
if HF_TOKEN:
    login(token=HF_TOKEN)
import gradio as gr
from transformers import CodeAgent, Tool, ToolCollection, load_tool, ReactCodeAgent, ReactJsonAgent
from transformers.agents import PythonInterpreterTool
from langchain.memory import ConversationBufferMemory
import bs4
import requests
from llm_engine import HfEngine
import datasets
import spaces
import tqdm
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import VectorStore
from transformers.agents.prompts import DEFAULT_REACT_CODE_SYSTEM_PROMPT, DEFAULT_REACT_JSON_SYSTEM_PROMPT
from transformers.agents.default_tools import Tool, PythonInterpreterTool
from duckduckgo_search import DDGS
from web_surfer import (SearchInformationTool, NavigationalSearchTool, VisitTool, DownloadTool, PageUpTool, PageDownTool, FinderTool, FindNextTool, ArchiveSearchTool,)
from mdconvert import MarkdownConverter
from visual_qa import VisualQATool, VisualQAGPT4Tool
# Module-level flags. These are plain Python names: nothing in this file
# copies them into os.environ.
# NOTE(review): the names match environment variables used by Hugging Face
# tooling -- presumably they were meant as environment settings; confirm.
HF_HUB_DISABLE_TELEMETRY = 1
DO_NOT_TRACK = 1
HF_HUB_ENABLE_HF_TRANSFER = 0
def search_ducky(query):
    """Return the concatenated ``body`` text of DuckDuckGo text results.

    Runs a text search for ``query`` (at most 10 results) and joins the
    ``body`` field of every hit with no separator. Returns an empty string
    when the search yields nothing.
    """
    with DDGS() as ddgs:
        hits = list(ddgs.text(query, max_results=10))
    return ''.join(hit['body'] for hit in hits)
# Load the "train" split of the documentation dataset used as knowledge base.
knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")

# Wrap every row in a Document; the second "/"-separated segment of the row's
# "source" field is kept as the document's source label.
source_docs = [
    Document(
        page_content=row["text"],
        metadata={"source": row["source"].split("/")[1]},
    )
    for row in knowledge_base
]

# Split into chunks (chunk_size=500) and keep only the first 1000 chunks.
docs_processed = RecursiveCharacterTextSplitter(chunk_size=500).split_documents(
    source_docs
)[:1000]

# Embed the chunks and index them in a FAISS vector store.
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
vectordb = FAISS.from_documents(
    documents=docs_processed,
    embedding=embedding_model,
)

# Distinct source labels present in the indexed chunks.
all_sources = list({chunk.metadata["source"] for chunk in docs_processed})
print(all_sources)
class RetrieverTool(Tool):
    """Agent tool that runs a similarity search against a vector store.

    The tool takes a free-text ``query`` and an optional ``source`` filter
    and returns the page content of the three closest documents as one
    formatted string.
    """

    name = "retriever"
    description = "Retrieves some documents from the knowledge base that have the closest embeddings to the input query."
    inputs = {
        "query": {
            "type": "text",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        },
        "source": {
            "type": "text",
            # Placeholder: __init__ fills this in with the list of sources.
            "description": ""
        },
    }
    output_type = "text"

    def __init__(self, vectordb: VectorStore, all_sources: list, **kwargs):
        """Bind the tool to a vector store and advertise the valid sources.

        Args:
            vectordb: Vector store queried by ``forward``.
            all_sources: Source labels interpolated into the description of
                the ``source`` input.
            **kwargs: Forwarded unchanged to ``Tool.__init__``.
        """
        super().__init__(**kwargs)
        self.vectordb = vectordb
        # Copy the schema per instance so that writing the description below
        # does not mutate the dict shared by the class and other instances.
        self.inputs = {arg: dict(spec) for arg, spec in self.inputs.items()}
        self.inputs["source"]["description"] = (f"The source of the documents to search, as a str representation of a list. Possible values in the list are: {all_sources}. If this argument is not provided, all sources will be searched.")

    def forward(self, query: str, source: str = None) -> str:
        """Return the three documents closest to ``query``.

        Args:
            query: Search text; must be a ``str``.
            source: Optional filter. Either a single source name, or the
                string representation of a list of names (single or double
                quoted). A real list/tuple is accepted as well. A falsy
                value disables filtering.

        Returns:
            The joined page contents of the hits, or a hint to drop the
            source filter when nothing matched.

        Raises:
            AssertionError: If ``query`` is not a string.
            json.JSONDecodeError: If a list-like ``source`` string cannot be
                parsed.
        """
        assert isinstance(query, str), "Your search query must be a string"
        if source:
            if isinstance(source, str) and "[" not in source:
                # A bare name is wrapped directly; sending it through
                # str()/quote replacement/json would fail on names that
                # contain quote characters.
                source = [source]
            elif isinstance(source, (list, tuple)):
                source = list(source)
            else:
                # String form of a list: normalise single quotes to JSON.
                source = json.loads(str(source).replace("'", '"'))
        docs = self.vectordb.similarity_search(query, filter=({"source": source} if source else None), k=3)
        if not docs:
            return "No documents found with this filtering. Try removing the source filter."
        return "Retrieved documents:\n\n" + "\n===Document===\n".join(doc.page_content for doc in docs)
# Conversation buffer keyed as "chat_history"; it is not referenced again in
# the visible part of this file.
memory = ConversationBufferMemory(
    memory_key="chat_history",
)
# LLM backend shared by both agents defined further down.
llm_engine = HfEngine(
    model="Jopmt/JoPmt",
)
##gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
##prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
##tools = [StableDiffusionTool().langchain, ImageCaptioningTool().langchain, StableDiffusionPromptGeneratorTool().langchain, TextToVideoTool().langchain]
##tools=[prompt_generator_tool(), image_generation_tool(), PythonInterpreterTool()]
class SearchTool(Tool):
    """Tool that hands a natural-language question to ``websurfer_agent``."""

    name = "ask_search_agent"
    output_type = "text"
    description = "A search agent that will browse the internet to answer a question. Use it to gather informations, not for problem-solving."
    inputs = {
        "question": {
            "description": "Your question, as a natural language sentence. You are talking to an agent, so provide them with as much context as possible.",
            "type": "text",
        }
    }

    def forward(self, question: str) -> str:
        """Run the module-level web-surfing agent on ``question``."""
        answer = websurfer_agent.run(question)
        return answer
# Tools offered to the top-level code agent.
tools = [
    PythonInterpreterTool(),
    SearchTool(),
    RetrieverTool(vectordb, all_sources),
]

# Modules the code agent is allowed to import in the code it runs.
# NOTE(review): 'os' is on this list -- confirm that is intended.
additional_authorized_imports = [
    'requests',
    'bs4',
    'os',
    'time',
    'datetime',
    'json',
    're',
]

# Browsing tools for the web-surfing agent that SearchTool delegates to.
WEB_TOOLS = [
    SearchInformationTool(),
    NavigationalSearchTool(),
    VisitTool(),
    DownloadTool(),
    PageUpTool(),
    PageDownTool(),
    FinderTool(),
    FindNextTool(),
    ArchiveSearchTool(),
]

# Both agents share the same LLM engine and are capped at one iteration.
websurfer_agent = ReactJsonAgent(
    tools=WEB_TOOLS,
    llm_engine=llm_engine,
    add_base_tools=True,
    max_iterations=1,
)
reagent = ReactCodeAgent(
    tools=tools,
    llm_engine=llm_engine,
    add_base_tools=True,
    max_iterations=1,
    additional_authorized_imports=additional_authorized_imports,
)
def plix(inut, progress=gr.Progress(track_tqdm=True)):
    """Run the code agent on the prompt ``inut`` and return its result.

    ``progress`` is a Gradio progress tracker created with
    ``track_tqdm=True``; it is not used inside the body.
    """
    return reagent.run(inut)
# Candidate themes are kept as zero-argument factories so that only the
# randomly selected one is built. Building every candidate up front would
# also load both hub-hosted themes on each start, although one theme is used.
_theme_factories = [
    gr.themes.Monochrome,
    lambda: gr.themes.Base.from_hub("gradio/seafoam"),
    lambda: gr.themes.Base.from_hub("freddyaboulton/dracula_revamped"),
    gr.themes.Glass,
    gr.themes.Base,
]

# UI: one prompt box, one button, one read-only output box.
with gr.Blocks(theme=random.choice(_theme_factories)(), analytics_enabled=False) as iface:
    out = gr.Textbox(label="🤗Output", lines=5, interactive=False)
    inut = gr.Textbox(label="Prompt")
    btn = gr.Button("GENERATE")
    # Clicking the button sends the prompt through plix and shows the result.
    btn.click(fn=plix, inputs=inut, outputs=out)

# Queue and launch settings are passed through exactly as configured.
iface.queue(max_size=1, api_open=False)
iface.launch(max_threads=20, inline=False, show_api=False)