Spaces:

JoPmt
/

Quantized_Web_RWKV_agents_RAG_tools_1

Running

App Files Files Community

Quantized_Web_RWKV_agents_RAG_tools_1 / app.py

JoPmt

Rename app (10).py to app.py

7e3715e verified 3 months ago

raw

history blame

6.04 kB

	from huggingface_hub import login, InferenceClient
	import os, gc, time, random, datetime, json, re
	# Credentials are read from the environment; either value is None when unset.
	HF_TOKEN = os.getenv('HF_TOKEN')
	# NOTE(review): SERP_API_KEY is not referenced anywhere else in the visible code.
	SERP_API_KEY = os.getenv('SERP_KEY')
	# Authenticate only when a token is configured: login(token=None) falls back
	# to an interactive prompt, which cannot be answered in a headless deployment.
	if HF_TOKEN:
	    login(token=HF_TOKEN)
	import gradio as gr
	from transformers import CodeAgent, Tool, ToolCollection, load_tool, ReactCodeAgent, ReactJsonAgent
	from transformers.agents import PythonInterpreterTool
	from langchain.memory import ConversationBufferMemory
	import bs4
	import requests
	from llm_engine import HfEngine
	import datasets
	import spaces
	import tqdm
	from langchain_huggingface.embeddings import HuggingFaceEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain.docstore.document import Document
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_core.vectorstores import VectorStore
	from transformers.agents.prompts import DEFAULT_REACT_CODE_SYSTEM_PROMPT, DEFAULT_REACT_JSON_SYSTEM_PROMPT
	from transformers.agents.default_tools import Tool, PythonInterpreterTool
	from duckduckgo_search import DDGS
	from web_surfer import (SearchInformationTool, NavigationalSearchTool, VisitTool, DownloadTool, PageUpTool, PageDownTool, FinderTool, FindNextTool, ArchiveSearchTool,)
	from mdconvert import MarkdownConverter
	from visual_qa import VisualQATool, VisualQAGPT4Tool
	# Privacy / transfer switches. The module-level names are kept for backward
	# compatibility, but a plain Python assignment is not visible to any library,
	# so the same values are also exported as environment variables.
	# NOTE(review): libraries imported above may already have read their settings
	# at import time; for full effect this export likely has to run before those
	# imports (or be configured in the deployment environment) -- confirm.
	HF_HUB_DISABLE_TELEMETRY = 1
	DO_NOT_TRACK = 1
	HF_HUB_ENABLE_HF_TRANSFER = 0
	os.environ["HF_HUB_DISABLE_TELEMETRY"] = str(HF_HUB_DISABLE_TELEMETRY)
	os.environ["DO_NOT_TRACK"] = str(DO_NOT_TRACK)
	os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = str(HF_HUB_ENABLE_HF_TRANSFER)
	def search_ducky(query):
	    """Run a DuckDuckGo text search and return the result bodies as one string.

	    Up to 10 results are requested. The ``body`` field of every result is
	    concatenated with no separator; an empty string is returned when the
	    search yields nothing.
	    """
	    with DDGS() as session:
	        hits = list(session.text(query, max_results=10))
	        return ''.join(hit['body'] for hit in hits)
	# Build the retrieval index at import time: load the documentation dataset,
	# wrap each row in a Document, split into chunks, embed and index with FAISS.
	knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
	source_docs = [
	    Document(
	        page_content=doc["text"],
	        # The second "/"-separated component of the row's source is the label.
	        metadata={"source": doc["source"].split("/")[1]},
	    )
	    for doc in knowledge_base
	]
	# Only the first 1000 chunks produced by the splitter are kept and indexed.
	docs_processed = RecursiveCharacterTextSplitter(chunk_size=500).split_documents(source_docs)[:1000]
	embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
	vectordb = FAISS.from_documents(documents=docs_processed, embedding=embedding_model)
	# Distinct source labels present in the indexed chunks (order is not defined).
	all_sources = list({chunk.metadata["source"] for chunk in docs_processed})
	print(all_sources)
	class RetrieverTool(Tool):
	    """Agent tool that runs a similarity search against a vector store.

	    The description of the ``source`` input is filled in per instance with
	    the list of sources handed to the constructor.
	    """

	    name = "retriever"
	    description = "Retrieves some documents from the knowledge base that have the closest embeddings to the input query."
	    inputs = {
	        "query": {
	            "type": "text",
	            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
	        },
	        "source": {
	            "type": "text",
	            "description": ""
	        },
	    }
	    output_type = "text"

	    def __init__(self, vectordb: VectorStore, all_sources: list, **kwargs):
	        """Store the vector store and advertise the available sources.

	        Args:
	            vectordb: Vector store queried by ``forward``.
	            all_sources: Source labels interpolated into the ``source``
	                input description.
	            **kwargs: Forwarded unchanged to ``Tool.__init__``.
	        """
	        super().__init__(**kwargs)
	        self.vectordb = vectordb
	        # Give this instance its own ``inputs`` mapping instead of writing
	        # into the class-level dict, which is shared by every instance.
	        source_spec = dict(self.inputs["source"])
	        source_spec["description"] = (f"The source of the documents to search, as a str representation of a list. Possible values in the list are: {all_sources}. If this argument is not provided, all sources will be searched.")
	        self.inputs = {**self.inputs, "source": source_spec}

	    @staticmethod
	    def _parse_sources(source):
	        """Normalize a non-empty ``source`` argument into a list of labels."""
	        # A real sequence needs no parsing at all.
	        if isinstance(source, (list, tuple)):
	            return list(source)
	        # A plain name is wrapped directly; it is not round-tripped through
	        # JSON, so names containing quote characters no longer fail.
	        if isinstance(source, str) and "[" not in source:
	            return [source]
	        # String form of a list: accept Python-style single quotes by
	        # converting them to JSON double quotes before decoding.
	        return json.loads(str(source).replace("'", '"'))

	    def forward(self, query: str, source: str = None) -> str:
	        """Return the 3 closest documents for ``query`` as one text block.

	        Args:
	            query: Search text; must be a ``str``.
	            source: Optional source filter, either a single label or the
	                string form of a list of labels. Falsy means no filter.

	        Returns:
	            The page contents of the hits joined by a separator line, or a
	            hint message when the search returns nothing.

	        Raises:
	            AssertionError: If ``query`` is not a string.
	            json.JSONDecodeError: If a list-like ``source`` cannot be decoded.
	        """
	        assert isinstance(query, str), "Your search query must be a string"

	        sources = self._parse_sources(source) if source else None
	        docs = self.vectordb.similarity_search(query, filter=({"source": sources} if sources else None), k=3)

	        if not docs:
	            return "No documents found with this filtering. Try removing the source filter."
	        return "Retrieved documents:\n\n" + "\n===Document===\n".join(doc.page_content for doc in docs)
	# Conversation buffer stored under the "chat_history" key.
	# NOTE(review): `memory` is not referenced anywhere else in the visible code.
	memory = ConversationBufferMemory(memory_key="chat_history")
	# Engine object passed as `llm_engine` to both agents constructed later in this file.
	llm_engine = HfEngine(model="Jopmt/JoPmt")
	# NOTE(review): the four lines below are disabled code; the tool classes they
	# name are not imported in the visible part of the file.
	##gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool()
	##prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool)
	##tools = [StableDiffusionTool().langchain, ImageCaptioningTool().langchain, StableDiffusionPromptGeneratorTool().langchain, TextToVideoTool().langchain]
	##tools=[prompt_generator_tool(), image_generation_tool(), PythonInterpreterTool()]
	class SearchTool(Tool):
	    """Tool that hands a natural-language question to the web-surfing agent."""

	    name = "ask_search_agent"
	    description = "A search agent that will browse the internet to answer a question. Use it to gather informations, not for problem-solving."
	    inputs = {
	        "question": {
	            "description": "Your question, as a natural language sentence. You are talking to an agent, so provide them with as much context as possible.",
	            "type": "text",
	        }
	    }
	    output_type = "text"

	    def forward(self, question: str) -> str:
	        """Run ``question`` through the module-level ``websurfer_agent``.

	        The agent is looked up when the tool is called, so it only has to
	        exist by the time ``forward`` runs, not when the class is defined.
	        """
	        answer = websurfer_agent.run(question)
	        return answer
	# Tools offered to the code agent: a Python interpreter, the web-search
	# delegate and the documentation retriever.
	tools = [
	    PythonInterpreterTool(),
	    SearchTool(),
	    RetrieverTool(vectordb, all_sources),
	]
	# Modules passed to the code agent as additionally authorized imports.
	additional_authorized_imports = [
	    'requests',
	    'bs4',
	    'os',
	    'time',
	    'datetime',
	    'json',
	    're',
	]
	# Browsing tools given to the JSON agent that SearchTool delegates to.
	WEB_TOOLS = [
	    SearchInformationTool(),
	    NavigationalSearchTool(),
	    VisitTool(),
	    DownloadTool(),
	    PageUpTool(),
	    PageDownTool(),
	    FinderTool(),
	    FindNextTool(),
	    ArchiveSearchTool(),
	]
	# Both agents share the same engine and are limited to a single iteration.
	websurfer_agent = ReactJsonAgent(
	    tools=WEB_TOOLS,
	    llm_engine=llm_engine,
	    add_base_tools=True,
	    max_iterations=1,
	)
	reagent = ReactCodeAgent(
	    tools=tools,
	    llm_engine=llm_engine,
	    add_base_tools=True,
	    max_iterations=1,
	    additional_authorized_imports=additional_authorized_imports,
	)
	def plix(inut, progress=gr.Progress(track_tqdm=True)):
	    """Gradio click handler: run the prompt through ``reagent`` and return its result.

	    Args:
	        inut: Prompt text taken from the input textbox.
	        progress: Gradio progress tracker created with ``track_tqdm=True``;
	            it is not used directly in the body.
	    """
	    return reagent.run(inut)
	# Candidate themes are kept as zero-argument factories so that only the
	# randomly selected one is constructed. Building all five up front would
	# also perform both from_hub lookups even when their result is discarded.
	_theme_factories = [
	    gr.themes.Monochrome,
	    lambda: gr.themes.Base.from_hub("gradio/seafoam"),
	    lambda: gr.themes.Base.from_hub("freddyaboulton/dracula_revamped"),
	    gr.themes.Glass,
	    gr.themes.Base,
	]
	with gr.Blocks(theme=random.choice(_theme_factories)(), analytics_enabled=False) as iface:
	    # Output box is declared first, followed by the prompt box and the button.
	    out = gr.Textbox(label="🤗Output", lines=5, interactive=False)
	    inut = gr.Textbox(label="Prompt")
	    btn = gr.Button("GENERATE")
	    # Clicking the button sends the prompt to plix and shows what it returns.
	    btn.click(fn=plix, inputs=inut, outputs=out)
	# Queue capped at one pending request, with API access switched off.
	iface.queue(max_size=1, api_open=False)
	iface.launch(max_threads=20, inline=False, show_api=False)