Commit · ac493ec
0 Parent(s)

First commit

Files changed:
- .gitignore +2 -0
- app.py +161 -0
- cfg.py +130 -0
- requirements.txt +2 -0
- rtd_scraper/__init__.py +0 -0
- rtd_scraper/scrape_rtd.py +71 -0
- rtd_scraper/scrapy.cfg +11 -0
- rtd_scraper/tutorial/__init__.py +0 -0
- rtd_scraper/tutorial/middlewares.py +102 -0
- rtd_scraper/tutorial/settings.py +102 -0
- rtd_scraper/tutorial/spiders/__init__.py +4 -0
- rtd_scraper/tutorial/spiders/docs_spider.py +44 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
outputs/
__pycache__/

app.py
ADDED
@@ -0,0 +1,161 @@
import logging
import os
from typing import Optional, Tuple

import gradio as gr
import pandas as pd
from buster.completers import Completion
from buster.utils import extract_zip

import cfg
from cfg import setup_buster

# Create a handler to control where log messages go (e.g., console, file)
handler = (
    logging.StreamHandler()
)  # Console output, you can change it to a file handler if needed

# Set the handler's level to INFO
handler.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)

# Check if an openai key is set as an env. variable
if os.getenv("OPENAI_API_KEY") is None:
    print(
        "Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'."
    )

# Typehint for chatbot history
ChatHistory = list[list[Optional[str], Optional[str]]]

buster = setup_buster(cfg.buster_cfg)


def add_user_question(
    user_question: str, chat_history: Optional[ChatHistory] = None
) -> ChatHistory:
    """Adds a user's question to the chat history.

    If no history is provided, the first element of the history will be the user conversation.
    """
    if chat_history is None:
        chat_history = []
    chat_history.append([user_question, None])
    return chat_history


def format_sources(matched_documents: pd.DataFrame) -> str:
    if len(matched_documents) == 0:
        return ""

    matched_documents.similarity_to_answer = (
        matched_documents.similarity_to_answer * 100
    )

    # drop duplicate pages (by title), keep highest ranking ones
    matched_documents = matched_documents.sort_values(
        "similarity_to_answer", ascending=False
    ).drop_duplicates("title", keep="first")

    documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
    document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"

    documents = "\n".join(
        [
            document_template.format(document=document)
            for _, document in matched_documents.iterrows()
        ]
    )
    footnote: str = "I'm a bot 🤖 and not always perfect."

    return documents_answer_template.format(documents=documents, footnote=footnote)


def add_sources(history, completion):
    if completion.answer_relevant:
        formatted_sources = format_sources(completion.matched_documents)
        history.append([None, formatted_sources])

    return history


def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]:
    """Answer a user's question using retrieval augmented generation."""

    # We assume that the question is the user's last interaction
    user_input = chat_history[-1][0]

    # Do retrieval + augmented generation with buster
    completion = buster.process_input(user_input)

    # Stream tokens one at a time to the user
    chat_history[-1][1] = ""
    for token in completion.answer_generator:
        chat_history[-1][1] += token

        yield chat_history, completion


demo = gr.Blocks()
with demo:
    with gr.Row():
        gr.Markdown("<h3><center>RAGTheDocs</center></h3>")

    chatbot = gr.Chatbot()

    with gr.Row():
        question = gr.Textbox(
            label="What's your question?",
            placeholder="Type your question here...",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary")

    examples = gr.Examples(
        examples=[
            "How can I install the library?",
            "How do I deal with noisy data?",
            "How do I deal with noisy data in 2 words?",
        ],
        inputs=question,
    )

    gr.Markdown(
        "This application uses GPT to search the docs for relevant info and answer questions."
    )

    response = gr.State()

    # fmt: off
    submit.click(
        add_user_question,
        inputs=[question],
        outputs=[chatbot]
    ).then(
        chat,
        inputs=[chatbot],
        outputs=[chatbot, response]
    ).then(
        add_sources,
        inputs=[chatbot, response],
        outputs=[chatbot]
    )

    question.submit(
        add_user_question,
        inputs=[question],
        outputs=[chatbot],
    ).then(
        chat,
        inputs=[chatbot],
        outputs=[chatbot, response]
    ).then(
        add_sources,
        inputs=[chatbot, response],
        outputs=[chatbot]
    )
    # fmt: on


demo.queue(concurrency_count=16)
demo.launch(share=False)
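
Note: the chat() handler above streams by yielding the entire chat history each time a token is appended, which is what makes the Gradio Chatbot update incrementally. The standalone sketch below reproduces that pattern with a fake token generator in place of completion.answer_generator; names such as chat_stub and stub_demo are illustrative only and not part of this commit.

import gradio as gr


def fake_answer_generator():
    # Stand-in for completion.answer_generator: emits the answer one token at a time.
    yield from ["Retrieval", "-", "augmented ", "generation ", "streams ", "like ", "this."]


def chat_stub(chat_history):
    # Same pattern as chat() above: fill in the last bot message token by token and
    # yield the whole history after each token so Gradio re-renders the partial answer.
    chat_history[-1][1] = ""
    for token in fake_answer_generator():
        chat_history[-1][1] += token
        yield chat_history


with gr.Blocks() as stub_demo:
    chatbot = gr.Chatbot()
    question = gr.Textbox()
    question.submit(
        lambda q: [[q, None]], inputs=question, outputs=chatbot
    ).then(chat_stub, inputs=chatbot, outputs=chatbot)

# stub_demo.launch()  # uncomment to try the stripped-down streaming demo
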
cfg.py
ADDED
@@ -0,0 +1,130 @@
import logging
import sys

from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import QuestionAnswerValidator, Validator

from rtd_scraper.scrape_rtd import scrape_rtd

# Set the root logger's level to INFO
logging.basicConfig(level=logging.INFO)


homepage_url = "https://buster.readthedocs.io/"


scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")

# Disable logging for third-party libraries at DEBUG level
for name in logging.root.manager.loggerDict:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)


buster_cfg = BusterConfig(
    validator_cfg={
        "unknown_response_templates": [
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
        ],
        "unknown_threshold": 0.85,
        "embedding_model": "text-embedding-ada-002",
        "use_reranking": True,
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "check_question_prompt": """You are a chatbot answering questions on artificial intelligence.

Your job is to determine whether or not a question is valid, and should be answered.
More general questions are not considered valid, even if you might know the response.
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.

For example:

Q: What is backpropagation?
true

Q: What is the meaning of life?
false

A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
    },
    retriever_cfg={
        "path": "outputs/deeplake_store",
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_model": "text-embedding-ada-002",
    },
    documents_answerer_cfg={
        "no_documents_message": "No documents are available for this question.",
    },
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "title", "source"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If it isn't, simply reply that you cannot answer the question. "
            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "Here is the documentation:\n"
        ),
        "text_after_docs": (
            "REMEMBER:\n"
            "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
            "Here are the rules you must follow:\n"
            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
            "4) Do not refer to the documentation directly, but use the instructions provided within it to answer questions.\n"
            "5) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"
            "For example:\n"
            "What is the meaning of life for a QA bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?\n"
            "Now answer the following question:\n"
        ),
    },
)


def setup_buster(buster_cfg: BusterConfig):
    """initialize buster with a buster_cfg class"""
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer: DocumentAnswerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterJSON(
            tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
        ),
        prompt_formatter=PromptFormatter(
            tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
        ),
        **buster_cfg.documents_answerer_cfg,
    )
    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(
        retriever=retriever, document_answerer=document_answerer, validator=validator
    )
    return buster
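
Note: setup_buster(cfg.buster_cfg) is what app.py calls at import time. As a rough usage sketch outside Gradio (assuming OPENAI_API_KEY is set and the DeepLake store exists at outputs/deeplake_store; importing cfg also re-runs the scrape_rtd call at module level), the pipeline can be exercised like this. The example question comes from the app's examples list.

import cfg
from cfg import setup_buster

# Build the full pipeline (retriever + completer + validator) from buster_cfg.
buster = setup_buster(cfg.buster_cfg)

# Ask a question and stream the answer token by token.
completion = buster.process_input("How can I install the library?")
for token in completion.answer_generator:
    print(token, end="", flush=True)
print()

# matched_documents is a pandas DataFrame (see format_sources in app.py).
if completion.answer_relevant:
    print(completion.matched_documents[["title", "url", "similarity_to_answer"]])
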
requirements.txt
ADDED
@@ -0,0 +1,2 @@
buster-doctalk
scrapy

rtd_scraper/__init__.py
ADDED
File without changes
rtd_scraper/scrape_rtd.py
ADDED
@@ -0,0 +1,71 @@
import logging
import os

from buster.docparser import get_all_documents
from buster.documents_manager import DeepLakeDocumentsManager
from buster.parser import SphinxParser
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings

from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider

# When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
for name in logging.root.manager.loggerDict:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)


def run_spider(homepage_url, save_directory):
    # settings_file_path = 'rtd_scraper.tutorial.settings'  # The path seen from top-level, ie. from cfg.py
    # os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)

    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory)

    # Start the crawling process
    process.start()

    # To stop the crawling process gracefully
    process.stop()


def scrape_rtd(homepage_url, save_directory):
    # Crawl the website using scrapy
    run_spider(homepage_url, save_directory=save_directory)

    # Convert the .html pages into chunks using Buster's SphinxParser
    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])

    # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
    df = get_all_documents(
        root_dir=root_dir,
        base_url=homepage_url,
        parser_cls=SphinxParser,
        min_section_length=100,
        max_section_length=1000,
    )

    # Add the source column
    df["source"] = "readthedocs"

    # # Initialize the DeepLake vector store
    # dm = DeepLakeDocumentsManager(
    #     vector_store_path=os.path.join(save_directory, "deeplake_store"),
    #     overwrite=True,
    #     required_columns=["url", "content", "source", "title"],
    # )
    #
    # # Add all embeddings to the vector store
    # dm.batch_add(
    #     df=df,
    #     batch_size=3000,
    #     min_time_interval=60,
    #     num_workers=32,
    # )
    #


if __name__ == "__main__":
    homepage_url = "https://buster.readthedocs.io/"
    scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")
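
Note: the DeepLake ingestion step above is commented out, while retriever_cfg in cfg.py reads from outputs/deeplake_store. A sketch of re-enabling it, reusing exactly the parameters from the commented block, could look like the following; build_store is a hypothetical helper name, not code shipped in this commit.

import os

from buster.documents_manager import DeepLakeDocumentsManager


def build_store(df, save_directory="outputs/"):
    # Create the DeepLake vector store at the path expected by retriever_cfg in cfg.py.
    dm = DeepLakeDocumentsManager(
        vector_store_path=os.path.join(save_directory, "deeplake_store"),
        overwrite=True,
        required_columns=["url", "content", "source", "title"],
    )

    # Embed and add the chunks produced by get_all_documents() above.
    dm.batch_add(
        df=df,
        batch_size=3000,
        min_time_interval=60,
        num_workers=32,
    )
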
rtd_scraper/scrapy.cfg
ADDED
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = tutorial.settings

[deploy]
#url = http://localhost:6800/
project = tutorial

rtd_scraper/tutorial/__init__.py
ADDED
File without changes
rtd_scraper/tutorial/middlewares.py
ADDED
@@ -0,0 +1,102 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter, is_item
from scrapy import signals


class TutorialSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class TutorialDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
rtd_scraper/tutorial/settings.py
ADDED
@@ -0,0 +1,102 @@
# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy.utils.log import configure_logging

# Disable default Scrapy log settings.
configure_logging(install_root_handler=False)
BOT_NAME = "tutorial"

SPIDER_MODULES = ["rtd_scraper.tutorial.spiders"]
NEWSPIDER_MODULE = "rtd_scraper.tutorial.spiders"

# SPIDER_MODULES = ["tutorial.spiders"]
# NEWSPIDER_MODULE = "tutorial.spiders"

LOG_ENABLED = False
LOG_LEVEL = "INFO"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "tutorial (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "tutorial.middlewares.TutorialSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     "tutorial.middlewares.TutorialDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     "tutorial.pipelines.TutorialPipeline": 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

rtd_scraper/tutorial/spiders/__init__.py
ADDED
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

rtd_scraper/tutorial/spiders/docs_spider.py
ADDED
@@ -0,0 +1,44 @@
import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(self, homepage_url: str, save_dir="crawled_pages", *args, **kwargs):
        super(DocsSpider, self).__init__(*args, **kwargs)

        if not homepage_url.startswith("https://"):
            homepage_url = "https://" + homepage_url

        project: str = homepage_url.split(".")[0].split("https://")[1]
        self.allowed_domains = [f"{project}.readthedocs.io"]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Create a Path from the parsed URL. If it ends with '/', we add 'index.html' as the filename.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
        filepath.parent.mkdir(parents=True, exist_ok=True)

        print(f"{filepath=}")
        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, self.parse)
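
Note: to make the file layout produced by parse() concrete, here is a small standalone check of the trailing-slash case; the URL is just an example, and "outputs" mirrors the save_directory that scrape_rtd.py passes in as save_dir.

from pathlib import Path
from urllib.parse import urlparse

parsed_uri = urlparse("https://buster.readthedocs.io/en/latest/")
base_dir = Path("outputs")

# Same construction parse() uses for URLs ending in "/".
filepath = base_dir / parsed_uri.netloc / parsed_uri.path.strip("/") / "index.html"
print(filepath)  # outputs/buster.readthedocs.io/en/latest/index.html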