Commit · ac493ec
0 Parent(s)

First commit

Files changed:
- .gitignore +2 -0
- app.py +161 -0
- cfg.py +130 -0
- requirements.txt +2 -0
- rtd_scraper/__init__.py +0 -0
- rtd_scraper/scrape_rtd.py +71 -0
- rtd_scraper/scrapy.cfg +11 -0
- rtd_scraper/tutorial/__init__.py +0 -0
- rtd_scraper/tutorial/middlewares.py +102 -0
- rtd_scraper/tutorial/settings.py +102 -0
- rtd_scraper/tutorial/spiders/__init__.py +4 -0
- rtd_scraper/tutorial/spiders/docs_spider.py +44 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
outputs/
__pycache__/

app.py
ADDED
@@ -0,0 +1,161 @@
import logging
import os
from typing import Optional, Tuple

import gradio as gr
import pandas as pd
from buster.completers import Completion
from buster.utils import extract_zip

import cfg
from cfg import setup_buster

# Create a handler to control where log messages go (e.g., console, file)
handler = (
    logging.StreamHandler()
)  # Console output, you can change it to a file handler if needed

# Set the handler's level to INFO
handler.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)

# Check if an openai key is set as an env. variable
if os.getenv("OPENAI_API_KEY") is None:
    print(
        "Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'."
    )

# Typehint for chatbot history
ChatHistory = list[list[Optional[str], Optional[str]]]

buster = setup_buster(cfg.buster_cfg)


def add_user_question(
    user_question: str, chat_history: Optional[ChatHistory] = None
) -> ChatHistory:
    """Adds a user's question to the chat history.

    If no history is provided, the first element of the history will be the user conversation.
    """
    if chat_history is None:
        chat_history = []
    chat_history.append([user_question, None])
    return chat_history


def format_sources(matched_documents: pd.DataFrame) -> str:
    if len(matched_documents) == 0:
        return ""

    matched_documents.similarity_to_answer = (
        matched_documents.similarity_to_answer * 100
    )

    # drop duplicate pages (by title), keep highest ranking ones
    matched_documents = matched_documents.sort_values(
        "similarity_to_answer", ascending=False
    ).drop_duplicates("title", keep="first")

    documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
    document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"

    documents = "\n".join(
        [
            document_template.format(document=document)
            for _, document in matched_documents.iterrows()
        ]
    )
    footnote: str = "I'm a bot 🤖 and not always perfect."

    return documents_answer_template.format(documents=documents, footnote=footnote)


def add_sources(history, completion):
    if completion.answer_relevant:
        formatted_sources = format_sources(completion.matched_documents)
        history.append([None, formatted_sources])

    return history


def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]:
    """Answer a user's question using retrieval augmented generation."""

    # We assume that the question is the user's last interaction
    user_input = chat_history[-1][0]

    # Do retrieval + augmented generation with buster
    completion = buster.process_input(user_input)

    # Stream tokens one at a time to the user
    chat_history[-1][1] = ""
    for token in completion.answer_generator:
        chat_history[-1][1] += token

        yield chat_history, completion


demo = gr.Blocks()
with demo:
    with gr.Row():
        gr.Markdown("<h3><center>RAGTheDocs</center></h3>")

    chatbot = gr.Chatbot()

    with gr.Row():
        question = gr.Textbox(
            label="What's your question?",
            placeholder="Type your question here...",
            lines=1,
        )
        submit = gr.Button(value="Send", variant="secondary")

    examples = gr.Examples(
        examples=[
            "How can I install the library?",
            "How do I deal with noisy data?",
            "How do I deal with noisy data in 2 words?",
        ],
        inputs=question,
    )

    gr.Markdown(
        "This application uses GPT to search the docs for relevant info and answer questions."
    )

    response = gr.State()

    # fmt: off
    submit.click(
        add_user_question,
        inputs=[question],
        outputs=[chatbot]
    ).then(
        chat,
        inputs=[chatbot],
        outputs=[chatbot, response]
    ).then(
        add_sources,
        inputs=[chatbot, response],
        outputs=[chatbot]
    )

    question.submit(
        add_user_question,
        inputs=[question],
        outputs=[chatbot],
    ).then(
        chat,
        inputs=[chatbot],
        outputs=[chatbot, response]
    ).then(
        add_sources,
        inputs=[chatbot, response],
        outputs=[chatbot]
    )
    # fmt: on


demo.queue(concurrency_count=16)
demo.launch(share=False)
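
Note: the chat() handler above streams by yielding the entire chat history each time a token is appended, which is what makes the Gradio Chatbot update incrementally. The standalone sketch below reproduces that pattern with a fake token generator in place of completion.answer_generator; names such as chat_stub and stub_demo are illustrative only and not part of this commit.

import gradio as gr


def fake_answer_generator():
    # Stand-in for completion.answer_generator: emits the answer one token at a time.
    yield from ["Retrieval", "-", "augmented ", "generation ", "streams ", "like ", "this."]


def chat_stub(chat_history):
    # Same pattern as chat() above: fill in the last bot message token by token and
    # yield the whole history after each token so Gradio re-renders the partial answer.
    chat_history[-1][1] = ""
    for token in fake_answer_generator():
        chat_history[-1][1] += token
        yield chat_history


with gr.Blocks() as stub_demo:
    chatbot = gr.Chatbot()
    question = gr.Textbox()
    question.submit(
        lambda q: [[q, None]], inputs=question, outputs=chatbot
    ).then(chat_stub, inputs=chatbot, outputs=chatbot)

# stub_demo.launch()  # uncomment to try the stripped-down streaming demo
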
cfg.py
ADDED
@@ -0,0 +1,130 @@
import logging
import sys

from buster.busterbot import Buster, BusterConfig
from buster.completers import ChatGPTCompleter, DocumentAnswerer
from buster.formatters.documents import DocumentsFormatterJSON
from buster.formatters.prompts import PromptFormatter
from buster.retriever import DeepLakeRetriever, Retriever
from buster.tokenizers import GPTTokenizer
from buster.validators import QuestionAnswerValidator, Validator

from rtd_scraper.scrape_rtd import scrape_rtd

# Set the root logger's level to INFO
logging.basicConfig(level=logging.INFO)


homepage_url = "https://buster.readthedocs.io/"


scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")

# Disable logging for third-party libraries at DEBUG level
for name in logging.root.manager.loggerDict:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)


buster_cfg = BusterConfig(
    validator_cfg={
        "unknown_response_templates": [
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
        ],
        "unknown_threshold": 0.85,
        "embedding_model": "text-embedding-ada-002",
        "use_reranking": True,
        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
        "check_question_prompt": """You are a chatbot answering questions on artificial intelligence.

Your job is to determine whether or not a question is valid, and should be answered.
More general questions are not considered valid, even if you might know the response.
A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.

For example:

Q: What is backpropagation?
true

Q: What is the meaning of life?
false

A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": False,
            "temperature": 0,
        },
    },
    retriever_cfg={
        "path": "outputs/deeplake_store",
        "top_k": 3,
        "thresh": 0.7,
        "max_tokens": 2000,
        "embedding_model": "text-embedding-ada-002",
    },
    documents_answerer_cfg={
        "no_documents_message": "No documents are available for this question.",
    },
    completion_cfg={
        "completion_kwargs": {
            "model": "gpt-3.5-turbo",
            "stream": True,
            "temperature": 0,
        },
    },
    tokenizer_cfg={
        "model_name": "gpt-3.5-turbo",
    },
    documents_formatter_cfg={
        "max_tokens": 3500,
        "columns": ["content", "title", "source"],
    },
    prompt_formatter_cfg={
        "max_tokens": 3500,
        "text_before_docs": (
            "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
            "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
            "If the answer is in the documentation, summarize it in a helpful way to the user. "
            "If it isn't, simply reply that you cannot answer the question. "
            "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
            "Here is the documentation:\n"
        ),
        "text_after_docs": (
            "REMEMBER:\n"
            "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
            "Here are the rules you must follow:\n"
            "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
            "2) Make sure to format your answers in Markdown format, including code block and snippets.\n"
            "3) Do not reference any links, urls or hyperlinks in your answers.\n"
            "4) Do not refer to the documentation directly, but use the instructions provided within it to answer questions.\n"
            "5) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
            "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"
            "For example:\n"
            "What is the meaning of life for a QA bot?\n"
            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?\n"
            "Now answer the following question:\n"
        ),
    },
)


def setup_buster(buster_cfg: BusterConfig):
    """initialize buster with a buster_cfg class"""
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
    document_answerer: DocumentAnswerer = DocumentAnswerer(
        completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
        documents_formatter=DocumentsFormatterJSON(
            tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
        ),
        prompt_formatter=PromptFormatter(
            tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
        ),
        **buster_cfg.documents_answerer_cfg,
    )
    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
    buster: Buster = Buster(
        retriever=retriever, document_answerer=document_answerer, validator=validator
    )
    return buster
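
Note: setup_buster(cfg.buster_cfg) is what app.py calls at import time. As a rough usage sketch outside Gradio (assuming OPENAI_API_KEY is set and the DeepLake store exists at outputs/deeplake_store; importing cfg also re-runs the scrape_rtd call at module level), the pipeline can be exercised like this. The example question comes from the app's examples list.

import cfg
from cfg import setup_buster

# Build the full pipeline (retriever + completer + validator) from buster_cfg.
buster = setup_buster(cfg.buster_cfg)

# Ask a question and stream the answer token by token.
completion = buster.process_input("How can I install the library?")
for token in completion.answer_generator:
    print(token, end="", flush=True)
print()

# matched_documents is a pandas DataFrame (see format_sources in app.py).
if completion.answer_relevant:
    print(completion.matched_documents[["title", "url", "similarity_to_answer"]])
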
requirements.txt
ADDED
@@ -0,0 +1,2 @@
buster-doctalk
scrapy

rtd_scraper/__init__.py
ADDED
File without changes
rtd_scraper/scrape_rtd.py
ADDED
@@ -0,0 +1,71 @@
import logging
import os

from buster.docparser import get_all_documents
from buster.documents_manager import DeepLakeDocumentsManager
from buster.parser import SphinxParser
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings

from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider

# When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
for name in logging.root.manager.loggerDict:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)


def run_spider(homepage_url, save_directory):
    # settings_file_path = 'rtd_scraper.tutorial.settings'  # The path seen from top-level, ie. from cfg.py
    # os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)

    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory)

    # Start the crawling process
    process.start()

    # To stop the crawling process gracefully
    process.stop()


def scrape_rtd(homepage_url, save_directory):
    # Crawl the website using scrapy
    run_spider(homepage_url, save_directory=save_directory)

    # Convert the .html pages into chunks using Buster's SphinxParser
    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])

    # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
    df = get_all_documents(
        root_dir=root_dir,
        base_url=homepage_url,
        parser_cls=SphinxParser,
        min_section_length=100,
        max_section_length=1000,
    )

    # Add the source column
    df["source"] = "readthedocs"

    # # Initialize the DeepLake vector store
    # dm = DeepLakeDocumentsManager(
    #     vector_store_path=os.path.join(save_directory, "deeplake_store"),
    #     overwrite=True,
    #     required_columns=["url", "content", "source", "title"],
    # )
    #
    # # Add all embeddings to the vector store
    # dm.batch_add(
    #     df=df,
    #     batch_size=3000,
    #     min_time_interval=60,
    #     num_workers=32,
    # )
    #


if __name__ == "__main__":
    homepage_url = "https://buster.readthedocs.io/"
    scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")
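
Note: the DeepLake ingestion step above is commented out, while retriever_cfg in cfg.py reads from outputs/deeplake_store. A sketch of re-enabling it, reusing exactly the parameters from the commented block, could look like the following; build_store is a hypothetical helper name, not code shipped in this commit.

import os

from buster.documents_manager import DeepLakeDocumentsManager


def build_store(df, save_directory="outputs/"):
    # Create the DeepLake vector store at the path expected by retriever_cfg in cfg.py.
    dm = DeepLakeDocumentsManager(
        vector_store_path=os.path.join(save_directory, "deeplake_store"),
        overwrite=True,
        required_columns=["url", "content", "source", "title"],
    )

    # Embed and add the chunks produced by get_all_documents() above.
    dm.batch_add(
        df=df,
        batch_size=3000,
        min_time_interval=60,
        num_workers=32,
    )
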
rtd_scraper/scrapy.cfg
ADDED
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = tutorial.settings

[deploy]
#url = http://localhost:6800/
project = tutorial

rtd_scraper/tutorial/__init__.py
ADDED
File without changes
rtd_scraper/tutorial/middlewares.py
ADDED
@@ -0,0 +1,102 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter, is_item
from scrapy import signals


class TutorialSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)


class TutorialDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
rtd_scraper/tutorial/settings.py
ADDED
@@ -0,0 +1,102 @@
# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy.utils.log import configure_logging

# Disable default Scrapy log settings.
configure_logging(install_root_handler=False)
BOT_NAME = "tutorial"

SPIDER_MODULES = ["rtd_scraper.tutorial.spiders"]
NEWSPIDER_MODULE = "rtd_scraper.tutorial.spiders"

# SPIDER_MODULES = ["tutorial.spiders"]
# NEWSPIDER_MODULE = "tutorial.spiders"

LOG_ENABLED = False
LOG_LEVEL = "INFO"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "tutorial (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "tutorial.middlewares.TutorialSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     "tutorial.middlewares.TutorialDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     "tutorial.pipelines.TutorialPipeline": 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

rtd_scraper/tutorial/spiders/__init__.py
ADDED
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

rtd_scraper/tutorial/spiders/docs_spider.py
ADDED
@@ -0,0 +1,44 @@
import logging
from pathlib import Path
from urllib.parse import urlparse

import scrapy

logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)


class DocsSpider(scrapy.Spider):
    name = "docs"

    def __init__(self, homepage_url: str, save_dir="crawled_pages", *args, **kwargs):
        super(DocsSpider, self).__init__(*args, **kwargs)

        if not homepage_url.startswith("https://"):
            homepage_url = "https://" + homepage_url

        project: str = homepage_url.split(".")[0].split("https://")[1]
        self.allowed_domains = [f"{project}.readthedocs.io"]
        self.start_urls = [homepage_url]
        self.base_dir = Path(save_dir)

    def parse(self, response):
        parsed_uri = urlparse(response.url)
        # Create a Path from the parsed URL. If it ends with '/', we add 'index.html' as the filename.
        if parsed_uri.path.endswith("/"):
            filepath = (
                self.base_dir
                / parsed_uri.netloc
                / parsed_uri.path.strip("/")
                / "index.html"
            )
        else:
            filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
        filepath.parent.mkdir(parents=True, exist_ok=True)

        print(f"{filepath=}")
        with open(filepath, "wb") as f:
            f.write(response.body)

        # Follow links to other documentation pages
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, self.parse)
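
Note: to make the file layout produced by parse() concrete, here is a small standalone check of the trailing-slash case; the URL is just an example, and "outputs" mirrors the save_directory that scrape_rtd.py passes in as save_dir.

from pathlib import Path
from urllib.parse import urlparse

parsed_uri = urlparse("https://buster.readthedocs.io/en/latest/")
base_dir = Path("outputs")

# Same construction parse() uses for URLs ending in "/".
filepath = base_dir / parsed_uri.netloc / parsed_uri.path.strip("/") / "index.html"
print(filepath)  # outputs/buster.readthedocs.io/en/latest/index.html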