Stefano Fiorucci committed · Commit 82fe524 · 1 Parent(s): a251941

crawler refactoring

Files changed:
- crawler/README.md (+15, -0)
- crawler/data/.gitkeep (+0, -0)
- crawler/requirements.txt (+1, -1)
- crawler/tpcrawler/tpcrawler/spiders/tpcrawler.py (+32, -30)
crawler/README.md
ADDED
@@ -0,0 +1,15 @@
+# Twin Peaks crawler
+
+This crawler downloads texts and metadata from [Twin Peaks Fandom Wiki](https://twinpeaks.fandom.com/wiki/Twin_Peaks_Wiki). The output format is JSON. The crawler combines [Scrapy](https://github.com/scrapy/scrapy) and [fandom-py](https://github.com/NikolajDanger/fandom-py).
+
+*Several wiki pages are discarded, since they are not related to the Twin Peaks plot and would add noise to the Question Answering index.*
+
+## Installation
+- `pip install -r requirements.txt`
+- copy this folder (if needed, see [stackoverflow](https://stackoverflow.com/questions/7106012/download-a-single-folder-or-directory-from-a-github-repo))
+
+## Usage
+- (if needed, activate the virtual environment)
+- `cd tpcrawler`
+- `scrapy crawl tpcrawler`
+- you can find the downloaded pages in the `data` subfolder
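Not part of the commit, but as a quick check of the output described in the README: a minimal sketch of reading the crawled JSON files, assuming the spider was run from the `tpcrawler` folder so the files landed in `data/`. The `name`, `url`, and `text` fields are the ones the spider writes (see the spider diff below).

```python
# Illustrative only: load the pages downloaded by tpcrawler.
import json
from pathlib import Path

for path in Path("data").glob("*.json"):
    with open(path, encoding="utf-8") as f:
        page = json.load(f)
    # each file holds the fields written by the spider: name, url, text
    print(page["name"], page["url"], len(page["text"]))
```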
crawler/data/.gitkeep
ADDED
File without changes
crawler/requirements.txt
CHANGED
@@ -1,2 +1,2 @@
 fandom-py==0.2.1
-Scrapy==2.
+Scrapy==2.6.1
crawler/tpcrawler/tpcrawler/spiders/tpcrawler.py
CHANGED
@@ -1,14 +1,11 @@
 import scrapy
-from scrapy.utils.response import open_in_browser
 from scrapy.http import TextResponse
 import re
 import fandom
 import json

-article_id_pattern="wgArticleId\"\:([0-9]+)"
-categories_xpath="//div[@class='page-header__categories']/a//text()"
+# Categories unrelated to Twin Peaks plot
+# (they make noise in the index)
 excluded_categories=set("""Twin Peaks (2017) crew
 Actors
 Camera and electrical department

@@ -34,7 +31,13 @@ Decades
 Days
 Production timeline""".split("\n"))

+fandom.set_wiki("Twinpeaks")
+
+article_id_pattern = "wgArticleId\"\:([0-9]+)"
+categories_xpath = "//div[@class='page-header__categories']/a//text()"
+wiki_page_href_xpath = "//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href"
+next_page_href_xpath = "//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href"
+
 class Tpcrawler(scrapy.Spider):
     name = 'tpcrawler'

@@ -43,40 +46,39 @@ class Tpcrawler(scrapy.Spider):
     def parse(self, response):
+        """Start from wiki "all pages" list and open them"""
-        hrefs = response.xpath("//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href").extract()
+        hrefs = response.xpath(wiki_page_href_xpath).extract()
         for href in hrefs:
+            yield scrapy.Request(url=response.urljoin(href),
+                                 callback=self.parse_page, dont_filter=True,
+                                 meta={'name':href.rpartition('/')[-1],
+                                       'url':response.urljoin(href)})

-        next_page = response.xpath("//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href").extract_first()
+        next_page = response.xpath(next_page_href_xpath).extract_first()
         if next_page:
+            yield scrapy.Request(url=response.urljoin(next_page),
+                                 callback=self.parse, dont_filter=True)

     def parse_page(self, response: TextResponse):
+        """
+        Collect all interesting pages IDs
+        and use the Fandom API to crawl them.
+        Save the output in JSON format.
+        """
+
         categories = set(response.xpath(categories_xpath).extract())
+
+        # the wiki page is interesting only if related to plot
+        # (= not contained in excluded categories)
         if len(categories.intersection(excluded_categories))==0:
             name = response.meta['name']
             url = response.meta['url']
             article_id = int(re.findall(article_id_pattern, response.text)[0])

+            # once the ID is found, use the Fandom API to retrieve the clean page text
+            page = fandom.page(pageid = article_id)
+            text = page.plain_text.split('\nAppearances\n')[0]\
+                                  .split('\nReferences\n')[0]
             json_content={'name': name, 'url':url, 'text':text}
             with open(f'./data/{name}.json','w', encoding='utf-8') as fout:
+                json.dump(json_content, fout)
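For reference, and not part of the commit: a standalone sketch of the per-page step that `parse_page` performs, reusing the same fandom-py calls and regex that appear in the diff above. The HTML snippet and the resulting page ID are hypothetical placeholders, and the sketch assumes network access to the Fandom API.

```python
# Illustrative sketch of parse_page's core logic, outside Scrapy.
import re

import fandom

article_id_pattern = "wgArticleId\"\:([0-9]+)"

fandom.set_wiki("Twinpeaks")

# In the spider the HTML comes from the Scrapy response; this string is a
# hypothetical stand-in containing the embedded article id.
sample_html = '<script>... "wgArticleId":12345, ...</script>'
article_id = int(re.findall(article_id_pattern, sample_html)[0])

# Fetch the clean page text through the Fandom API and trim the trailing
# "Appearances"/"References" sections, as the spider does.
page = fandom.page(pageid=article_id)
text = page.plain_text.split('\nAppearances\n')[0].split('\nReferences\n')[0]
print(text[:200])
```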