Stefano Fiorucci committed on
Commit
82fe524
1 Parent(s): a251941

crawler refactoring

crawler/README.md ADDED
@@ -0,0 +1,15 @@
+# Twin Peaks crawler
+
+This crawler downloads texts and metadata from the [Twin Peaks Fandom Wiki](https://twinpeaks.fandom.com/wiki/Twin_Peaks_Wiki). The output format is JSON. The crawler is based on the combination of [Scrapy](https://github.com/scrapy/scrapy) and [fandom-py](https://github.com/NikolajDanger/fandom-py).
+
+*Several wiki pages are discarded, since they are not related to the Twin Peaks plot and would create noise in the Question Answering index.*
+
+## Installation
+- `pip install -r requirements.txt`
+- copy this folder (if needed, see [stackoverflow](https://stackoverflow.com/questions/7106012/download-a-single-folder-or-directory-from-a-github-repo))
+
+## Usage
+- (if needed, activate the virtual environment)
+- `cd tpcrawler`
+- `scrapy crawl tpcrawler`
+- you can find the downloaded pages in the `data` subfolder (see the example below)
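Each crawled page is written to the `data` folder as a small JSON file with the keys produced by the spider in this commit (`name`, `url`, `text`). A minimal sketch of how to inspect one of these files, assuming a hypothetical page saved as `Laura_Palmer.json` (the file name and values are illustrative):

```python
import json

# Hypothetical file name: any page crawled into the data folder works the same way.
with open("data/Laura_Palmer.json", encoding="utf-8") as f:
    page = json.load(f)

print(page["name"])  # wiki page name, derived from the URL slug
print(page["url"])   # full URL of the crawled wiki page
print(page["text"])  # plain text, cut before the "Appearances"/"References" sections
```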
crawler/data/.gitkeep ADDED
File without changes
crawler/requirements.txt CHANGED
@@ -1,2 +1,2 @@
 fandom-py==0.2.1
-Scrapy==2.5.1
+Scrapy==2.6.1
crawler/tpcrawler/tpcrawler/spiders/tpcrawler.py CHANGED
@@ -1,14 +1,11 @@
 import scrapy
-from scrapy.utils.response import open_in_browser
 from scrapy.http import TextResponse
 import re
 import fandom
 import json
 
-fandom.set_wiki("Twinpeaks")
-
-article_id_pattern="wgArticleId\"\:([0-9]+)"
-categories_xpath="//div[@class='page-header__categories']/a//text()"
+# Categories unrelated to Twin Peaks plot
+# (they make noise in the index)
 excluded_categories=set("""Twin Peaks (2017) crew
 Actors
 Camera and electrical department
@@ -34,7 +31,13 @@ Decades
 Days
 Production timeline""".split("\n"))
 
-#print(excluded_categories)
+fandom.set_wiki("Twinpeaks")
+
+article_id_pattern = "wgArticleId\"\:([0-9]+)"
+categories_xpath = "//div[@class='page-header__categories']/a//text()"
+wiki_page_href_xpath = "//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href"
+next_page_href_xpath = "//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href"
+
 
 class Tpcrawler(scrapy.Spider):
     name = 'tpcrawler'
@@ -43,40 +46,39 @@ class Tpcrawler(scrapy.Spider):
 
 
     def parse(self, response):
-        #open_in_browser(response)
-
-        hrefs = response.xpath("//ul[@class='mw-allpages-chunk']/li/a[not(contains(@class, 'redirect'))]/@href").extract()
+        """Start from wiki "all pages" list and open them"""
+        hrefs = response.xpath(wiki_page_href_xpath).extract()
         for href in hrefs:
-            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_page, dont_filter=True,
-                meta={'name':href.rpartition('/')[-1],
-                'url':response.urljoin(href)})
-
-        next_page = response.xpath("//div[@class='mw-allpages-nav']/a[contains(.,'Next page')]/@href").extract_first()
+            yield scrapy.Request(url=response.urljoin(href),
+                                 callback=self.parse_page, dont_filter=True,
+                                 meta={'name':href.rpartition('/')[-1],
+                                       'url':response.urljoin(href)})
 
+        next_page = response.xpath(next_page_href_xpath).extract_first()
         if next_page:
-            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse, dont_filter=True)
+            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse, dont_filter=True)
 
     def parse_page(self, response: TextResponse):
+        """
+        Collect all interesting pages IDs
+        and use the Fandom API to crawl them.
+        Save the output in JSON format.
+        """
+
         categories = set(response.xpath(categories_xpath).extract())
-        #print(categories)
+
+        # the wiki page is interesting only if related to plot
+        # (= not contained in excluded categories)
         if len(categories.intersection(excluded_categories))==0:
             name = response.meta['name']
             url = response.meta['url']
             article_id = int(re.findall(article_id_pattern, response.text)[0])
 
-            # once the id is found, use the fandom API to retrieve only the entry text
-            page = fandom.page(pageid = article_id)
-
-            text = page.plain_text.split('\nAppearances\n')[0].split('\nReferences\n')[0]
-
+            # once the ID is found, use the Fandom API to retrieve the clean page text
+            page = fandom.page(pageid = article_id)
+            text = page.plain_text.split('\nAppearances\n')[0]\
+                       .split('\nReferences\n')[0]
             json_content={'name': name, 'url':url, 'text':text}
-
             with open(f'./data/{name}.json','w', encoding='utf-8') as fout:
-                json.dump(json_content, fout)
-
-
-
-
-
-
-
+                json.dump(json_content, fout)
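A note on the refactored spider's core handoff: `parse_page` pulls the numeric `wgArticleId` out of the raw HTML with a regex, then passes that id to fandom-py to fetch the clean article text. A standalone sketch of the same steps, reusing the pattern and fandom-py calls from the code above (the HTML fragment and page id are made up for illustration, so running it verbatim will fetch whatever article happens to have that id):

```python
import re
import fandom

article_id_pattern = "wgArticleId\"\:([0-9]+)"

# Made-up fragment of the JS config block embedded in a Fandom page's HTML;
# the spider applies the same regex to the full Scrapy response body.
html = 'mw.config.set({"wgArticleId":2168,"wgPageName":"Example_page"});'
article_id = int(re.findall(article_id_pattern, html)[0])  # -> 2168

# Same fandom-py calls as the spider: select the wiki, fetch the page by id,
# and keep only the text before the "Appearances"/"References" sections.
fandom.set_wiki("Twinpeaks")
page = fandom.page(pageid=article_id)
text = page.plain_text.split('\nAppearances\n')[0].split('\nReferences\n')[0]
print(text[:300])
```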