File size: 1,514 Bytes
59b8c1b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import urllib.request
import re
import json
import urllib.parse
from urllib.parse import urlsplit, quote
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

# Crawl the Korean Pokémon Fandom wiki starting from the page below. For each
# page, record the Pokémon's name and the entries of its evolution table, then
# move on to the page linked last in the page's first table (treated by this
# script as the "next" entry). Results are written to pokemon_evolve.json.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'

# The start URL contains non-ASCII characters in its path; percent-encode the
# path so urllib can issue the request.
url_info = urlsplit(url)
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'

base_url = "https://pokemon.fandom.com"
request_timeout = 30  # seconds; keeps a stalled connection from hanging the run
# Compiled once: inline style used to locate the evolution table on each page.
evol_table_style = re.compile("^margin:auto; text-align:center;")

info = []
target_number = 1017

# At most `target_number` pages are fetched; the loop stops earlier as soon as
# the page whose index label matches the target number has been recorded.
for _ in tqdm(range(target_number)):
    req = Request(encoded_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically once the body has been read.
    with urlopen(req, timeout=request_timeout) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    name = soup.find("div", {"class": "name-ko"}).text.strip()
    number = soup.find("div", {"class": "index"}).text.strip()

    evol_tables = soup.find("table", style=evol_table_style)
    info.append(dict(name=name, evolve=[e.span.text for e in evol_tables.find_all("table")]))

    # Stop before looking up the next link: the final page does not need one.
    if number == f"No.{target_number:04d}":
        break

    next_monster = soup.find("table").find_all("a")[-1]['href']
    # urljoin handles both site-relative and absolute hrefs.
    encoded_url = urllib.parse.urljoin(base_url, next_monster)

# ensure_ascii=False writes the Korean text as-is, so the file encoding must be
# explicit rather than left to the platform's locale default.
with open('pokemon_evolve.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)