math-quiz / make_evolve_dataset.py
Yoon-gu Hwang
포켓몬 진화 데이터 추가
59b8c1b
raw
history blame
No virus
1.51 kB
import urllib.request
import re
import json
import urllib.parse
from urllib.parse import urlsplit, quote
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
# Crawl the Korean Pokémon Fandom wiki one page at a time, starting from the
# page below and following each page's link to the next entry, collecting the
# evolution-chain names shown on every page. The result is written to
# pokemon_evolve.json as a list of {"name": ..., "evolve": [...]} objects.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'
# Percent-encode only the path component, which contains non-ASCII (Korean)
# characters; scheme and host are reused unchanged.
url_info = urlsplit(url)
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'

info = []
# Index of the last entry to scrape; also the upper bound on pages fetched.
target_number = 1017
# Loop-invariant pattern matching the inline style of the evolution table.
evol_table_style = re.compile("^margin:auto; text-align:center;")

for _ in tqdm(range(target_number)):
    req = Request(encoded_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically instead of leaking one
    # connection per page.
    with urlopen(req) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    name = soup.find("div", {"class": "name-ko"}).text.strip()
    number = soup.find("div", {"class": "index"}).text.strip()

    # find() returns None when no table carries the expected style; record an
    # empty chain for that page rather than aborting the whole crawl.
    # NOTE(review): presumably such pages are entries with no evolution table
    # -- confirm against the site.
    evol_table = soup.find("table", style=evol_table_style)
    if evol_table is None:
        evolve = []
    else:
        # Each nested table is one stage; skip any that has no <span> label.
        evolve = [
            stage.span.text
            for stage in evol_table.find_all("table")
            if stage.span is not None
        ]
    info.append(dict(name=name, evolve=evolve))

    # Stop before resolving the next link so that a final page without one
    # cannot raise after its data has already been collected.
    if number == f"No.{target_number:04d}":
        break

    # The last anchor inside the first table on the page is used as the link
    # to the next entry (assumes the page's navigation table comes first).
    next_monster = soup.find("table").find_all("a")[-1]['href']
    encoded_url = "https://pokemon.fandom.com" + next_monster

# ensure_ascii=False emits raw Korean text, so the file encoding must be
# explicit instead of depending on the platform's locale default.
with open('pokemon_evolve.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)