"""Scrape Pokémon names and evolution chains from the Korean Pokémon Fandom wiki.

Starting at ``url``, the script downloads one wiki page per iteration, records
the page's name and the entries of its evolution table, then follows the last
link of the page's first table (treated here as the link to the next page).
It stops when the page index equals ``No.<target_number>`` or after
``target_number`` pages, and writes the collected records to
``pokemon_evolve.json``.
"""
import urllib.request
import re
import json
import urllib.parse
from urllib.parse import urlsplit, quote
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

# Start page. The path contains non-ASCII characters, so it must be
# percent-encoded before urllib can request it.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'
url_info = urlsplit(url)
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'

info = []   # one dict(name=..., evolve=[...]) per scraped page
erros = []  # pages on which no evolution table was found
target_number = 1017

# Loop invariants, built once instead of on every page.
last_index = f"No.{target_number:04d}"
evolve_table_style = re.compile("^margin:auto; text-align:center;")

for cnt in tqdm(range(1, target_number + 1)):
    req = Request(encoded_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the body is read; the crawl issues
    # on the order of a thousand requests.
    with urlopen(req) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    name = soup.find("div", {"class": "name-ko"}).text.strip()
    number = soup.find("div", {"class": "index"}).text.strip()

    evol_tables = soup.find("table", style=evolve_table_style)
    if evol_tables is None:
        # Record the page instead of crashing and losing everything that was
        # scraped so far (results are only written after the loop).
        erros.append(dict(name=name, url=encoded_url))
        evolve = []
    else:
        evolve = [e.span.text for e in evol_tables.find_all("table")]
    info.append(dict(name=name, evolve=evolve))

    # Check the stop conditions before resolving the next link, so the final
    # page is not required to contain a usable "next" link.
    if number == last_index or cnt >= target_number:
        break

    next_monster = soup.find("table").find_all("a")[-1]['href']
    encoded_url = "https://pokemon.fandom.com" + next_monster

# ensure_ascii=False emits the Korean text verbatim, so the file encoding must
# be pinned to UTF-8 rather than left to the platform's locale default.
with open('pokemon_evolve.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)