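"""Scraper for the music4dance.net song library and dance catalogue.

Builds pandas DataFrames from the JSON blobs embedded in the site's pages.
"""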
import requests
from bs4 import BeautifulSoup as bs
import json
import argparse
from pathlib import Path
import os
import pandas as pd
import re
from tqdm import tqdm
def scrape_song_library(page_count=2054) -> pd.DataFrame:
    """Scrape `page_count` pages of the music4dance song library into a DataFrame."""
    columns = [
        "Title",
        "Artist",
        "Length",
        "Tempo",
        "Beat",
        "Energy",
        "Danceability",
        "Valence",
        "Sample",
        "Tags",
        "DanceRating",
    ]
    song_df = pd.DataFrame(columns=columns)
    for i in tqdm(range(1, page_count + 1), desc="Pages processed"):
        link = "https://www.music4dance.net/song/Index?filter=v2-Index&page=" + str(i)
        page = requests.get(link)
        soup = bs(page.content, "html.parser")
        songs = pd.DataFrame(get_songs(soup))
        song_df = pd.concat([song_df, songs], axis=0, ignore_index=True)
    return song_df
def get_songs(soup: bs) -> list:
    """Parse the embedded JSON song histories from a library page into a list of song dicts."""
    js_obj = re.compile(r"{(.|\n)*}")
    reset_keys = [
        "Title",
        "Artist",
        "Length",
        "Tempo",
        "Beat",
        "Energy",
        "Danceability",
        "Valence",
        "Sample",
    ]
    # The song data lives in a <script> tag as a JSON object containing "histories".
    song_text = [str(v) for v in soup.find_all("script") if "histories" in str(v)][0]
    songs_data = json.loads(js_obj.search(song_text).group(0))
    songs = []
    for song_data in songs_data["histories"]:
        song = {"Tags": set(), "DanceRating": {}}
        for feature in song_data["properties"]:
            if "name" not in feature or "value" not in feature:
                continue
            key = feature["name"]
            value = feature["value"]
            if key in reset_keys:
                # Scalar properties simply overwrite any earlier value.
                song[key] = value
            elif key == "Tag+":
                song["Tags"].add(value)
            elif key == "DeleteTag":
                # discard() ignores deletions of tags that were never added.
                song["Tags"].discard(value)
            elif key == "DanceRating":
                # Entries like "Waltz+1" increment the vote count for that dance.
                dance = value.replace("+1", "")
                prev = song["DanceRating"].get(dance, 0)
                song["DanceRating"][dance] = prev + 1
        songs.append(song)
    return songs
def scrape_dance_info() -> pd.DataFrame:
    """Scrape the dance catalogue (name, id, synonyms, tempo range, song count) into a DataFrame."""
    js_obj = re.compile(r"{(.|\n)*}")
    link = "https://www.music4dance.net/song/Index?filter=v2-Index"
    page = requests.get(link)
    soup = bs(page.content, "html.parser")
    # The dance catalogue lives in a <script> tag as a JSON object containing "environment".
    dance_info_text = [str(v) for v in soup.find_all("script") if "environment" in str(v)][0]
    dance_info = json.loads(js_obj.search(dance_info_text).group(0))
    dance_info = dance_info["dances"]
    wanted_keys = ["name", "id", "synonyms", "tempoRange", "songCount"]
    dance_df = pd.DataFrame(
        [{k: v for k, v in dance.items() if k in wanted_keys} for dance in dance_info]
    )
    return dance_df
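# scrape_dance_info() is not wired into the CLI entry point below; a minimal
# sketch of calling it separately (the output path here is illustrative, not
# part of the original script):
#   dance_df = scrape_dance_info()
#   dance_df.to_csv("data/dance_info.csv")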
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--page-count", default=2, type=int)
    parser.add_argument("--out", default="data/song.csv")
    args = parser.parse_args()
    out_path = Path(args.out)
    out_dir = os.path.dirname(out_path)
    # Bail out early instead of failing later in to_csv() when the directory is missing.
    if out_dir and not os.path.exists(out_dir):
        print(f"Output location does not exist: {out_dir}")
        raise SystemExit(1)
    df = scrape_song_library(args.page_count)
    df.to_csv(out_path)
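# Example invocation (the filename scrape_songs.py is illustrative, not part of
# the original source):
#   python scrape_songs.py --page-count 5 --out data/song.csv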