File size: 3,236 Bytes
c914273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import requests
from bs4 import BeautifulSoup as bs
import json
import argparse
from pathlib import Path
import os
import pandas as pd
import re
from tqdm import tqdm




def scrape_song_library(page_count=2054) -> pd.DataFrame:
    """Scrape the music4dance song index and return one row per song.

    Parameters
    ----------
    page_count : int
        Number of index pages to fetch (the full site is ~2054 pages).

    Returns
    -------
    pd.DataFrame
        One row per song with the columns below; ``Tags`` holds a set and
        ``DanceRating`` a dict of dance-code -> vote count (see get_songs).
    """
    columns = [
        "Title",
        "Artist",
        "Length",
        "Tempo",
        "Beat",
        "Energy",
        "Danceability",
        "Valence",
        "Sample",
        "Tags",
        "DanceRating",
    ]
    # Collect one frame per page and concatenate once at the end: calling
    # pd.concat inside the loop copies all accumulated rows every iteration
    # (quadratic in total row count). The empty seed frame fixes column order.
    frames = [pd.DataFrame(columns=columns)]
    for page_num in tqdm(range(1, page_count + 1), desc="Pages processed"):
        url = f"https://www.music4dance.net/song/Index?filter=v2-Index&page={page_num}"
        # timeout so a stalled server cannot hang the whole scrape forever
        page = requests.get(url, timeout=30)
        soup = bs(page.content, "html.parser")
        frames.append(pd.DataFrame(get_songs(soup)))
    return pd.concat(frames, axis=0, ignore_index=True)


def get_songs(soup: "bs") -> list:
    """Parse one index page's embedded JSON blob into a list of song dicts.

    The page embeds its data as a JS object literal inside the <script>
    tag whose text mentions "histories". Each returned dict carries the
    scalar fields below plus a "Tags" set and a "DanceRating" dict mapping
    dance code -> accumulated "+1" vote count.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one song-index page.

    Returns
    -------
    list of dict
        One dict per song. (The original annotation said ``dict``, but the
        function has always returned a list.)
    """
    # Greedy match from the first "{" to the last "}" of the script text;
    # re.S lets "." span newlines (equivalent to the original (.|\n)* idiom).
    js_obj = re.compile(r"{.*}", re.S)
    scalar_keys = [
        "Title",
        "Artist",
        "Length",
        "Tempo",
        "Beat",
        "Energy",
        "Danceability",
        "Valence",
        "Sample",
    ]
    song_text = [str(v) for v in soup.find_all("script") if "histories" in str(v)][0]
    songs_data = json.loads(js_obj.search(song_text).group(0))
    songs = []
    for song_data in songs_data["histories"]:
        song = {"Tags": set(), "DanceRating": {}}
        for feature in song_data["properties"]:
            # Properties are an edit history; entries missing name/value
            # carry no data and are skipped.
            if "name" not in feature or "value" not in feature:
                continue
            key = feature["name"]
            value = feature["value"]
            if key in scalar_keys:
                # Later history entries overwrite earlier values.
                song[key] = value
            elif key == "Tag+":
                song["Tags"].add(value)
            elif key == "DeleteTag":
                # discard() is a no-op when the tag was never added —
                # replaces the original bare `except:` which silently
                # swallowed every exception type.
                song["Tags"].discard(value)
            elif key == "DanceRating":
                dance = value.replace("+1", "")
                song["DanceRating"][dance] = song["DanceRating"].get(dance, 0) + 1
        songs.append(song)
    return songs



def scrape_dance_info() -> pd.DataFrame:
    """Scrape per-dance metadata from the music4dance song index page.

    Returns
    -------
    pd.DataFrame
        One row per dance with the columns name, id, synonyms,
        tempoRange and songCount.
    """
    # Greedy first-"{"-to-last-"}" extraction of the embedded JS object.
    js_obj = re.compile(r"{(.|\n)*}")
    link = "https://www.music4dance.net/song/Index?filter=v2-Index"
    # timeout so a stalled server cannot hang the call forever
    page = requests.get(link, timeout=30)
    soup = bs(page.content, "html.parser")

    # The dance catalogue lives in the <script> tag mentioning "environment".
    dance_info_text = [str(v) for v in soup.find_all("script") if "environment" in str(v)][0]
    dances = json.loads(js_obj.search(dance_info_text).group(0))["dances"]
    wanted_keys = {"name", "id", "synonyms", "tempoRange", "songCount"}
    return pd.DataFrame(
        [{k: v for k, v in dance.items() if k in wanted_keys} for dance in dances]
    )



if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape the music4dance song library to CSV.")
    parser.add_argument("--page-count", default=2, type=int,
                        help="Number of index pages to scrape.")
    parser.add_argument("--out", default="data/song.csv",
                        help="Destination CSV path.")

    args = parser.parse_args()
    out_path = Path(args.out)
    # Create the output directory up front. The original only printed a
    # warning when it was missing and then crashed anyway in to_csv();
    # it also warned spuriously for bare filenames (dirname == "").
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df = scrape_song_library(args.page_count)
    df.to_csv(out_path)