File size: 2,747 Bytes
c25690f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import shutil
import pandas as pd
from vidfetch import compress_folder, pull_from_hf


def download_video_links(hf_token: str, filename: str, save_dir: str) -> None:
    """Fetch one file from the ``OpenVideo/Panda-70M-Original-Links`` repo.

    Args:
        hf_token: Token forwarded to ``pull_from_hf`` as ``hf_token``.
        filename: Name of the file inside the repository to fetch.
        save_dir: Local directory handed to ``pull_from_hf``; it is created
            (including parents) when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_dir, exist_ok=True)

    # download
    pull_from_hf(
        hf_token=hf_token,
        hf_repo_id="OpenVideo/Panda-70M-Original-Links",
        filename=filename,
        save_dir=save_dir,
    )
    

def download_videos_by_csv(
    csv_file_path: str,
    save_dir: str,
    targz_filename: str,
) -> None:
    """Download the videos listed in a CSV file and pack them into a tar.gz.

    The videos are stored in ``<save_dir>/<name>/download_raw`` where
    ``<name>`` is ``targz_filename`` without its ``.tar.gz`` suffix. A copy of
    the CSV and a ``log.txt`` (failed links and deleted oversized files) are
    placed in the same directory, which is finally passed to
    ``compress_folder`` together with ``<save_dir>/<targz_filename>``.

    Args:
        csv_file_path: CSV file with a ``url`` and a ``videoID`` column.
        save_dir: Root directory for the download folder and the archive.
        targz_filename: File name of the archive to create.

    Raises:
        ModuleNotFoundError: If ``youtube_dl`` cannot be imported.
    """
    try:
        import youtube_dl
    except ImportError as err:
        raise ModuleNotFoundError(
            "youtube_dl missed, please install it by ``vidfetch.package.youtube.youtube_dl_install_helper``"
        ) from err

    # Files above this size (in megabytes) are removed before compression.
    max_size_mb = 500

    # path/dir
    folder_name = targz_filename.replace(".tar.gz", "")
    download_videos_dir = os.path.join(save_dir, folder_name, "download_raw")
    log_path = os.path.join(download_videos_dir, "log.txt")
    targz_path = os.path.join(save_dir, targz_filename)

    # make dirs
    os.makedirs(download_videos_dir, exist_ok=True)

    # read from csv (a copy of the csv is kept next to the videos)
    csv_filename = os.path.basename(csv_file_path)
    shutil.copy(src=csv_file_path, dst=os.path.join(download_videos_dir, csv_filename))
    data = pd.read_csv(csv_file_path)
    links = data["url"].tolist()
    videos_id = data["videoID"].tolist()

    failed_links = []  # record failed links
    for link, video_id in zip(links, videos_id):
        # NOTE(review): the first character of videoID is dropped when building
        # the file name - presumably a prefix in the CSV; confirm.
        video_save_path = os.path.join(download_videos_dir, video_id[1:] + ".mp4")

        # check if downloaded
        if os.path.exists(video_save_path):
            continue

        # download
        ydl_opts = {
            'format': 'best',
            'quiet': False,
            'outtmpl': video_save_path,
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([link])
            except Exception:
                # Best effort: remember the link and go on. Catching Exception
                # (not a bare except) lets Ctrl-C still abort the run.
                failed_links.append(link)

    # delete files larger than max_size_mb
    delete_videos = []
    for entry in os.listdir(download_videos_dir):
        file_path = os.path.join(download_videos_dir, entry)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)  # bytes -> megabytes
        if file_size_mb > max_size_mb:
            delete_videos.append(file_path)
            os.remove(file_path)

    # Write to log file: one section per heading, one entry per line.
    log_lines = ['Fail to download']
    log_lines.extend(str(link) for link in failed_links)
    log_lines.append(f'Delete videos larger than {max_size_mb}MB')
    log_lines.extend(delete_videos)
    with open(log_path, 'w') as file:
        file.write('\n'.join(log_lines) + '\n')

    compress_folder(download_videos_dir, targz_path)