File size: 4,266 Bytes
14e4843
 
034968f
 
 
84f0fa3
034968f
 
84f0fa3
14e4843
 
 
 
d6d7ec6
 
 
14e4843
3237d78
 
14e4843
d6d7ec6
14e4843
 
 
 
 
d6d7ec6
 
14e4843
 
 
 
 
 
 
d6d7ec6
14e4843
d6d7ec6
14e4843
 
034968f
 
 
 
 
 
 
 
 
 
 
84f0fa3
034968f
84f0fa3
034968f
 
84f0fa3
 
 
 
 
 
 
034968f
 
84f0fa3
034968f
 
 
 
 
84f0fa3
 
 
 
034968f
 
 
 
84f0fa3
 
034968f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84f0fa3
034968f
 
 
84f0fa3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
034968f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import pandas as pd
from huggingface_hub import snapshot_download
import subprocess
import re
# The GPU_* names are the dictionary keys used for GPU statistics below.
# Try the ``src.``-prefixed module path first, then fall back to the bare
# ``display.utils`` path (reported as "local debug").
try:
    from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
except ImportError:
    # Only an import failure should trigger the fallback; a bare ``except``
    # would also swallow KeyboardInterrupt/SystemExit and unrelated errors.
    print("local debug: from display.utils")
    from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name

def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
    """Download a repository snapshot, retrying on failure.

    Calls ``snapshot_download`` with the given arguments up to 10 times,
    waiting 60 seconds between attempts. This is best-effort: if every
    attempt fails, the last error is printed and the function returns
    without raising.

    Args:
        repo_id: Forwarded to ``snapshot_download`` as ``repo_id``.
        revision: Forwarded to ``snapshot_download`` as ``revision``.
        local_dir: Forwarded to ``snapshot_download`` as ``local_dir``.
        repo_type: Forwarded to ``snapshot_download`` as ``repo_type``.
        max_workers: Forwarded to ``snapshot_download`` as ``max_workers``.

    Returns:
        None, whether or not the download eventually succeeded.
    """
    import time

    max_attempts = 10
    retry_delay_seconds = 60

    for attempt in range(1, max_attempts + 1):
        try:
            snapshot_download(
                repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers
            )
            return
        except Exception as e:
            if attempt == max_attempts:
                # No further attempt follows, so do not claim a retry and do
                # not block the caller for another minute.
                print(
                    f"Failed to download {repo_id} at {revision} with error: {e}. "
                    f"Giving up after {max_attempts} attempts."
                )
                return
            print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
            time.sleep(retry_delay_seconds)


def get_dataset_url(row):
    """Build an HTML anchor that shows a benchmark name and links to its dataset.

    Args:
        row: Mapping-like object providing the "Benchmark" and
            "Dataset Link" entries.

    Returns:
        A string holding an ``<a>`` tag that opens the dataset link in a new
        tab and displays the benchmark name.
    """
    name, url = row["Benchmark"], row["Dataset Link"]
    return f'<a target="_blank" href="{url}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{name}</a>'


def get_dataset_summary_table(file_path):
    """Load the dataset summary CSV and prepare it for display.

    The "Benchmark" column is replaced by the HTML link produced by
    ``get_dataset_url`` for each row, and only the display columns are kept.

    Args:
        file_path: Path (or other source accepted by ``pd.read_csv``) of the
            CSV file.

    Returns:
        A DataFrame with the columns "Category", "Benchmark", "Data Split",
        "Data Size" and "Language", in that order.
    """
    display_columns = ["Category", "Benchmark", "Data Split", "Data Size", "Language"]

    table = pd.read_csv(file_path)
    # Row-wise apply: each row supplies both the name and the link.
    table["Benchmark"] = table.apply(get_dataset_url, axis=1)
    return table[display_columns]

# Matches a status line of the ``nvidia-smi`` table and captures, in order:
# temperature (C), power usage (W), memory usage (MiB) and utilisation (%).
_GPU_INFO_PATTERN = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
# Captures the text after "NVIDIA" up to and including a trailing "<N>GB".
# Names that do not end in "<N>GB" are not captured.
_GPU_NAME_PATTERN = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')


def _aggregate_nvidia_smi_output(output):
    """Aggregate the per-GPU status lines of ``nvidia-smi`` output into one record."""
    gpu_name = ""
    per_gpu = []
    for line in output.split('\n'):
        name_match = _GPU_NAME_PATTERN.search(line)
        if name_match:
            # The last matching name wins; it is used for the combined label.
            gpu_name = name_match.group(1).strip()

        info_match = _GPU_INFO_PATTERN.search(line)
        if info_match:
            per_gpu.append(tuple(map(int, info_match.groups())))

    if not per_gpu:
        # Nothing to average: avoid dividing by zero and let callers treat
        # the empty list as "no sample".
        return []

    gpu_count = len(per_gpu)
    temps, powers, mems, utils = zip(*per_gpu)
    return [{
        GPU_TEMP: sum(temps) / gpu_count,
        GPU_Power: sum(powers) / gpu_count,
        # Memory is summed across GPUs, not averaged.
        GPU_Mem: sum(mems),
        GPU_Util: sum(utils) / gpu_count,
        GPU_Name: f"{gpu_count}x{gpu_name}",
    }]


def parse_nvidia_smi():
    """Run ``nvidia-smi`` and summarise the GPUs it reports.

    Returns:
        A list with a single dict keyed by ``GPU_TEMP``, ``GPU_Power``,
        ``GPU_Mem``, ``GPU_Util`` and ``GPU_Name``. Temperature, power and
        utilisation are averaged over the detected GPUs, memory usage is
        summed, and the name is "<count>x<name>". An empty list is returned
        when ``nvidia-smi`` cannot be executed or no GPU status line is found
        in its output.
    """
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
    except OSError:
        # Raised when the executable is missing or cannot be started.
        return []

    return _aggregate_nvidia_smi_output(result.stdout.strip())

def monitor_gpus(stop_event, interval, stats_list):
    """Sample GPU statistics repeatedly until ``stop_event`` is set.

    Args:
        stop_event: Object exposing ``is_set()`` and ``wait(timeout)``;
            setting it ends the loop.
        interval: Timeout passed to ``stop_event.wait`` between samples.
        stats_list: List that non-empty results of ``parse_nvidia_smi`` are
            appended to in place.
    """
    while True:
        if stop_event.is_set():
            break

        sample = parse_nvidia_smi()
        if sample:
            stats_list.extend(sample)

        # Waiting on the event (instead of sleeping) lets a stop request
        # interrupt the pause.
        stop_event.wait(interval)

def analyze_gpu_stats(stats_list):
    """Condense a list of GPU samples into a single summary dict.

    Every key of the first sample except ``GPU_Mem`` and ``GPU_Name`` is
    averaged over all samples. ``GPU_Mem`` is reported as the maximum seen,
    and ``GPU_Name`` (when the first sample has it) is copied from the first
    sample.

    Args:
        stats_list: List of per-sample dicts sharing the same keys.

    Returns:
        The summary dict, or None when ``stats_list`` is empty.
    """
    if not stats_list:
        return None

    first_sample = stats_list[0]
    sample_count = len(stats_list)

    summary = {}
    for key in first_sample.keys():
        # Memory and name are handled separately below.
        if key == GPU_Mem or key == GPU_Name:
            continue
        summary[key] = sum(sample[key] for sample in stats_list) / sample_count

    peak_memory = max(sample[GPU_Mem] for sample in stats_list)

    if GPU_Name in first_sample:
        summary[GPU_Name] = first_sample[GPU_Name]

    # Memory is stored last, as the peak rather than the mean.
    summary[GPU_Mem] = peak_memory
    return summary

if __name__ == "__main__":
    # Manual check: take one GPU sample and print its summary.
    current_sample = parse_nvidia_smi()
    print(analyze_gpu_stats(current_sample))