File size: 5,703 Bytes
d27cdb4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import os
from datetime import datetime
import json
from huggingface_hub import snapshot_download
from collections import defaultdict
import pandas as pd
import streamlit as st
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# Free-text input widget; its value is not consumed anywhere else in this script.
user_input = st.text_input("Enter your text here:")

# Hub dataset repositories that can be inspected, one per tracked library.
libraries = [
    "open-source-metrics/accelerate-dependents",
    "open-source-metrics/hub-docs-dependents",
    "open-source-metrics/huggingface_hub-dependents",
    "open-source-metrics/evaluate-dependents",
    "open-source-metrics/datasets-dependents",
    "open-source-metrics/pytorch-image-models-dependents",
    "open-source-metrics/tokenizers-dependents",
    "open-source-metrics/transformers-dependents",
    "open-source-metrics/diffusers-dependents",
    "open-source-metrics/gradio-dependents",
    "open-source-metrics/optimum-dependents",
]

# Preselect the transformers dataset so the initial page shows the same data
# as when the repository id was hard-coded in the download call below.
option = st.selectbox(
    'Choose library',
    libraries,
    index=libraries.index("open-source-metrics/transformers-dependents"),
)

# Download the dataset picked in the selectbox; snapshot_download returns the
# local folder that holds the snapshot files.
cached_folder = snapshot_download(option, repo_type="dataset")

# Per-date totals, keyed by a 'YYYY_MM_DD' string and filled by load_json_files().
num_dependents = defaultdict(int)
num_stars_all_dependents = defaultdict(int)

def load_json_files(directory):
    """Record dependent and star totals for every JSON file under *directory*.

    The tree is walked recursively. Each ``.json`` file whose top-level JSON
    object contains both a ``"name"`` and a ``"stars"`` entry contributes one
    data point, stored under a key made of the two innermost folder names and
    the file stem joined by underscores (``.../2023/11/05.json`` becomes
    ``"2023_11_05"``):

    * ``num_dependents[key]`` is set to ``len(data["name"])``
    * ``num_stars_all_dependents[key]`` is set to ``sum(data["stars"])``

    Args:
        directory: Root folder to scan.

    Returns:
        None. The module-level ``num_dependents`` and
        ``num_stars_all_dependents`` mappings are updated in place.
    """
    for subdir, _dirs, files in os.walk(directory):
        # Split on the platform separator (after normalising) instead of a
        # literal "/" so the key is built the same way on every OS, and so
        # dots inside folder names cannot cut the path short.
        parent_parts = os.path.normpath(subdir).split(os.sep)[-2:]
        for file in files:
            if not file.endswith('.json'):
                continue

            stem = os.path.splitext(file)[0]
            date = "_".join([*parent_parts, stem])

            with open(os.path.join(subdir, file), 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Only JSON objects carrying both fields describe a snapshot;
            # anything else (lists, scalars, unrelated objects) is skipped.
            if isinstance(data, dict) and "name" in data and "stars" in data:
                num_dependents[date] = len(data["name"])
                num_stars_all_dependents[date] = sum(data["stars"])

# Fill num_dependents / num_stars_all_dependents from the dataset snapshot folder returned by snapshot_download
load_json_files(cached_folder)

def sort_dict_by_date(d):
    """Return a ``defaultdict(int)`` holding the items of *d* in date order.

    Keys must be ``'%Y_%m_%d'`` strings; they are compared as real dates and
    inserted into the result from oldest to newest.
    """
    def _as_date(key):
        return datetime.strptime(key, '%Y_%m_%d')

    chronological = defaultdict(int)
    for key in sorted(d, key=_as_date):
        chronological[key] = d[key]
    return chronological

def remove_incorrect_entries(data):
    """Keep only the entries that never drop below the last kept value.

    Entries are visited in date order (keys are ``'%Y_%m_%d'`` strings). The
    oldest entry is always kept; each later entry is kept only if its value
    is greater than or equal to the most recently kept one.

    Returns a ``defaultdict(int)`` with the surviving entries in date order.
    """
    chronological_keys = sorted(
        data, key=lambda key: datetime.strptime(key, '%Y_%m_%d')
    )

    kept = defaultdict(int)
    last_kept = None
    for day in chronological_keys:
        count = data[day]
        # Skip anything that falls below the last accepted value.
        if last_kept is not None and not count >= last_kept:
            continue
        kept[day] = count
        last_kept = count

    return kept

def interpolate_missing_dates(data):
    """Fill every missing calendar day between the first and last known date.

    Missing days are linearly interpolated between the nearest known day
    before and after them. All values, known and interpolated, are passed
    through ``int()`` (truncation toward zero) before being returned.

    Args:
        data: Mapping of ``'%Y_%m_%d'`` date strings to numeric values.

    Returns:
        A ``defaultdict(int)`` with one entry per day of the covered range,
        in chronological order. Empty input gives an empty mapping.
    """
    # min()/max() of nothing would raise; an empty series has nothing to fill.
    if not data:
        return defaultdict(int)

    known = {datetime.strptime(key, '%Y_%m_%d'): value for key, value in data.items()}
    known_days = sorted(known)
    one_day = timedelta(days=1)

    filled = {}
    for start, end in zip(known_days, known_days[1:]):
        start_value, end_value = known[start], known[end]
        span_days = (end - start).days
        filled[start] = start_value
        for offset in range(1, span_days):
            # Multiply before dividing and always anchor on the two known
            # points: this avoids float drift such as 49 * (1 / 49) giving
            # 0.999..., which int() would then truncate to the wrong integer.
            filled[start + offset * one_day] = (
                start_value + (end_value - start_value) * offset / span_days
            )
    last_day = known_days[-1]
    filled[last_day] = known[last_day]

    return defaultdict(
        int,
        {day.strftime('%Y_%m_%d'): int(value) for day, value in filled.items()},
    )

# Drop data points that fall below an earlier, already accepted value.
num_dependents = remove_incorrect_entries(num_dependents)
num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)

# Nothing was loaded: tell the user and end this script run instead of
# failing further down on an empty series.
if not num_dependents or not num_stars_all_dependents:
    st.warning("No dependents data was found in the downloaded dataset.")
    st.stop()

# Fill the gaps between known days, then put the entries in date order.
num_dependents = interpolate_missing_dates(num_dependents)
num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)

num_dependents = sort_dict_by_date(num_dependents)
num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)


def _to_daily_frame(values_by_date):
    """Build a daily-indexed DataFrame with a single 'Value' column from a date-keyed mapping."""
    frame = pd.DataFrame(list(values_by_date.items()), columns=['Date', 'Value'])
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y_%m_%d')
    # Re-index on a strict daily frequency; any day still missing becomes NaN
    # and is then filled by pandas' interpolation.
    frame = frame.set_index('Date').resample('D').asfreq()
    frame['Value'] = frame['Value'].interpolate()
    return frame


num_dependents_df = _to_daily_frame(num_dependents)
num_cum_stars_df = _to_daily_frame(num_stars_all_dependents)

# Plotting: draw on an explicit Figure rather than pyplot's global state.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(num_dependents_df.index, num_dependents_df['Value'], marker='o')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Dependents')
ax.set_title('Dependencies History')

# Display in Streamlit, then close the figure so reruns of the script do not
# keep accumulating open matplotlib figures.
st.pyplot(fig)
plt.close(fig)