lysandre (HF staff) committed
Commit 19d66b4
1 Parent(s): 7de9cd4

Preprocessing

Files changed (2):
  1. app.py +23 -152
  2. index.js +4 -4
app.py CHANGED
@@ -15,91 +15,21 @@ HfFolder.save_token(HF_TOKEN)
 
 
 datasets = {
-    "stars": load_dataset("open-source-metrics/stars").sort('dates'),
-    "issues": load_dataset("open-source-metrics/issues").sort('dates'),
+    "stars": load_dataset("open-source-metrics/preprocessed_stars"),
+    "issues": load_dataset("open-source-metrics/preprocessed_issues"),
     "pip": load_dataset("open-source-metrics/pip").sort('day'),
 }
 
 external_datasets = {
-    "stars": load_dataset("open-source-metrics/stars-external").sort('dates'),
-    "issues": load_dataset("open-source-metrics/issues-external").sort('dates'),
     "pip": load_dataset("open-source-metrics/pip-external").sort('day')
 }
 
-val = 0
-
-
-def _range(e):
-    global val
-    e['range'] = val
-    val += 1
-
-    current_date = datetime.strptime(e['dates'], "%Y-%m-%dT%H:%M:%SZ")
-    first_date = datetime.fromtimestamp(1)
-    week = abs(current_date - first_date).days // 7
-    e['week'] = week
-
-    return e
-
-
-def _ignore_org_members(e):
-    global val
-    e['range_non_org'] = val
-
-    if e['type']['authorAssociation'] != 'MEMBER':
-        val += 1
-
-    return e
-
-
-stars = {}
-for k, v in datasets['stars'].items():
-    stars[k] = v.map(_range)
-    val = 0
-
-stars_external = {}
-for k, v in external_datasets['stars'].items():
-    stars_external[k] = v.map(_range)
-    val = 0
-
-
-issues = {}
-for k, v in datasets['issues'].items():
-    issues[k] = v.map(_range)
-    val = 0
-    issues[k] = issues[k].map(_ignore_org_members)
-    val = 0
-
-
-issues_external = {}
-for k, v in external_datasets['issues'].items():
-    issues_external[k] = v.map(_range)
-    val = 0
-    issues_external[k] = issues_external[k].map(_ignore_org_members)
-    val = 0
-
-datasets['stars'] = DatasetDict(**stars)
-datasets['issues'] = DatasetDict(**issues)
-external_datasets['stars'] = DatasetDict(**stars_external)
-external_datasets['issues'] = DatasetDict(**issues_external)
-
-
-def link_values(library_names, returned_values):
-    previous_values = {library_name: None for library_name in library_names}
-    for library_name in library_names:
-        for i in returned_values.keys():
-            if library_name not in returned_values[i]:
-                returned_values[i][library_name] = previous_values[library_name]
-            else:
-                previous_values[library_name] = returned_values[i][library_name]
-
-    return returned_values
-
-
-def running_mean(x, N, total_length=-1):
-    cumsum = np.cumsum(np.insert(x, 0, 0))
-    to_pad = max(total_length - len(cumsum), 0)
-    return np.pad(cumsum[N:] - cumsum[:-N], (to_pad, 0)) / float(N)
+
+def cut_output(full_output: Dataset, library_names: list):
+    output = full_output.to_dict().items()
+    output = {k: v + [None] for k, v in output if k in library_names + ['day']}
+    last_value = max(output[k].index(None) for k in output.keys() if k != 'day')
+    return {k: v[:last_value] for k, v in output.items()}
 
 
 def parse_name_and_options(path):
@@ -152,10 +82,12 @@ class RequestHandler(SimpleHTTPRequestHandler):
             external_dataset_with_most_splits = list(external_dataset_with_most_splits)
             external_dataset_with_most_splits.sort()
 
+            warnings.append("Selecting PyTorch and/or TensorFlow will take a while to compute, and may timeout for issues/PRs..")
+
             res = {
                 'internal': dataset_with_most_splits,
                 'external': external_dataset_with_most_splits,
-                'warnings': warnings
+                'warnings': []
             }
 
             print(f"Returning: {res}")
@@ -215,90 +147,29 @@ class RequestHandler(SimpleHTTPRequestHandler):
             return self.response(output)
 
         if self.path.startswith("/retrieveStars"):
-            errors = []
             library_names, options = parse_name_and_options(self.path)
-            returned_values = {}
-            dataset_dict = datasets['stars']
-            external_dataset_dict = external_datasets['stars']
             week_over_week = '1' in options
 
-            for library_name in library_names:
-                if library_name in dataset_dict:
-                    dataset = dataset_dict[library_name]
-                elif library_name in external_dataset_dict:
-                    dataset = external_dataset_dict[library_name]
-                else:
-                    errors.append(f"No {library_name} found in internal or external datasets for stars.")
-                    return {'errors': errors}
-
-                last_value = 0
-                last_week = dataset[0]['week']
-                for i in dataset:
-                    if week_over_week and last_week == i['week']:
-                        continue
-                    if i['dates'] in returned_values:
-                        returned_values[i['dates']][library_name] = i['range'] - last_value
-                    else:
-                        returned_values[i['dates']] = {library_name: i['range'] - last_value}
-
-                    last_value = i['range'] if week_over_week else 0
-                    last_week = i['week']
-
-            returned_values = collections.OrderedDict(sorted(returned_values.items()))
-            returned_values = link_values(library_names, returned_values)
-            output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
-            output['day'] = list(returned_values.keys())[::-1]
-
-            # Trim down to a smaller number of points.
-            output = {k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0] for k, value in output.items()}
-
-            return self.response(output)
-
+            if week_over_week:
+                return self.response({k: v for k, v in datasets['stars']['wow'].to_dict().items() if k in library_names + ['day']})
+            else:
+                return self.response({k: v for k, v in datasets['stars']['wow'].to_dict().items() if k in library_names + ['day']})
 
         if self.path.startswith("/retrieveIssues"):
-            errors = []
             library_names, options = parse_name_and_options(self.path)
-
             exclude_org_members = '1' in options
             week_over_week = '2' in options
 
-            returned_values = {}
-            dataset_dict = datasets['issues']
-            external_dataset_dict = external_datasets['issues']
-            range_id = 'range' if not exclude_org_members else 'range_non_org'
-
-            for library_name in library_names:
-                if library_name in dataset_dict:
-                    dataset = dataset_dict[library_name]
-                elif library_name in external_dataset_dict:
-                    dataset = external_dataset_dict[library_name]
+            if week_over_week:
+                if exclude_org_members:
+                    return self.response(cut_output(datasets['issues']['eom_wow'], library_names))
                 else:
-                    errors.append(f"No {library_name} found in internal or external datasets for stars.")
-                    return {'errors': errors}
-
-                last_value = 0
-                last_week = dataset[0]['week']
-                for i in dataset:
-                    if week_over_week and last_week == i['week']:
-                        continue
-
-                    if i['dates'] in returned_values:
-                        returned_values[i['dates']][library_name] = i[range_id] - last_value
-                    else:
-                        returned_values[i['dates']] = {library_name: i[range_id] - last_value}
-
-                    last_value = i[range_id] if week_over_week else 0
-                    last_week = i['week']
-
-            returned_values = collections.OrderedDict(sorted(returned_values.items()))
-            returned_values = link_values(library_names, returned_values)
-            output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
-            output['day'] = list(returned_values.keys())[::-1]
-
-            # Trim down to a smaller number of points.
-            output = {k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0] for k, value in output.items()}
-
-            return self.response(output)
+                    return self.response({k: v for k, v in datasets['issues']['wow'].to_dict().items() if k in library_names + ['day']})
+            else:
+                if exclude_org_members:
+                    return self.response({k: v for k, v in datasets['issues']['eom'].to_dict().items() if k in library_names + ['day']})
+                else:
+                    return self.response({k: v for k, v in datasets['issues']['raw'].to_dict().items() if k in library_names + ['day']})
 
         return SimpleHTTPRequestHandler.do_GET(self)
 
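With preprocessing moved out of the app, the handlers do no aggregation at request time; they slice precomputed splits of the preprocessed datasets. /retrieveIssues dispatches between the 'raw', 'wow' (week over week), 'eom' (excluding org members) and 'eom_wow' splits, while /retrieveStars returns the 'wow' split in both branches as committed. The new cut_output helper trims trailing empty rows from a split: it appends a None sentinel to each selected column, finds the first None in every library column, and cuts all columns, 'day' included, at the largest such index. The following is a minimal sketch of that behavior, with a plain dict and made-up numbers standing in for the datasets.Dataset split:

# Sketch of cut_output against a plain dict; the toy columns are hypothetical
# stand-ins for a preprocessed split, not real metrics.
def cut_output_sketch(full_output: dict, library_names: list):
    output = full_output.items()  # the real helper calls full_output.to_dict().items()
    # Append a None sentinel so .index(None) always succeeds.
    output = {k: v + [None] for k, v in output if k in library_names + ['day']}
    # The largest "first None" across library columns marks the last populated row.
    last_value = max(output[k].index(None) for k in output.keys() if k != 'day')
    return {k: v[:last_value] for k, v in output.items()}

columns = {
    'day': ['2022-01-01', '2022-01-08', '2022-01-15'],
    'transformers': [10, 12, None],  # this series stops one week early
    'datasets': [3, 4, 5],
}

print(cut_output_sketch(columns, ['transformers', 'datasets']))
# {'day': ['2022-01-01', '2022-01-08', '2022-01-15'],
#  'transformers': [10, 12, None], 'datasets': [3, 4, 5]}

Because .index(None) returns the first gap in a column, the cut point is effectively the end of the longest selected series; shorter series keep their trailing None entries, which the front end can render as missing points.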
 
index.js CHANGED
@@ -122,16 +122,16 @@ const initialize = async () => {
     graphSelector.appendChild(graphSpan);
 
     if (inferJson.warnings.length > 0) {
-        const div = document.createElement('div');
-        div.classList.add('warning-div')
-
         for (const warning of inferJson.warnings) {
+            const div = document.createElement('div');
+            div.classList.add('warning-div')
+
             const labelSpan = document.createElement('span');
             labelSpan.textContent = `Warning: ${warning}`;
 
             div.appendChild(labelSpan);
+            warnings.appendChild(div);
         }
-        warnings.appendChild(div);
     }
 
     for (const element of inferJson.internal) {