meg-huggingface
committed on
Commit
•
258cdcb
1
Parent(s):
9cfc9cd
Rolling back to 8 hours ago
Browse files
- entrypoint.sh +3 -3
- failed_run.py +22 -31
entrypoint.sh
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
#!/bin/bash
|
2 |
|
3 |
-
export SPACE="
|
4 |
|
5 |
echo "Not checking h100 -- already know it's not there."
|
6 |
#python /check_h100.py
|
@@ -19,7 +19,7 @@ python /parse_requests.py | while read -r line; do
|
|
19 |
mkdir -p "$run_dir"
|
20 |
|
21 |
# Let the benchmarking begin!
|
22 |
-
optimum-benchmark --config-name "${experiment_name}" --config-dir /optimum-benchmark/examples/energy_star/ backend.model="${backend_model}" backend.processor="${backend_model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log" || (python /failed_run.py --run_dir "${run_dir}" --model_name "${backend_model}" && rm -rf $run_dir)
|
23 |
done
|
24 |
|
25 |
echo "Finished; uploading dataset results"
|
@@ -31,4 +31,4 @@ python /upload_run_folder.py --run_dir "/runs"
|
|
31 |
# Pausing space
|
32 |
echo "Pausing space."
|
33 |
python /pause_space.py
|
34 |
-
echo "Done."
|
|
|
1 |
#!/bin/bash
|
2 |
|
3 |
+
export SPACE="EnergyStarAI/launch-computation-example"
|
4 |
|
5 |
echo "Not checking h100 -- already know it's not there."
|
6 |
#python /check_h100.py
|
|
|
19 |
mkdir -p "$run_dir"
|
20 |
|
21 |
# Let the benchmarking begin!
|
22 |
+
optimum-benchmark --config-name "${experiment_name}" --config-dir /optimum-benchmark/examples/energy_star/ backend.model="${backend_model}" backend.processor="${backend_model}" hydra.run.dir="${run_dir}" 2> "${run_dir}/error.log" || (python /failed_run.py --run_dir "${run_dir}" --model_name "${backend_model}" && rm -rf $run_dir)
|
23 |
done
|
24 |
|
25 |
echo "Finished; uploading dataset results"
|
|
|
31 |
# Pausing space
|
32 |
echo "Pausing space."
|
33 |
python /pause_space.py
|
34 |
+
echo "Done."
|
failed_run.py
CHANGED
@@ -4,7 +4,6 @@ import os
|
|
4 |
from datasets import load_dataset, Dataset
|
5 |
from huggingface_hub import HfApi
|
6 |
|
7 |
-
|
8 |
TOKEN = os.environ.get("DEBUG")
|
9 |
api = HfApi(token=TOKEN)
|
10 |
|
@@ -23,39 +22,31 @@ parser.add_argument(
|
|
23 |
required=True,
|
24 |
help="Model to benchmark.",
|
25 |
)
|
26 |
-
parser.add_argument(
|
27 |
-
"--reason",
|
28 |
-
default=None,
|
29 |
-
type=str,
|
30 |
-
required=False,
|
31 |
-
help="Reason for failure -- to update in the requests file",
|
32 |
-
)
|
33 |
|
34 |
args = parser.parse_args()
|
35 |
|
36 |
# Updating request
|
37 |
-
dataset = load_dataset("
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
# If we have a custom reason for failure, add that instead of generic FAILED.
|
42 |
-
if args.reason:
|
43 |
-
dataset.loc[dataset["model"].isin([args.model_name]), ['status']] = args.reason
|
44 |
-
else:
|
45 |
-
# TODO: This doesn't have to be try-except, we could actually check if the file is there...
|
46 |
-
try:
|
47 |
-
# Read error message
|
48 |
-
with open(f"{args.run_dir}/error.log", 'r') as file:
|
49 |
-
for f in file.readlines():
|
50 |
-
if 'Traceback (most recent call last):' in f:
|
51 |
-
error_message = f
|
52 |
-
dataset.loc[dataset["model"].isin([args.model_name]), ['status']] = "FAILED"
|
53 |
-
print("Status set to FAILED")
|
54 |
-
else:
|
55 |
-
dataset.loc[dataset["model"].isin([args.model_name]), ['status']] = "COMPLETED"
|
56 |
-
# Add a new column for the error message if necessary
|
57 |
-
except FileNotFoundError as e:
|
58 |
-
print(f"Could not find {args.run_dir}/error.log")
|
59 |
-
|
60 |
updated_dataset = Dataset.from_pandas(dataset)
|
61 |
-
updated_dataset.push_to_hub("
|
|
|
|
4 |
from datasets import load_dataset, Dataset
|
5 |
from huggingface_hub import HfApi
|
6 |
|
|
|
7 |
TOKEN = os.environ.get("DEBUG")
|
8 |
api = HfApi(token=TOKEN)
|
9 |
|
|
|
22 |
required=True,
|
23 |
help="Model to benchmark.",
|
24 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
args = parser.parse_args()
|
27 |
|
28 |
# Updating request
|
29 |
+
dataset = load_dataset("EnergyStarAI/requests_debug", split="test",
|
30 |
+
token=TOKEN).to_pandas()
|
31 |
+
|
32 |
+
# Set benchmark to failed
|
33 |
+
# TODO: This doesn't have to be try-except, we could actually check if the file is there.
|
34 |
+
try:
|
35 |
+
# Read error message
|
36 |
+
with open(f"{args.run_dir}/error.log", 'r') as file:
|
37 |
+
for f in file.readlines():
|
38 |
+
if 'Traceback (most recent call last):' in f:
|
39 |
+
error_message = f
|
40 |
+
dataset.loc[dataset["model"].isin([args.model_name]), [
|
41 |
+
'status']] = "FAILED"
|
42 |
+
print("Status set to FAILED")
|
43 |
+
else:
|
44 |
+
dataset.loc[dataset["model"].isin([args.model_name]), [
|
45 |
+
'status']] = "COMPLETED"
|
46 |
+
# Add a new column for the error message if necessary
|
47 |
+
except FileNotFoundError as e:
|
48 |
+
print(f"Could not find {args.run_dir}/error.log")
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
updated_dataset = Dataset.from_pandas(dataset)
|
51 |
+
updated_dataset.push_to_hub("EnergyStarAI/requests_debug", split="test",
|
52 |
+
token=TOKEN)
|