"""Benchmark Edge TPU model segmentations and keep the fastest one per TPU count.

For every model in ``fn_list`` and every TPU count from 1 to ``MAX_TPU_COUNT``
the script runs the multi-TPU object detection benchmark against each set of
compiled segment files that exists under ``seg_dir``, parses the timing figures
the benchmark writes to stderr, copies the best-performing segment files into
the current directory and prints a summary.

Several configuration sections below are switched on or off with a quoting
trick: a section that starts with a line reading ``#'''`` is live code, because
that line is only a comment.  Changing the opening line to a bare triple quote
turns everything up to the trailing ``#'''`` into an unused string literal,
which disables the section.
"""
import os
import subprocess
import time
import shutil
import re
import hashlib

# Models to benchmark.  A plain string is a single model; a nested list groups
# several input-size variants of one model, and results for the whole group are
# recorded under the first name in the list.
#'''
fn_list = [
    'tf2_ssd_mobilenet_v2_coco17_ptq',
    'ssd_mobilenet_v2_coco_quant_postprocess',
    'ssdlite_mobiledet_coco_qat_postprocess',
    'ssd_mobilenet_v1_coco_quant_postprocess',
    'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq',
    'efficientdet_lite0_320_ptq',
    'efficientdet_lite1_384_ptq',
    'efficientdet_lite2_448_ptq',
    'efficientdet_lite3_512_ptq',
    'efficientdet_lite3x_640_ptq',
    'yolov5n-int8',
    'yolov5s-int8',
    'yolov5m-int8',
    'yolov5l-int8',
    ['yolov8n_416_640px', 'yolov8n_384_640px', 'yolov8n_384_608px', 'yolov8n_352_608px'],
    ['yolov8s_416_640px', 'yolov8s_384_640px', 'yolov8s_384_608px', 'yolov8s_352_608px'],
    ['yolov8m_416_640px', 'yolov8m_384_640px', 'yolov8m_384_608px', 'yolov8m_352_608px'],
    ['yolov8l_416_640px', 'yolov8l_384_640px', 'yolov8l_384_608px', 'yolov8l_352_608px'],
    ['yolov9t_416_640px', 'yolov9t_384_640px', 'yolov9t_384_608px', 'yolov9t_352_608px', 'yolov9t_352_576px'],
    ['yolov9s_416_640px', 'yolov9s_384_640px', 'yolov9s_384_608px', 'yolov9s_352_608px', 'yolov9s_352_576px'],
    ['yolov9m_416_640px', 'yolov9m_384_640px', 'yolov9m_384_608px', 'yolov9m_352_608px', 'yolov9m_352_576px'],
    ['yolov9c_416_640px', 'yolov9c_384_640px', 'yolov9c_384_608px', 'yolov9c_352_608px', 'yolov9c_352_576px'],
    'ipcam-general-v8'
]

# Extra command-line flags, keyed by model name and then by segment count.
# Only the (currently disabled) segment-generation pass below reads this table.
custom_args = {
    'tf2_ssd_mobilenet_v2_coco17_ptq': {
        2: ["--diff_threshold_ns","100000"]},
    'ssd_mobilenet_v2_coco_quant_postprocess': {
        5: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs","--partition_search_step","3"]},
    'ssdlite_mobiledet_coco_qat_postprocess': {
        2: ["--diff_threshold_ns","100000"]},
    'efficientdet_lite3_512_ptq': {
        2: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs"],
        3: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs"],
        4: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs"],
        5: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs"],
        6: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs"],
        7: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs"]},
    'efficientdet_lite3x_640_ptq': {
        5: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs","--partition_search_step","2"],
        6: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs","--partition_search_step","3"]},
    'yolov5n-int8': {
        5: ["--partition_search_step","2"],
        6: ["--partition_search_step","2"],
        7: ["--partition_search_step","2"],
        8: ["--partition_search_step","2"]},
    'yolov5s-int8': {
        5: ["--partition_search_step","2"],
        6: ["--partition_search_step","2"],
        7: ["--partition_search_step","2"],
        8: ["--partition_search_step","2"]},
    'yolov5m-int8': {
        5: ["--partition_search_step","2"],
        6: ["--partition_search_step","2"],
        7: ["--partition_search_step","2"],
        8: ["--partition_search_step","2"]},
    'yolov5l-int8': {
        5: ["--undefok=enable_multiple_subgraphs","--enable_multiple_subgraphs","--partition_search_step","2"],
        6: ["--partition_search_step","2"],
        7: ["--partition_search_step","2"],
        8: ["--partition_search_step","2"]},
    'yolov8m_416_640px': {
        5: ["--partition_search_step","2"],
        6: ["--partition_search_step","3"],
        7: ["--partition_search_step","4"],
        8: ["--partition_search_step","5"]},
    'yolov8l_416_640px': {
        4: ["--partition_search_step","2"],
        5: ["--partition_search_step","2"],
        6: ["--partition_search_step","3"],
        7: ["--partition_search_step","4"],
        8: ["--partition_search_step","5"]},
    'yolov9c_416_640px': {
        2: ["--delegate_search_step","10"]},
    'yolov9c_384_640px': {
        1: ["--delegate_search_step","10"],
        2: ["--delegate_search_step","10"]},
    'yolov9c_384_608px': {
        1: ["--delegate_search_step","10"],
        2: ["--delegate_search_step","10"]},
    'yolov9c_352_608px': {
        1: ["--delegate_search_step","10"],
        2: ["--delegate_search_step","10"]},
    'yolov9c_352_576px': {
        1: ["--delegate_search_step","10"],
        2: ["--delegate_search_step","10"]}}#'''

# Disabled: an alternative, mostly commented-out model list.
'''
fn_list = [
#    'yolov5n-int8',
#    'yolov5s-int8',
#    'yolov5m-int8',
#    'yolov5l-int8',
#    'yolov8n_full_integer_quant',
#    'yolov8s_full_integer_quant',
#    'yolov8m_full_integer_quant',
#    'yolov8l_full_integer_quant',
#    'yolov8n_480px',
#    'yolov8s_480px',
#    'yolov8m_480px',
#    'yolov8l_480px',
#    'yolov8n_512px',
#    'yolov8s_512px',
#    'yolov8m_512px',
#    'yolov8l_512px',
#    'yolov8s_544px',
#    'yolov8m_544px', # lg 1st seg
#    'yolov8l_544px', # lg 1st seg
#    'yolov8s_576px',
#    'yolov8m_576px', # lg 1st seg
#    'yolov8l_576px', # lg 1st seg
#    'yolov8s_608px',
#    'yolov8m_608px', # lg 1st seg
#    'yolov8l_608px',
#    'yolov8n_640px',
#    'yolov8s_640px',
#    'yolov8m_640px', # lg 1st seg
#    'yolov8l_640px', # lg 1st seg
#    'yolov8n_416_640px', # lg 1st seg
    'yolov8s_416_640px', # lg 1st seg
    'yolov8m_416_640px', # lg 1st seg
    'yolov8l_416_640px'] # lg 1st seg
#    'ipcam-general-v8']
#'''

# Disabled: an older custom_args table.
'''
custom_args = {
    'yolov8n_full_integer_quant': {
        2: ["--diff_threshold_ns","100000"],
        3: ["--diff_threshold_ns","200000"]},
    'yolov8s_full_integer_quant': {
        2: ["--diff_threshold_ns","200000"]},
    'yolov8l_full_integer_quant': {
        5: ["--partition_search_step","2"]},
    'yolov8n_480px': {
        2: ["--diff_threshold_ns","100000"],
        3: ["--diff_threshold_ns","200000"]},
    'yolov8s_480px': {
        2: ["--diff_threshold_ns","200000"]},
    'yolov8m_480px': {
        5: ["--partition_search_step","2"]},
    'yolov8n_512px': {
        2: ["--diff_threshold_ns","1200000"],
        3: ["--diff_threshold_ns","600000"]},
    'yolov8s_512px': {
        2: ["--diff_threshold_ns","200000"]},
    'yolov8m_640px': {
        2: ["--diff_threshold_ns","200000", "--undefok=timeout_sec","--timeout_sec=360"]},
    'yolov8l_640px': {
        2: ["--undefok=timeout_sec","--timeout_sec=360"]},
    'yolov8n_416_640px': {
        5: ["--partition_search_step","2"]},
    'yolov8s_416_640px': {
        5: ["--partition_search_step","2"]},
    'yolov8m_416_640px': {
        5: ["--initial_lower_bound_ns","44658311","--initial_upper_bound_ns","45466138","--partition_search_step","2"],
        6: ["--initial_lower_bound_ns","39444004","--initial_upper_bound_ns","40071927","--partition_search_step","3"],
        7: ["--initial_lower_bound_ns","36028652","--initial_upper_bound_ns","37012866","--partition_search_step","4"],
        8: ["--initial_lower_bound_ns","33892323","--initial_upper_bound_ns","34856571","--partition_search_step","5"]},
    'yolov8l_416_640px': {
        5: ["--initial_lower_bound_ns","82297482","--initial_upper_bound_ns","82892528","--partition_search_step","2"],
        6: ["--initial_lower_bound_ns","69966647","--initial_upper_bound_ns","70757195","--partition_search_step","3"],
        7: ["--initial_lower_bound_ns","69067450","--initial_upper_bound_ns","69599451","--partition_search_step","4"],
        8: ["--initial_lower_bound_ns","55889854","--initial_upper_bound_ns","56444625","--partition_search_step","5"]}}#'''

# Disabled: per-model thresholds read only by the 'saturated' branch of the
# segment-generation pass.
'''
diff_threshold_ns = {
    'yolov8s_416_640px': {
        2: 4000000},
    'yolov8m_416_640px': {
        4: 40000000,
        5: 30000000},
    'yolov8l_416_640px': {
        7: 90000000,
        8: 70000000}}#'''

# Disabled: a reduced custom_args table.
'''
custom_args = {
    'yolov8m_416_640px': {
        5: ["--partition_search_step","2"],
        6: ["--partition_search_step","3"],
        7: ["--partition_search_step","4"],
        8: ["--partition_search_step","5"]},
    'yolov8l_416_640px': {
        4: ["--partition_search_step","2"],
        5: ["--partition_search_step","2"],
        6: ["--partition_search_step","3"],
        7: ["--partition_search_step","4"],
        8: ["--partition_search_step","5"]}}#'''

# Root directory holding the source .tflite models and, in one sub-directory
# per segmentation strategy, the compiled segment files.
seg_dir = "/home/seth/Documents/all_segments/"

# Sub-directories (relative to seg_dir, with trailing slash) to look in.  The
# empty string means the files sit directly in seg_dir.
seg_types = ['', '2x_first_seg/', '15x_first_seg/', '3x_first_seg/', '4x_first_seg/', '15x_last_seg/', '2x_last_seg/', 'dumb/']


def seg_exists(filename, segment_type, segment_count):
    """Build the expected segment file paths and report whether any is missing.

    Args:
        filename: Model base name, without directory or extension.
        segment_type: Sub-directory of ``seg_dir`` including its trailing
            slash, ``''`` for ``seg_dir`` itself.  The value ``'orig_code'``
            is treated the same as ``''``.
        segment_count: Number of segments the model was split into.

    Returns:
        A ``(paths, file_missing)`` tuple.  ``paths`` holds one
        ``<name>_edgetpu.tflite`` path when ``segment_count`` is 1, otherwise
        one ``<name>_segment_<i>_of_<count>_edgetpu.tflite`` path for each
        ``i`` in ``range(segment_count)``.  ``file_missing`` is True when at
        least one of those paths does not exist.
    """
    if segment_type == 'orig_code':
        segment_type = ''

    prefix = seg_dir + segment_type + filename
    if segment_count == 1:
        seg_list = [prefix + '_edgetpu.tflite']
    else:
        seg_list = [prefix + '_segment_{}_of_{}_edgetpu.tflite'.format(i, segment_count)
                    for i in range(segment_count)]

    # A generator lets any() stop at the first missing file.
    return (seg_list, any(not os.path.exists(s) for s in seg_list))


MAX_TPU_COUNT = 5

# Disabled: the pass that compiles whatever segment files are still missing.
'''
# Generate segment files
for sn in range(1,MAX_TPU_COUNT+1):
    flat_fn_list = []
    for fn in fn_list:
        if isinstance(fn, list):
            flat_fn_list += fn
        else:
            flat_fn_list.append(fn)

    for fn in flat_fn_list:
        for seg_type in seg_types:
            seg_list, file_missing = seg_exists(fn, seg_type, sn)

            if not file_missing:
                continue

            if sn == 1:
                cmd = ["/usr/bin/edgetpu_compiler","-s","-d","--out_dir",seg_dir+seg_type,seg_dir+fn+".tflite"]
            elif 'dumb' in seg_type:
                cmd = ["/usr/bin/edgetpu_compiler","-s","-d","-n",str(sn),"--out_dir",seg_dir+seg_type,seg_dir+fn+".tflite"]
            elif 'saturated' in seg_type:
                try:
                    cmd = ["libcoral/out/k8/tools/partitioner/partition_with_profiling","--output_dir",seg_dir+seg_type,"--edgetpu_compiler_binary",
                           "/usr/bin/edgetpu_compiler","--model_path",seg_dir+fn+".tflite","--num_segments",str(sn),
                           "--diff_threshold_ns", str(diff_threshold_ns[fn][sn])]
                except:
                    # Note: "Saturated segments" is an attempt to load as much of the model as possible onto segments
                    # while ignoring the latency incurred by slower segments. We assume we'll be able to "speed up"
                    # these slower segments simply by running more copies of them. The faster segments ideally will
                    # be optimized to all run at roughly the same speed. Thus the overall inference throughput will
                    # be limited by how many multiples of the slowest segment we can run.
                    #
                    # diff_threshold_ns key entries only exist where we want to create "saturated segments". More would
                    # mean the model is too sparse across segments. We create saturated segments by adjusting the
                    # diff_threshold_ns until the compiler just starts pushing parameters off of the TPUs. Ideally
                    # this will result in one or two slow segments and the rest of the segments are roughly equally
                    # fast.
                    continue
            else:
                if '2x_first_seg' in seg_type:
                    #+++ b/coral/tools/partitioner/profiling_based_partitioner.cc
                    #@@ -190,6 +190,8 @@ int64_t ProfilingBasedPartitioner::PartitionCompileAndAnalyze(
                    #   latencies = std::get<2>(coral::BenchmarkPartitionedModel(
                    #       tmp_edgetpu_segment_paths, &edgetpu_contexts(), kNumInferences));
                    #+  latencies[0] /= 2;
                    #   if (kUseCache) {
                    #     for (int i = 0; i < num_segments_; ++i) {
                    #       segment_latency_cache_[{segment_starts[i], num_ops[i]}] = latencies[i];
                    #@@ -211,10 +213,11 @@ std::pair ProfilingBasedPartitioner::GetBounds(
                    #       num_segments_, /*search_delegate=*/true,
                    #       delegate_search_step))
                    #       << "Can not compile initial partition.";
                    #-  const auto latencies = std::get<2>(coral::BenchmarkPartitionedModel(
                    #+  auto latencies = std::get<2>(coral::BenchmarkPartitionedModel(
                    #       tmp_edgetpu_segment_paths, &edgetpu_contexts(), kNumInferences));
                    #
                    #   DeleteFolder(tmp_dir);
                    #+  latencies[0] /= 4;
                    #
                    #   int64_t lower_bound = std::numeric_limits::max(), upper_bound = 0;
                    #   for (auto latency : latencies) {
                    #
                    # sudo make DOCKER_IMAGE="ubuntu:20.04" DOCKER_CPUS="k8" DOCKER_TARGETS="tools" docker-build

                    #// Encourage each segment slower than the previous to spread out the bottlenecks
                    #double latency_adjust = 1.0;
                    #for (int i = 1; i < num_segments_; ++i)
                    #{
                    #  if (latencies[i-1] < latencies[i])
                    #    latency_adjust *= 0.97;
                    #  latencies[i-1] *= latency_adjust;
                    #}
                    #latencies[num_segments_-1] *= latency_adjust;
                    partition_with_profiling_dir = "libcoral/tools.2"
                elif '15x_first_seg' in seg_type:
                    partition_with_profiling_dir = "libcoral/tools.15"
                elif '133x_first_seg' in seg_type:
                    partition_with_profiling_dir = "libcoral/tools.133"
                elif '166x_first_seg' in seg_type:
                    partition_with_profiling_dir = "libcoral/tools.166"
                elif '3x_first_seg' in seg_type:
                    partition_with_profiling_dir = "libcoral/tools.3"
                elif '4x_first_seg' in seg_type:
                    partition_with_profiling_dir = "libcoral/tools.4"
                elif '15x_last_seg' in seg_type:
                    partition_with_profiling_dir = "libcoral/tools.last15"
                elif '2x_last_seg' in seg_type:
                    partition_with_profiling_dir = "libcoral/tools.last2"
                elif '125x_last_inc_seg/' == seg_type:
                    partition_with_profiling_dir = "libcoral/tools.last125_inc_seg"
                elif '2x_first_125x_last_inc_seg/' == seg_type:
                    partition_with_profiling_dir = "libcoral/tools.2last125_inc_seg"
                elif 'inc_seg/' == seg_type:
                    partition_with_profiling_dir = "libcoral/tools.inc_seg"
                else:
                    partition_with_profiling_dir = "libcoral/tools.orig"

                cmd = [partition_with_profiling_dir+"/partitioner/partition_with_profiling","--output_dir",seg_dir+seg_type,"--edgetpu_compiler_binary",
                       "/usr/bin/edgetpu_compiler","--model_path",seg_dir+fn+".tflite","--num_segments",str(sn)]

            try:
                cmd += custom_args[fn][sn]
            except:
                pass

            print(cmd)
            subprocess.run(cmd)#'''

# Segment directories that are benchmarked when present; they are added only
# after the (disabled) generation pass above.
seg_types += ['133x_first_seg/', '166x_first_seg/', 'inc_seg/', '125x_last_inc_seg/', '2x_first_125x_last_inc_seg/']

# Benchmark script that is run for every candidate segmentation.
exe_file = "/home/seth/CodeProject.AI-ObjectDetectionCoral/objectdetection_coral_multitpu.py"

# Patterns for the two figures read from the benchmark's stderr.  Compiled once
# here instead of on every run inside the innermost loop.
MS_PER_INFERENCE_RE = re.compile(r'threads; ([\d\.]+)ms ea')
TENSOR_MPX_RE = re.compile(r'; ([\d\.]+) tensor MPx')


def _first_float(pattern, text):
    """Return the first captured group of ``pattern`` in ``text`` as a float.

    Raises:
        RuntimeError: if ``pattern`` does not occur in ``text``.
    """
    match = pattern.search(text)
    if match is None:
        raise RuntimeError(
            "benchmark output did not contain %r" % pattern.pattern)
    return float(match.group(1))


# Test timings
fin_timings = {}   # model name -> {TPU count: best timing record}
fin_fnames = {}    # model name -> {TPU count: [(copied file name, md5 hex digest), ...]}
for fn in fn_list:
    # A nested list is a group of size variants; results are keyed by the
    # first variant's name.
    if isinstance(fn, list):
        fn_size_list = fn
        fn = fn[0]
    else:
        fn_size_list = [fn]

    # Each record is (ms_time, num_tpus, model, seg_type, segment_count, mpps_time).
    # NOTE(review): this list is not reset between TPU counts, so a result
    # measured with fewer TPUs can still be picked as the best entry for a
    # larger TPU count -- confirm that this is intended.
    timings = []
    fin_timings[fn] = {}
    fin_fnames[fn] = {}
    for num_tpus in range(1,MAX_TPU_COUNT+1):
        for this_fn in fn_size_list:
            for seg_type in seg_types:
                for sn in range(1,num_tpus+1):
                    # No need to run many slow single TPU tests, just one
                    if sn == 1 and seg_type != '':
                        continue

                    # Get file types
                    seg_list, file_missing = seg_exists(this_fn, seg_type, sn)

                    if file_missing:
                        continue

                    cmd = ["python3.9",exe_file,"--model"] + \
                          seg_list + ["--labels","coral/pycoral/test_data/coco_labels.txt","--input","/home/seth/coral/pycoral/test_data/grace_hopper.bmp",
                                      "--count","4000","--num-tpus",str(num_tpus)]
                    print(cmd)

                    # Clock runtime
                    #start_time = time.perf_counter()
                    #subprocess.run(cmd)
                    #ms_time = 1000 * (time.perf_counter() - start_time) / 4000 # ms * total time / iterations

                    # Last quarter runtime
                    # check=True: a non-zero exit status raises CalledProcessError,
                    # which is not caught here and therefore stops the whole sweep.
                    # Only a timeout is skipped.
                    try:
                        c = subprocess.run(cmd, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=3600*2)
                    except subprocess.TimeoutExpired:
                        print("Timed out!")
                        continue

                    # Echo the output before parsing so it is visible even if
                    # the expected timing lines are absent.
                    print(c.stdout)
                    print(c.stderr)
                    ms_time = _first_float(MS_PER_INFERENCE_RE, c.stderr)
                    mpps_time = _first_float(TENSOR_MPX_RE, c.stderr)

                    timings.append((ms_time, num_tpus, this_fn, seg_type, sn, mpps_time))
                    subprocess.run(['uptime'])

        # Highest tensor MPx/sec first.
        timings = sorted(timings, key=lambda rec: rec[5], reverse=True)

        if not timings:
            continue

        # Print the top ten
        print(f"TIMINGS FOR {num_tpus} TPUs AND {fn} MODEL:")
        for rec in timings[:10]:
            print(rec)

        # Get best segments, but
        # Skip if it's not 'orig_code' and > 1 segment
        best = next(rec for rec in timings if rec[3] != 'orig_code')

        fin_timings[fn][num_tpus] = timings[0]

        # Add segment to the final list
        # Copy best to local dir
        seg_list, _ = seg_exists(best[2], best[3], best[4])
        fin_fnames[fn][num_tpus] = []
        for s in seg_list:
            # The copy is named "<parent directory>_<file name>" so that files
            # from different segment directories do not collide.
            file_components = os.path.normpath(s).split("/")
            out_fname = file_components[-2]+"_"+file_components[-1]
            shutil.copyfile(s, out_fname)
            # Close the file promptly instead of leaving the handle to the
            # garbage collector.
            with open(out_fname, 'rb') as seg_file:
                checksum = hashlib.md5(seg_file.read()).hexdigest()
            fin_fnames[fn][num_tpus].append((out_fname, checksum))

        # Create archive for this model / TPU count
        #if len(fin_fnames[fn][num_tpus]) > 1 or num_tpus == 1:
        #    zip_name = f'objectdetection-{fn}-{num_tpus}-edgetpu.zip'
        #    cmd = ['zip', '-9', zip_name] + fin_fnames[fn][num_tpus]
        #    print(cmd)
        #    if os.path.exists(zip_name):
        #        os.unlink(zip_name)
        #    subprocess.run(cmd)

print(fin_timings)
print(fin_fnames)

# Pretty print all of the segments we've timed and selected
for fn, v in fin_fnames.items():
    print(" '%s': {" % fn)
    for tpu_count, timing in fin_timings[fn].items():
        if tpu_count in v:
            seg_str = f"{len(v[tpu_count])} segments"
        else:
            seg_str = "1 segment "
        fps = 1000.0 / timing[0]
        print(f"#{timing[0]:6.1f} ms/inference ({fps:5.1f} FPS;{timing[5]:5.1f} tensor MPx/sec) for {tpu_count} TPUs using {seg_str}: {timing[2]}")
    for tpu_count, out_fnames in v.items():
        if len(out_fnames) > 1:
            print(f"{tpu_count}: "+str(out_fnames)+",")
    if 1 in v:
        # NOTE(review): v[1][0] is an (out_fname, checksum) tuple, so the whole
        # tuple is printed between the quotes -- confirm that this is the
        # intended output format.
        print(f" '_tflite': '{v[1][0]}'")
    print(" },")