diff --git a/compiler_flags.py b/compiler_flags.py index 26c88ca..309e519 100644 --- a/compiler_flags.py +++ b/compiler_flags.py @@ -47,7 +47,7 @@ def get_command_line_string(self, value): else: return "" else: - return "%s %s" % (self.name, value.__str__( )) + return "%s=%s" % (self.name, value.__str__( )) class Size: """Models a tile, block or grid size""" @@ -202,7 +202,15 @@ def random_value(self): self.block_size.random_value(), self.grid_size.random_value()) return per_kernel_size_info + + def init_value(self, tile_size, block_size, grid_size): + per_kernel_size_info = collections.OrderedDict() + per_kernel_size_info[SizesFlag.ALL_KERNELS_SENTINEL] = SizeTuple(TileSize(tile_size), + BlockSize(block_size), + GridSize(grid_size)) + return per_kernel_size_info + def permute(self, value): per_kernel_size_info = collections.OrderedDict() for kernel_number, size_tuple in value.iteritems(): @@ -301,8 +309,8 @@ class PPCG: flag_map[no_isl_schedule_separate_components] = EnumerationFlag(no_isl_schedule_separate_components) flag_map[no_wrap] = EnumerationFlag(no_wrap) flag_map[no_scale_tile_loops] = EnumerationFlag(no_scale_tile_loops) - flag_map[no_shared_memory] = EnumerationFlag(no_shared_memory) - flag_map[no_private_memory] = EnumerationFlag(no_private_memory) + flag_map[no_shared_memory] = EnumerationFlag(no_shared_memory, [True, False]) + flag_map[no_private_memory] = EnumerationFlag(no_private_memory, [True, False]) flag_map[no_live_range_reordering] = EnumerationFlag(no_live_range_reordering) optimisation_flags = [] diff --git a/enums.py b/enums.py index c67f0e0..7c0d7bb 100644 --- a/enums.py +++ b/enums.py @@ -1,6 +1,7 @@ class Targets: cuda = "cuda" opencl = "opencl" + prl = "prl" class Crossover: one_point = "one_point" @@ -18,8 +19,11 @@ class SearchStrategy: ga = "ga" random = "random" simulated_annealing = "simulated-annealing" + exhaustive = "exhaustive" class Status: passed = "passed" failed = "failed" - \ No newline at end of file + timeout = "timedout" + ppcgtimeout = "ppcg_timeout" + diff --git a/explore-params.py b/explore-params.py new file mode 100644 index 0000000..be92d6b --- /dev/null +++ b/explore-params.py @@ -0,0 +1,21 @@ +# This file contains the PPCG parameter values that are explored. +# The exploration script considers each combination of the parameter values. + +([ + # Tile sizes + [(16,16), (32,32), (64,64)], + + # Block sizes + [(1,1), (1,2), (1,4), (1,8), + (16,16), (32,32), (64,64)], + + # Grid sizes + [(16,16), (32,32), (256,256), (1024,1024)], + + #Shared memory + [False], + + #private memory + [False, True], + +]) diff --git a/heuristic_search.py b/heuristic_search.py index caece42..3994bfa 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -9,6 +9,11 @@ import individual import collections import internal_exceptions +import itertools +import os +from Queue import Queue +from threading import Thread +import sys class SearchStrategy: """Abstract class for a search strategy""" @@ -20,9 +25,16 @@ def run(self): @abc.abstractmethod def summarise(self): pass + + @abc.abstractmethod + def logall(self): + pass class GA(SearchStrategy): """Search using a genetic algorithm""" + + def logall(self): + return def set_child_flags(self, child, the_flags, the_flag_values): for idx, flag in enumerate(the_flags): @@ -291,22 +303,357 @@ def summarise(self): except internal_exceptions.NoFittestException: pass + def logall(self): + for i in self.individuals: + debug.summary_message(i.ppcg_cmd_line_flags, False) + return + +compile_queue = Queue(10) +run_queue = Queue(10) + +class CompileThread(Thread): + def run(self): + global compile_queue + global run_queue + while True: + testcase = compile_queue.get() + if isinstance(testcase, individual.EndOfQueue): + run_queue.put(testcase) + break + + testcase.ppcg() + testcase.build() + run_queue.put(testcase) + +class RunThread(Thread): + def __init__(self, num_threads): + super(RunThread, self).__init__() + self.num_threads = num_threads + self.individuals = [] + + def run(self): + global run_queue + best_time = float("inf") + f = open(config.Arguments.results_file + ".log", 'a') + f_iter = open('.lastiter', 'w') + while True: + #print('***run thread waiting') + testcase = run_queue.get() + if isinstance(testcase, individual.EndOfQueue): + self.num_threads = self.num_threads - 1 + print('***remaining threads: ' + str(self.num_threads)) + if self.num_threads<=0: + try: + os.remove('.lastiter') + self.summarise() + self.logall() + except: + pass + print('***run thread exiting') + break + continue + #print('***run thread got job') + testcase.binary(best_time) + f_iter.seek(0) + f_iter.write(str(testcase.get_ID())) + + if testcase.execution_time < best_time and testcase.execution_time != 0 and testcase.status == enums.Status.passed: + self.individuals.append(testcase) + best_time = testcase.execution_time + f.write("\n Best iter so far = \n") + f.write(str(testcase)) + f.flush() + + def summarise(self): + print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30)) + try: + fittest = individual.get_fittest(self.individuals) + debug.summary_message("The fittest individual had execution time %f seconds" % (fittest.execution_time)) + debug.summary_message("To replicate, pass the following to PPCG:") + debug.summary_message(fittest.ppcg_cmd_line_flags, False) + except internal_exceptions.NoFittestException: + pass + + def logall(self): + print("%s Log of all runs %s" %('*' * 30, '*' * 30)) + for i in self.individuals: + print(i) + debug.summary_message(i.ppcg_cmd_line_flags, False) + pass + +class Exhaustive(SearchStrategy): + """Exhaustive search all the values in the specified range or """ + """all combinations provided in explore-params.py file""" + + def readParamValues(self): + f = open('explore-params.py', 'r') + paramValues = eval(f.read()) + f.close() + return paramValues + + def countConfigs(self, paramValues): + n = 1 + for i in paramValues: + n *= len(i) + return n + + def createExhaConfigs(self): + tile_size_lb = config.Arguments.tile_size_range[0] + tile_size_ub = config.Arguments.tile_size_range[1] + if config.Arguments.only_powers_of_two: + tile_size_range = [2**i for i in range(tile_size_lb, tile_size_ub)] + else: + tile_size_range = range(tile_size_lb, tile_size_ub) + + tile_sizes = itertools.product(tile_size_range, repeat=config.Arguments.tile_dimensions) + + block_size_lb = config.Arguments.block_size_range[0] + block_size_ub = config.Arguments.block_size_range[1] + if config.Arguments.only_powers_of_two: + block_size_range = [2**i for i in range(block_size_lb, block_size_ub)] + else: + block_size_range = range(block_size_lb, block_size_ub) + + block_sizes = itertools.product(block_size_range, repeat=config.Arguments.block_dimensions) + + grid_size_lb = config.Arguments.grid_size_range[0] + grid_size_ub = config.Arguments.grid_size_range[1] + if config.Arguments.only_powers_of_two: + grid_size_range = [2**i for i in range(grid_size_lb, grid_size_ub)] + else: + grid_size_range = range(grid_size_lb, grid_size_ub) + + grid_sizes = itertools.product(grid_size_range, repeat=config.Arguments.grid_dimensions) + + if config.Arguments.no_shared_memory: + shared_mem = [True, False] + else: + shared_mem = [False] + + if config.Arguments.no_private_memory: + private_mem = [True, False] + else: + private_mem = [False] + + paramValues = [tile_sizes, block_sizes, grid_sizes, shared_mem, private_mem] + return paramValues + + def get_last_iter(self): + if os.path.isfile(".lastiter"): + print("found last iter") + try: + f_iter = open(".lastiter", 'r+') + start_iter = int(f_iter.readline()) + except: + start_iter = 0 + pass + f_iter.close() + print("starting from test case = ", start_iter) + else: + start_iter = 0 + + return start_iter + + def pipelineExec(self, combs): + + start_iter = self.get_last_iter() + num_threads = config.Arguments.num_compile_threads + for i in range(num_threads): + t = CompileThread() + t.daemon = True + t.start() + + RunThread(num_threads).start() + + cnt = 0 + for conf in combs: + if cnt < start_iter: + cnt += 1 + continue + print '---- Configuration ' + str(cnt) + ': ' + str(conf) + cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4]) + cur.set_ID(cnt) + cnt += 1 + compile_queue.put(cur) + + for i in range(num_threads): + compile_queue.put(individual.EndOfQueue()) # So every CompileThread fetches one EndOfQueue element + + def tile_size_multiple_filter(self, conf): + tile_size = conf[0] + block_size = conf[1] + + work_group_size = reduce(lambda x,y: x*y, block_size) + if work_group_size > config.Arguments.max_work_group_size: + return False + + if work_group_size < config.Arguments.min_work_group_size: + return False + + mul_factor = 1 + for t, b in zip(tile_size, block_size): + if t < b: + return False + if t % b != 0: + return False + mul_factor *= t/b + + if mul_factor > 36: + return False + + return True + + def run(self): + self.individuals = [] + self.multi_kernel = False + self.output_stream = open(config.Arguments.results_file, 'w') + if config.Arguments.no_concurrent_kernel_tuning: + self.multi_kernel = True + self.tune_kernel(compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL) + self.print_summary() + return + + for k in config.Arguments.kernels_to_tune: + self.individuals = [] + self.tune_kernel(k) + self.print_summary() + self.output_stream.close() + + def tune_kernel(self, ker_num): + + if config.Arguments.params_from_file: + paramValues = self.readParamValues() + else: + paramValues = self.createExhaConfigs() + + cnt = 0 + combs = itertools.product(*paramValues) + + + if config.Arguments.filter_testcases: + #Filter out only test cases based on heusristics such as tile size is multiple of block size etc.. + combs = filter(self.tile_size_multiple_filter, combs) + #Filter out only test cases where shared memory is true + #combs = filter(lambda conf: conf[3] == True, combs) + #Filter out only test cases where private memory is true + #combs = filter(lambda conf: conf[4] == True, combs) + + if config.Arguments.parallelize_compilation: + self.pipelineExec(combs) + return + + start_iter = self.get_last_iter() + + f = open(config.Arguments.results_file + ".log", 'a') + f_iter = open('.lastiter', 'w') + + best_time = float("inf") + best_kernel_time = [] + self.best_kernel_run = [] + if self.multi_kernel: + for s in config.Arguments.kernels_to_tune: + best_kernel_time.append(float("inf")) + self.best_kernel_run.append(0) + #print 'Parameter values to be explored: ' + str(paramValues) + #print 'Number of configurations: ' + str(self.countConfigs(paramValues)) + for conf in combs: + if cnt < start_iter: + cnt += 1 + continue + print '---- Configuration ' + str(cnt) + ': ' + str(conf) + cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], ker_num) + cur.set_ID(cnt) + cnt += 1 + cur.run(best_time) + if cur.status == enums.Status.ppcgtimeout : + f.write("\nppcg timeout") + f.write(str(best_run)) + f.flush() + continue + + if cur.execution_time == 0: + continue + + if self.multi_kernel: + #f.write("\n====================================\n") + for k in config.Arguments.kernels_to_tune: + if cur.per_kernel_time[k] < best_kernel_time[k]: + best_kernel_time[k] = cur.per_kernel_time[k] + self.best_kernel_run[k] = cur + f.write("\n Best time so far for kernel "+str(k) + " ID " + str(cnt) + " kernel time = " + str(best_kernel_time[k])) + f.write(str(cur.ppcg_cmd_line_flags) + str("\n")) + f.flush() + + if cur.execution_time < best_time and cur.status == enums.Status.passed: + self.individuals.append(cur) + best_time = cur.execution_time + best_run = cur + f.write("\n Best iter so far = "+ str(cnt) + "\n") + f.write(str(best_run)) + f.flush() + + f_iter.seek(0) + f_iter.write(str(cur.get_ID())) + + + def summarise_per_kernel(self): + for k in config.Arguments.kernels_to_tune: + print "Best config for kernel " + str(k) + print("had execution time %f ms" % (self.best_kernel_time[k])) + print("To replicate, use the following configuration:") + print(self.best_kernel_run[k].ppcg_cmd_line_flags, False) + + def summarise(self): + print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30)) + try: + fittest = individual.get_fittest(self.individuals) + debug.summary_message("The fittest individual had execution time %f seconds" % (fittest.execution_time)) + debug.summary_message("To replicate, pass the following to PPCG:") + debug.summary_message(fittest.ppcg_cmd_line_flags, False) + except internal_exceptions.NoFittestException: + pass + + def logall(self): + print("%s Log of all runs %s" %('*' * 30, '*' * 30)) + for i in self.individuals: + print(i) + debug.summary_message(i.ppcg_cmd_line_flags, False) + pass + + + def print_summary(self): + old_stdout = sys.stdout + try: + if config.Arguments.results_file is not None: + sys.stdout = self.output_stream + if self.multi_kernel: + self.summarise_per_kernel() + else: + self.summarise() + #self.logall() + finally: + if config.Arguments.results_file is not None: + sys.stdout = old_stdout + class SimulatedAnnealing(SearchStrategy): - """Search using simulated annealing""" - - def acceptance_probability(self, currentEnergy, newEnergy, temperature): + """Search using simulated annealing""" + + def acceptance_probability(self, currentEnergy, newEnergy, temperature): if newEnergy < currentEnergy: return 1.0 - return math.exp((currentEnergy - newEnergy) / temperature) - - def mutate_backend_flags(self, clone_flags, solution_flags): + return math.exp((currentEnergy - newEnergy) / temperature) + + def logall(self): + return + + def mutate_backend_flags(self, clone_flags, solution_flags): for the_flag in solution_flags.keys(): if bool(random.getrandbits(1)): idx = the_flag.possible_values.index(solution_flags[the_flag]) newIdx = (idx + 1) % len(the_flag.possible_values) clone_flags[the_flag] = the_flag.possible_values[newIdx] - def mutate(self, solution): + def mutate(self, solution): clone = copy.deepcopy(solution) clone.ID = individual.Individual.get_ID() for the_flag in solution.ppcg_flags.keys(): @@ -324,7 +671,7 @@ def mutate(self, solution): self.mutate_backend_flags(clone.nvcc_flags, solution.nvcc_flags) return clone - def run(self): + def run(self): debug.verbose_message("Creating initial solution", __name__) current = individual.create_random() current.run() @@ -344,8 +691,8 @@ def run(self): if current.execution_time < self.fittest.execution_time: self.fittest = current - def summarise(self): + def summarise(self): debug.summary_message("The final individual had execution time %f seconds" % (self.fittest.execution_time)) debug.summary_message("To replicate, pass the following to PPCG:") debug.summary_message(self.fittest.ppcg_cmd_line_flags, False) - \ No newline at end of file + diff --git a/individual.py b/individual.py index b5d23e3..e8b08ee 100644 --- a/individual.py +++ b/individual.py @@ -7,7 +7,15 @@ import enums import collections import subprocess +import threading import internal_exceptions +import time + +class EndOfQueue: + def __init__(self): + pass + + def get_fittest(population): fittest = None @@ -22,9 +30,43 @@ def get_fittest(population): raise internal_exceptions.NoFittestException("None of the individuals among this population completed successfully, hence there is no fittest individual") return fittest +def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, k=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL): + individual = Individual() + per_kernel_size_info = collections.OrderedDict() + per_kernel_size_info[k] = compiler_flags.SizeTuple(tile_size, block_size, grid_size) + individual.kernel_num = k + + #for flag in compiler_flags.PPCG.optimisation_flags: + # print(flag) + + #TODO: Get a better way of getting size_data_flag + flag = compiler_flags.PPCG.optimisation_flags[4] + individual.ppcg_flags[flag] = per_kernel_size_info + + if not shared_mem: + flag = compiler_flags.PPCG.optimisation_flags[0] + #individual.ppcg_flags[flag] = compiler_flags.EnumerationFlag(flag) + individual.ppcg_flags[flag] = True + + + if not private_mem: + flag = compiler_flags.PPCG.optimisation_flags[7] + #individual.ppcg_flags[flag] = compiler_flags.EnumerationFlag(flag) + individual.ppcg_flags[flag] = True + + #Set isl fusion flag + #flag = compiler_flags.PPCG.optimisation_flags[6] + #individual.ppcg_flags[flag] = fusion + #string = individual.ppcg_flags[flag].get_command_line_string(1024) + #print(string) + #print("end") + return individual + + def create_random(): individual = Individual() for flag in compiler_flags.PPCG.optimisation_flags: + print(flag) individual.ppcg_flags[flag] = flag.random_value() for flag in compiler_flags.CC.optimisation_flags: individual.cc_flags[flag] = flag.random_value() @@ -39,17 +81,36 @@ class Individual: ID = 0 @staticmethod - def get_ID(): + def get_ID_init(): Individual.ID += 1 return Individual.ID + def file_name(self): + if config.Arguments.binary_file_name: + return config.Arguments.binary_file_name + + return 'testcase'+str(self.ID) + #return 'gemm' + + def set_ID(self, num): + self.ID = num + + def get_ID(self): + return self.ID + def __init__(self): - self.ID = Individual.get_ID() + self.ID = Individual.get_ID_init() self.ppcg_flags = collections.OrderedDict() self.cc_flags = collections.OrderedDict() self.cxx_flags = collections.OrderedDict() self.nvcc_flags = collections.OrderedDict() self.status = enums.Status.failed + self.execution_time = float("inf") + self.num = 0 + self.kernel_num=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL + self.per_kernel_time = [] + for k in config.Arguments.kernels_to_tune: + self.per_kernel_time.append(float("inf")) def all_flags(self): return self.ppcg_flags.keys() + self.cc_flags.keys() + self.cxx_flags.keys() + self.nvcc_flags.keys() @@ -57,12 +118,15 @@ def all_flags(self): def all_flag_values(self): return self.ppcg_flags.values() + self.cc_flags.values() + self.cxx_flags.values() + self.nvcc_flags.values() - def run(self): + def run(self, timeout): try: - self.compile() + self.compile(timeout) if self.status == enums.Status.passed: # Fitness is inversely proportional to execution time - self.fitness = 1/self.execution_time + if self.execution_time == 0: + self.fitness = float("inf") + else: + self.fitness = 1/self.execution_time debug.verbose_message("Individual %d: execution time = %f, fitness = %f" \ % (self.ID, self.execution_time, self.fitness), __name__) else: @@ -70,68 +134,205 @@ def run(self): except internal_exceptions.FailedCompilationException as e: debug.exit_message(e) - def compile(self): + + def checkforpause(self): + while(1): + if os.path.isfile('.pause'): + print("Auto tuning paused, remove .pause to restart") + time.sleep(20) + else: + #print("Auto tuning restarted") + break + + def compile(self, timeout=float("inf")): + self.checkforpause() self.ppcg() + #sucess=self.ppcg_with_timeout(timeout) + #if not sucess: + # return self.build() - self.binary() + self.binary(timeout) def ppcg(self): self.ppcg_cmd_line_flags = "--target=%s --dump-sizes %s" % (config.Arguments.target, ' '.join(flag.get_command_line_string(self.ppcg_flags[flag]) for flag in self.ppcg_flags.keys())) os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags - debug.verbose_message("Running '%s'" % config.Arguments.ppcg_cmd, __name__) + + if config.Arguments.cmd_string_complete: + cmd = config.Arguments.ppcg_cmd+ ' '+self.ppcg_cmd_line_flags + elif config.Arguments.target == enums.Targets.cuda: + cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name() + else: + cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()+'_host.c' + + debug.verbose_message("Running '%s'" % cmd, __name__) + #debug.verbose_message("Running '%s'" % self.ppcg_cmd_line_flags , __name__) start = timeit.default_timer() - proc = subprocess.Popen(config.Arguments.ppcg_cmd, shell=True, stderr=subprocess.PIPE) - stderr = proc.communicate()[1] + self.ppcg_proc = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE) + stderr = self.ppcg_proc.communicate()[1] end = timeit.default_timer() config.time_PPCG += end - start - if proc.returncode: + if self.ppcg_proc.returncode: raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.ppcg_cmd) - # Store the sizes used by PPCG - self.size_data = compiler_flags.SizesFlag.parse_PPCG_dump_sizes(stderr) + + def ppcg_with_timeout(self, timeout=float("inf")): + thread = threading.Thread(target=self.ppcg) + thread.start() + thread.join(timeout) + if thread.is_alive(): + print("Timeout: terminating the ppcg ") + self.ppcg_proc.terminate() + thread.join(timeout) + self.status = enums.Status.ppcgtimeout + return False + return True + def build(self): - debug.verbose_message("Running '%s'" % config.Arguments.build_cmd, __name__) + if config.Arguments.cmd_string_complete: + build_cmd = config.Arguments.build_cmd + elif config.Arguments.target == enums.Targets.cuda: + build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe' + else: + build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' + ' -lprl -lOpenCL' + debug.verbose_message("Running '%s'" % build_cmd, __name__) start = timeit.default_timer() - proc = subprocess.Popen(config.Arguments.build_cmd, shell=True) + proc = subprocess.Popen(build_cmd, shell=True) stderr = proc.communicate()[1] end = timeit.default_timer() config.time_backend += end - start if proc.returncode: raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.build_cmd) + + - def binary(self): - time_regex = re.compile(r'^(\d*\.\d+|\d+)$') + def deleteFile(self, fileName): + try: + if os.path.exists(fileName): + os.remove(fileName) + except: + pass + + def extract_kernel_time(self, kernel_num, stdout): + re_str = r'kernel'+str(kernel_num)+'\s*:\s*(\d*.\d+)ms' + time_regex = re.compile(re_str) + total_time = 0.0 + + nmatchedlines = 0 + for line in stdout.split(os.linesep): + line = line.strip() + matches = time_regex.findall(line) + if matches: + nmatchedlines += 1 + try: + total_time += float(matches[0]) + except: + raise internal_exceptions.BinaryRunException("Execution time '%s' is not in the required format" % matches[0]) + if nmatchedlines == 0: + total_time = float("inf") + return total_time + + def update_kernel_times(self, stdout): + if not config.Arguments.prl_profiling: + return + for k in config.Arguments.kernels_to_tune: + self.per_kernel_time[k] = self.extract_kernel_time(k, stdout) + + def binary(self, best_execution_time=float("inf")): + #time_regex = re.compile(r'^(\d*\.\d+|\d+)$') + #print config.Arguments.execution_time_regex + if config.Arguments.prl_profiling: + if self.kernel_num == compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL: + re_str = r'compute\s*:\s*(\d*.\d+)ms' + else: + re_str = r'kernel'+str(self.kernel_num)+'\s*:\s*(\d*.\d+)ms' + else: + re_str = config.Arguments.execution_time_regex + + print re_str + time_regex = re.compile(re_str) total_time = 0.0 status = enums.Status.passed + num_actual_runs = 0 for run in xrange(1,config.Arguments.runs+1): - debug.verbose_message("Run #%d of '%s'" % (run, config.Arguments.run_cmd), __name__) + if config.Arguments.cmd_string_complete: + run_cmd = config.Arguments.run_cmd + else: + run_cmd = './'+self.file_name()+'.exe '+config.Arguments.run_cmd_input + #run_cmd = config.Arguments.run_cmd + debug.verbose_message("Run #%d of '%s'" % (run, run_cmd), __name__) start = timeit.default_timer() - proc = subprocess.Popen(config.Arguments.run_cmd, shell=True, stdout=subprocess.PIPE) - stdout, stderr = proc.communicate() + self.proc = subprocess.Popen(run_cmd, shell=True, stdout=subprocess.PIPE) + stdout, stderr = self.proc.communicate() end = timeit.default_timer() - if proc.returncode: - status = enums.Status.failed + if self.proc.returncode: + sper_kernel_size_infotatus = enums.Status.failed debug.warning_message("FAILED: '%s'" % config.Arguments.run_cmd) continue if config.Arguments.execution_time_from_binary: if not stdout: raise internal_exceptions.BinaryRunException("Expected the binary to dump its execution time. Found nothing") + self.update_kernel_times(stdout) + nmatchedlines = 0 for line in stdout.split(os.linesep): line = line.strip() matches = time_regex.findall(line) if matches: + nmatchedlines += 1 try: total_time += float(matches[0]) except: raise internal_exceptions.BinaryRunException("Execution time '%s' is not in the required format" % matches[0]) + if nmatchedlines == 0: + raise internal_exceptions.BinaryRunException("Regular expression did not match anything on the program's output") else: total_time += end - start + + num_actual_runs +=1 + per_var = 1 + config.Arguments.max_exec_time_var/100 + time = per_var * best_execution_time + if total_time > time * num_actual_runs: + #print "Execution time of cur test case is worst than the best so far, stopping at first run" + break + self.status = status config.time_binary += total_time - self.execution_time = total_time/config.Arguments.runs + if num_actual_runs != 0: + self.execution_time = total_time/num_actual_runs + else: + self.execution_time = total_time + + self.deleteFile(self.file_name()+'.exe') + self.deleteFile(self.file_name()+'_host.c') + self.deleteFile(self.file_name()+'_host_kernel.cl') + self.deleteFile(self.file_name()+'_host.cu') + self.deleteFile(self.file_name()+'_kernel.cu') + self.deleteFile(self.file_name()+'_kernel.hu') + self.deleteFile(self.file_name()+'_host_kernel.hu') + self.deleteFile(self.file_name()+'_host_kernel.h') + self.deleteFile(self.file_name()) + + + def run_with_timeout(self, timeout=2): + print "executing task " + str(self.ID) + timeout = config.Arguments.timeout_ppcg + try: + thread = threading.Thread(target=self.binary) + thread.start() + thread.join(timeout) + if thread.is_alive(): + print("Timeout: terminating the procs") + self.proc.terminate() + thread.join() + self.status = enums.Status.timeout + except: + print("Exception running"+str(self.ID)) + self.status = enums.Status.timeout + + return + def __str__(self): - return "ID %d: fitness %f" % (self.ID, self.fitness) - \ No newline at end of file + return "ID %4d: execution time = %3f, ppcg = %s, status = %s" % (self.ID, self.execution_time, self.ppcg_cmd_line_flags, self.status) + diff --git a/main.py b/main.py index dddba88..5e04a9e 100755 --- a/main.py +++ b/main.py @@ -9,6 +9,7 @@ import sys def print_summary(search): + return try: if config.Arguments.results_file is not None: old_stdout = sys.stdout @@ -16,6 +17,7 @@ def print_summary(search): sys.stdout = output_stream config.summarise_timing() search.summarise() + search.logall() finally: if config.Arguments.results_file is not None: output_stream.close() @@ -26,6 +28,8 @@ def autotune(): search = heuristic_search.GA() elif config.Arguments.autotune_subcommand == enums.SearchStrategy.random: search = heuristic_search.Random() + elif config.Arguments.autotune_subcommand == enums.SearchStrategy.exhaustive: + search = heuristic_search.Exhaustive() elif config.Arguments.autotune_subcommand == enums.SearchStrategy.simulated_annealing: search = heuristic_search.SimulatedAnnealing() else: @@ -127,9 +131,22 @@ def string_csv(string): building_and_running_group.add_argument("--run-cmd", metavar="", help="how to run the generated binary from the auto-tuner", - required=True) + required=False) + - runs = 5 + building_and_running_group.add_argument("--run-cmd-input", + metavar="", + help="input to the generated binary from the auto-tuner", + required=False, + default="") + + + building_and_running_group.add_argument("--cmd-string-complete", + action="store_true", + help="dont modify the cmd string, note the output file nmaes should be part of cmd lines", + default=False) + + runs = 1 building_and_running_group.add_argument("--runs", type=int, metavar="", @@ -141,11 +158,32 @@ def string_csv(string): help="assume that the binary prints its execution time to standard output (rather than measuring the execution time through Python)", default=False) + building_and_running_group.add_argument("--binary-file-name", + metavar="", + help="name of the generated binary from the auto-tuner", + required=False, + default="") + + building_and_running_group.add_argument("--execution-time-regex", + type=str, + help="regular expression format for execution time", + default=r'^(\d*\.\d+|\d+)$') + + + building_and_running_group.add_argument("--prl-profiling", + action="store_true", + help="Using prl profiling, used to extract timing info from prl profiling output", + default=False) + + building_and_running_group.add_argument("--no-concurrent-kernel-tuning", + action="store_false", + help="Do not tune multiple kernels at the same time", + default=True) # PPCG options ppcg_group = parser.add_argument_group("PPCG arguments") ppcg_group.add_argument("--target", - choices=[enums.Targets.cuda, enums.Targets.opencl], + choices=[enums.Targets.cuda, enums.Targets.opencl, enums.Targets.prl], help="the target to generate code for", default=enums.Targets.opencl) @@ -165,6 +203,13 @@ def string_csv(string): metavar="", help="consider only these values when tuning the shared memory size (default: %s)" % (shared_memory_possibilties), default=shared_memory_possibilties) + + kernels_list = [compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL] + ppcg_group.add_argument("--kernels-to-tune", + type=int_csv, + metavar="", + help="consider only these kernels values when tuning (default: all)", + default=kernels_list) tile_size_range = (2**0, 2**6) ppcg_group.add_argument("--tile-size-range", @@ -234,6 +279,16 @@ def string_csv(string): help="do not tune kernel sizes individually, i.e. use a uniform tile size for all kernels and let PPCG decide on suitable block and grid sizes", default=False) + ppcg_group.add_argument("--no-shared-memory", + action="store_false", + help="do not consider shared memory while autotuning", + default=True) + + ppcg_group.add_argument("--no-private-memory", + action="store_false", + help="do not consider private memory while autotuning", + default=True) + ppcg_group.add_argument("--all-isl-options", action=ISLAction, metavar="", @@ -330,10 +385,74 @@ def string_csv(string): default=randoms, help="the number of random tests to generate (default: %d)" % randoms) - parser.parse_args(namespace=config.Arguments) + + parser_exhaustive = search_subparsers.add_parser(enums.SearchStrategy.exhaustive) + + parser_exhaustive.add_argument("--params-from-file", + action="store_true", + help="read the paramters from the explore-params py", + default=False) + + parser_exhaustive.add_argument("--only-powers-of-two", + action="store_true", + help="Search for parameter values that are powers of two", + default=False) + + parser_exhaustive.add_argument("--filter-testcases", + action="store_true", + help="few heursitics to reduce search space such as tile size multiple of block size, tile size > block size etc..", + default=True) + + parser_exhaustive.add_argument("--parallelize-compilation", + action="store_true", + help="parallelize ppcg compilation and execution of test case", + default=False) + + + num_compile_threads = 1 + parser_exhaustive.add_argument("--num-compile-threads", + type=int, + metavar="", + default=num_compile_threads, + help="number of threads to use for ppcg compilation (default: %d)" % num_compile_threads) + + + max_work_group_size = 1024 + parser_exhaustive.add_argument("--max-work-group-size", + type=int, + metavar="", + default=max_work_group_size, + help="max work group size, test cases with work group size greater than this value will be filtered out (default: %d)" % max_work_group_size) + + + + min_work_group_size = 1 + parser_exhaustive.add_argument("--min-work-group-size", + type=int, + metavar="", + default=min_work_group_size, + help="min work group size, test cases with work group size lesser than this value will be filtered out (default: %d)" % min_work_group_size) + + timeout = 500 + parser_exhaustive.add_argument("--timeout-ppcg", + type=int, + metavar="", + default=timeout, + help="timeout for ppcg compilation and testcase execution (default: %d sec)" % num_compile_threads) + + + max_exec_time_var = 20 + parser_exhaustive.add_argument("--max-exec-time-var", + type=int, + metavar="", + default=max_exec_time_var, + help="max allowed variance for execution time. If the execution time of a test case is greater that best so far + max-exec-time-var then number of runs is restricted to 1 (default: %d )" % max_exec_time_var) + + parser.parse_args(namespace=config.Arguments) + if __name__ == "__main__": the_command_line() setup_PPCG_flags() autotune() - \ No newline at end of file +