From 4ac792875e369be5453184b92bf103c373569f11 Mon Sep 17 00:00:00 2001 From: chandan Date: Wed, 11 Mar 2015 19:16:29 +0100 Subject: [PATCH 01/34] adding exahaustive serach option --- compiler_flags.py | 14 ++- enums.py | 5 +- heuristic_search.py | 221 ++++++++++++++++++++++++++++++++++++++++++-- individual.py | 166 ++++++++++++++++++++++++++++----- main.py | 67 +++++++++++++- 5 files changed, 433 insertions(+), 40 deletions(-) diff --git a/compiler_flags.py b/compiler_flags.py index 26c88ca..309e519 100644 --- a/compiler_flags.py +++ b/compiler_flags.py @@ -47,7 +47,7 @@ def get_command_line_string(self, value): else: return "" else: - return "%s %s" % (self.name, value.__str__( )) + return "%s=%s" % (self.name, value.__str__( )) class Size: """Models a tile, block or grid size""" @@ -202,7 +202,15 @@ def random_value(self): self.block_size.random_value(), self.grid_size.random_value()) return per_kernel_size_info + + def init_value(self, tile_size, block_size, grid_size): + per_kernel_size_info = collections.OrderedDict() + per_kernel_size_info[SizesFlag.ALL_KERNELS_SENTINEL] = SizeTuple(TileSize(tile_size), + BlockSize(block_size), + GridSize(grid_size)) + return per_kernel_size_info + def permute(self, value): per_kernel_size_info = collections.OrderedDict() for kernel_number, size_tuple in value.iteritems(): @@ -301,8 +309,8 @@ class PPCG: flag_map[no_isl_schedule_separate_components] = EnumerationFlag(no_isl_schedule_separate_components) flag_map[no_wrap] = EnumerationFlag(no_wrap) flag_map[no_scale_tile_loops] = EnumerationFlag(no_scale_tile_loops) - flag_map[no_shared_memory] = EnumerationFlag(no_shared_memory) - flag_map[no_private_memory] = EnumerationFlag(no_private_memory) + flag_map[no_shared_memory] = EnumerationFlag(no_shared_memory, [True, False]) + flag_map[no_private_memory] = EnumerationFlag(no_private_memory, [True, False]) flag_map[no_live_range_reordering] = EnumerationFlag(no_live_range_reordering) optimisation_flags = [] diff --git a/enums.py b/enums.py index c67f0e0..0af2b4c 100644 --- a/enums.py +++ b/enums.py @@ -18,8 +18,11 @@ class SearchStrategy: ga = "ga" random = "random" simulated_annealing = "simulated-annealing" + exhaustive = "exhaustive" class Status: passed = "passed" failed = "failed" - \ No newline at end of file + timeout = "timedout" + ppcgtimeout = "ppcg_timeout" + diff --git a/heuristic_search.py b/heuristic_search.py index caece42..fa0e4ba 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -9,6 +9,10 @@ import individual import collections import internal_exceptions +import itertools +import os +from Queue import Queue +from threading import Thread class SearchStrategy: """Abstract class for a search strategy""" @@ -20,9 +24,16 @@ def run(self): @abc.abstractmethod def summarise(self): pass + + @abc.abstractmethod + def logall(self): + pass class GA(SearchStrategy): """Search using a genetic algorithm""" + + def logall(self): + return def set_child_flags(self, child, the_flags, the_flag_values): for idx, flag in enumerate(the_flags): @@ -291,22 +302,212 @@ def summarise(self): except internal_exceptions.NoFittestException: pass + def logall(self): + for i in self.individuals: + debug.summary_message(i.ppcg_cmd_line_flags, False) + return + +compile_queue = Queue(10) +run_queue = Queue(10) + +class CompileThread(Thread): + def run(self): + global compile_queue + global run_queue + while True: + testcase = compile_queue.get() + if testcase.get_ID() == -1: + run_queue.put(testcase) + break + + testcase.ppcg() + testcase.build() + run_queue.put(testcase) + + +class RunThread(Thread): + def run(self): + global run_queue + best_time = float("inf") + f = open(config.Arguments.results_file + ".log", 'a') + f_iter = open('.lastiter', 'w') + while True: + testcase = run_queue.get() + if testcase.get_ID() == -1: + try: + os.remove('.lastiter') + except: + pass + break + testcase.run_with_timeout() + f_iter.seek(0) + f_iter.write(str(testcase.get_ID())) + + if testcase.execution_time < best_time and testcase.execution_time != 0 and testcase.status == enums.Status.passed: + best_time = testcase.execution_time + f.write("\n Best iter so far = \n") + f.write(str(testcase)) + f.flush() + +class Exhaustive(SearchStrategy): + """Exhaustive search all the values in the specified range or """ + """all combinations provided in explore-params.py file""" + + def readParamValues(self): + f = open('explore-params.py', 'r') + paramValues = eval(f.read()) + f.close() + return paramValues + + def countConfigs(self, paramValues): + n = 1 + for i in paramValues: + n *= len(i) + return n + + def createExhaConfigs(self): + tile_size_lb = config.Arguments.tile_size_range[0] + tile_size_ub = config.Arguments.tile_size_range[1] + if config.Arguments.only_powers_of_two: + tile_size_range = [2**i for i in range(tile_size_lb, tile_size_ub)] + else: + tile_size_range = range(tile_size_lb, tile_size_ub) + + tile_sizes = itertools.product(tile_size_range, repeat=config.Arguments.tile_dimensions) + + block_size_lb = config.Arguments.block_size_range[0] + block_size_ub = config.Arguments.block_size_range[1] + if config.Arguments.only_powers_of_two: + block_size_range = [2**i for i in range(block_size_lb, block_size_ub)] + else: + block_size_range = range(block_size_lb, block_size_ub) + + block_sizes = itertools.product(tile_size_range, repeat=config.Arguments.block_dimensions) + + grid_size_lb = config.Arguments.grid_size_range[0] + grid_size_ub = config.Arguments.grid_size_range[1] + if config.Arguments.only_powers_of_two: + grid_size_range = [2**i for i in range(grid_size_lb, grid_size_ub)] + else: + grid_size_range = range(grid_size_lb, grid_size_ub) + + grid_sizes = itertools.product(grid_size_range, repeat=config.Arguments.grid_dimensions) + + if config.Arguments.no_shared_memory: + shared_mem = [True, False] + else: + shared_mem = [False] + + if config.Arguments.no_private_memory: + private_mem = [True, False] + else: + private_mem = [False] + + if config.Arguments.all_fusion_structures: + fusion = ['max', 'min'] + else: + fusion = ['max'] + + paramValues = [tile_sizes, block_sizes, grid_sizes, private_mem, shared_mem, fusion] + return paramValues + + def pipelineExec(self, combs): + num_threads = config.Arguments.num_compile_threads + for i in range(num_threads): + CompileThread().start() + + RunThread().start() + + cnt = 0 + for conf in combs: + print '---- Configuration ' + str(cnt) + ': ' + str(conf) + cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4]) + cur.set_ID(cnt) + cnt += 1 + compile_queue.put(cur) + + cur.set_ID(-1) + for i in range(num_threads): + compile_queue.put(cur) + + + def run(self): + self.individuals = [] + + if config.Arguments.params_from_file: + paramValues = self.readParamValues() + else: + paramValues = self.createExhaConfigs() + + cnt = 0 + combs = itertools.product(*paramValues) + if config.Arguments.parallelize_compilation: + self.pipelineExec(combs) + return + + f = open(config.Arguments.results_file + ".log", 'a') + best_time = 0 + #print 'Parameter values to be explored: ' + str(paramValues) + #print 'Number of configurations: ' + str(self.countConfigs(paramValues)) + for conf in combs: + print '---- Configuration ' + str(cnt) + ': ' + str(conf) + cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5]) + cur.set_ID(cnt) + cnt += 1 + cur.run(best_time) + if cur.status == enums.Status.ppcgtimeout : + f.write("\nppcg timeout") + f.write(str(best_run)) + f.flush() + continue + + if cur.execution_time == 0: + continue + + if cur.execution_time < best_time and cur.status == enums.Status.passed: + self.individuals.append(cur) + best_time = cur.execution_time + best_run = cur + f.write("\n Best iter so far = "+ str(i) + "\n") + f.write(str(best_run)) + f.flush() + + def summarise(self): + print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30)) + try: + fittest = individual.get_fittest(self.individuals) + debug.summary_message("The fittest individual had execution time %f seconds" % (fittest.execution_time)) + debug.summary_message("To replicate, pass the following to PPCG:") + debug.summary_message(fittest.ppcg_cmd_line_flags, False) + except internal_exceptions.NoFittestException: + pass + + def logall(self): + print("%s Log of all runs %s" %('*' * 30, '*' * 30)) + for i in self.individuals: + print(i) + debug.summary_message(i.ppcg_cmd_line_flags, False) + pass + class SimulatedAnnealing(SearchStrategy): - """Search using simulated annealing""" - - def acceptance_probability(self, currentEnergy, newEnergy, temperature): + """Search using simulated annealing""" + + def acceptance_probability(self, currentEnergy, newEnergy, temperature): if newEnergy < currentEnergy: return 1.0 - return math.exp((currentEnergy - newEnergy) / temperature) - - def mutate_backend_flags(self, clone_flags, solution_flags): + return math.exp((currentEnergy - newEnergy) / temperature) + + def logall(self): + return + + def mutate_backend_flags(self, clone_flags, solution_flags): for the_flag in solution_flags.keys(): if bool(random.getrandbits(1)): idx = the_flag.possible_values.index(solution_flags[the_flag]) newIdx = (idx + 1) % len(the_flag.possible_values) clone_flags[the_flag] = the_flag.possible_values[newIdx] - def mutate(self, solution): + def mutate(self, solution): clone = copy.deepcopy(solution) clone.ID = individual.Individual.get_ID() for the_flag in solution.ppcg_flags.keys(): @@ -324,7 +525,7 @@ def mutate(self, solution): self.mutate_backend_flags(clone.nvcc_flags, solution.nvcc_flags) return clone - def run(self): + def run(self): debug.verbose_message("Creating initial solution", __name__) current = individual.create_random() current.run() @@ -344,8 +545,8 @@ def run(self): if current.execution_time < self.fittest.execution_time: self.fittest = current - def summarise(self): + def summarise(self): debug.summary_message("The final individual had execution time %f seconds" % (self.fittest.execution_time)) debug.summary_message("To replicate, pass the following to PPCG:") debug.summary_message(self.fittest.ppcg_cmd_line_flags, False) - \ No newline at end of file + diff --git a/individual.py b/individual.py index b5d23e3..6ec05e3 100644 --- a/individual.py +++ b/individual.py @@ -7,7 +7,9 @@ import enums import collections import subprocess +import threading import internal_exceptions +import time def get_fittest(population): fittest = None @@ -22,9 +24,42 @@ def get_fittest(population): raise internal_exceptions.NoFittestException("None of the individuals among this population completed successfully, hence there is no fittest individual") return fittest +def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, fusion='max'): + individual = Individual() + per_kernel_size_info = collections.OrderedDict() + per_kernel_size_info[compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL] = compiler_flags.SizeTuple(tile_size, block_size, grid_size) + + #for flag in compiler_flags.PPCG.optimisation_flags: + # print(flag) + + #TODO: Get a better way of getting size_data_flag + flag = compiler_flags.PPCG.optimisation_flags[4] + individual.ppcg_flags[flag] = per_kernel_size_info + + if not shared_mem: + flag = compiler_flags.PPCG.optimisation_flags[0] + #individual.ppcg_flags[flag] = compiler_flags.EnumerationFlag(flag) + individual.ppcg_flags[flag] = True + + + if not private_mem: + flag = compiler_flags.PPCG.optimisation_flags[7] + #individual.ppcg_flags[flag] = compiler_flags.EnumerationFlag(flag) + individual.ppcg_flags[flag] = True + + #Set isl fusion flag + flag = compiler_flags.PPCG.optimisation_flags[6] + individual.ppcg_flags[flag] = fusion + #string = individual.ppcg_flags[flag].get_command_line_string(1024) + #print(string) + #print("end") + return individual + + def create_random(): individual = Individual() for flag in compiler_flags.PPCG.optimisation_flags: + print(flag) individual.ppcg_flags[flag] = flag.random_value() for flag in compiler_flags.CC.optimisation_flags: individual.cc_flags[flag] = flag.random_value() @@ -39,17 +74,29 @@ class Individual: ID = 0 @staticmethod - def get_ID(): + def get_ID_init(): Individual.ID += 1 return Individual.ID + def file_name(self): + return 'testcase'+str(self.ID) + #return 'gemm' + + def set_ID(self, num): + self.ID = num + + def get_ID(self): + return self.ID + def __init__(self): - self.ID = Individual.get_ID() + self.ID = Individual.get_ID_init() self.ppcg_flags = collections.OrderedDict() self.cc_flags = collections.OrderedDict() self.cxx_flags = collections.OrderedDict() self.nvcc_flags = collections.OrderedDict() self.status = enums.Status.failed + self.execution_time = float("inf") + self.num = 0 def all_flags(self): return self.ppcg_flags.keys() + self.cc_flags.keys() + self.cxx_flags.keys() + self.nvcc_flags.keys() @@ -57,12 +104,15 @@ def all_flags(self): def all_flag_values(self): return self.ppcg_flags.values() + self.cc_flags.values() + self.cxx_flags.values() + self.nvcc_flags.values() - def run(self): + def run(self, timeout): try: - self.compile() + self.compile(timeout) if self.status == enums.Status.passed: # Fitness is inversely proportional to execution time - self.fitness = 1/self.execution_time + if self.execution_time == 0: + self.fitness = float("inf") + else: + self.fitness = 1/self.execution_time debug.verbose_message("Individual %d: execution time = %f, fitness = %f" \ % (self.ID, self.execution_time, self.fitness), __name__) else: @@ -70,49 +120,91 @@ def run(self): except internal_exceptions.FailedCompilationException as e: debug.exit_message(e) - def compile(self): - self.ppcg() + + def checkforpause(self): + while(1): + if os.path.isfile('.pause'): + print("Auto tuning paused, remove .pause to restart") + time.sleep(20) + else: + #print("Auto tuning restarted") + break + + def compile(self, timeout=2): + self.checkforpause() + sucess=self.ppcg_with_timeout() + if not sucess: + return self.build() - self.binary() + self.run_with_timeout(timeout) def ppcg(self): self.ppcg_cmd_line_flags = "--target=%s --dump-sizes %s" % (config.Arguments.target, ' '.join(flag.get_command_line_string(self.ppcg_flags[flag]) for flag in self.ppcg_flags.keys())) os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags - debug.verbose_message("Running '%s'" % config.Arguments.ppcg_cmd, __name__) + cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name() + debug.verbose_message("Running '%s'" % cmd, __name__) + #debug.verbose_message("Running '%s'" % self.ppcg_cmd_line_flags , __name__) start = timeit.default_timer() - proc = subprocess.Popen(config.Arguments.ppcg_cmd, shell=True, stderr=subprocess.PIPE) - stderr = proc.communicate()[1] + self.ppcg_proc = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE) + stderr = self.ppcg_proc.communicate()[1] end = timeit.default_timer() config.time_PPCG += end - start - if proc.returncode: + if self.ppcg_proc.returncode: raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.ppcg_cmd) # Store the sizes used by PPCG self.size_data = compiler_flags.SizesFlag.parse_PPCG_dump_sizes(stderr) + + def ppcg_with_timeout(self, timeout=200): + thread = threading.Thread(target=self.ppcg) + thread.start() + thread.join(timeout) + if thread.is_alive(): + print("Timeout: terminating the ppcg ") + self.ppcg_proc.terminate() + thread.join(timeout) + self.status = enums.Status.ppcgtimeout + return False + return True + def build(self): - debug.verbose_message("Running '%s'" % config.Arguments.build_cmd, __name__) + build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe' + debug.verbose_message("Running '%s'" % build_cmd, __name__) start = timeit.default_timer() - proc = subprocess.Popen(config.Arguments.build_cmd, shell=True) + proc = subprocess.Popen(build_cmd, shell=True) stderr = proc.communicate()[1] end = timeit.default_timer() config.time_backend += end - start if proc.returncode: raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.build_cmd) + + + def deleteFile(self, fileName): + try: + if os.path.exists(fileName): + os.remove(fileName) + except: + pass + def binary(self): - time_regex = re.compile(r'^(\d*\.\d+|\d+)$') + #time_regex = re.compile(r'^(\d*\.\d+|\d+)$') + print config.Arguments.execution_time_regex + time_regex = re.compile(config.Arguments.execution_time_regex) total_time = 0.0 status = enums.Status.passed for run in xrange(1,config.Arguments.runs+1): - debug.verbose_message("Run #%d of '%s'" % (run, config.Arguments.run_cmd), __name__) + run_cmd = './'+self.file_name()+'.exe' + #run_cmd = config.Arguments.run_cmd + debug.verbose_message("Run #%d of '%s'" % (run, run_cmd), __name__) start = timeit.default_timer() - proc = subprocess.Popen(config.Arguments.run_cmd, shell=True, stdout=subprocess.PIPE) - stdout, stderr = proc.communicate() + self.proc = subprocess.Popen(run_cmd, shell=True, stdout=subprocess.PIPE) + stdout, stderr = self.proc.communicate() end = timeit.default_timer() - if proc.returncode: - status = enums.Status.failed + if self.proc.returncode: + sper_kernel_size_infotatus = enums.Status.failed debug.warning_message("FAILED: '%s'" % config.Arguments.run_cmd) continue if config.Arguments.execution_time_from_binary: @@ -131,7 +223,37 @@ def binary(self): self.status = status config.time_binary += total_time self.execution_time = total_time/config.Arguments.runs + + self.deleteFile(self.file_name()+'.exe') + self.deleteFile(self.file_name()+'_host.c') + self.deleteFile(self.file_name()+'_host_kernel.cl') + self.deleteFile(self.file_name()+'_host.cu') + self.deleteFile(self.file_name()+'_kernel.cu') + self.deleteFile(self.file_name()+'_kernel.hu') + self.deleteFile(self.file_name()+'_host_kernel.hu') + self.deleteFile(self.file_name()+'_host_kernel.h') + self.deleteFile(self.file_name()) + + + def run_with_timeout(self, timeout=2): + print "executing task " + str(self.ID) + timeout = config.Arguments.timeout_ppcg + try: + thread = threading.Thread(target=self.binary) + thread.start() + thread.join(timeout) + if thread.is_alive(): + print("Timeout: terminating the procs") + self.proc.terminate() + thread.join() + self.status = enums.Status.timeout + except: + print("Exception running"+str(self.ID)) + self.status = enums.Status.timeout + + return + def __str__(self): - return "ID %d: fitness %f" % (self.ID, self.fitness) - \ No newline at end of file + return "ID %4d: execution time = %3f, ppcg = %s, status = %s" % (self.ID, self.execution_time, self.ppcg_cmd_line_flags, self.status) + diff --git a/main.py b/main.py index dddba88..8c1743d 100755 --- a/main.py +++ b/main.py @@ -16,6 +16,7 @@ def print_summary(search): sys.stdout = output_stream config.summarise_timing() search.summarise() + search.logall() finally: if config.Arguments.results_file is not None: output_stream.close() @@ -26,6 +27,8 @@ def autotune(): search = heuristic_search.GA() elif config.Arguments.autotune_subcommand == enums.SearchStrategy.random: search = heuristic_search.Random() + elif config.Arguments.autotune_subcommand == enums.SearchStrategy.exhaustive: + search = heuristic_search.Exhaustive() elif config.Arguments.autotune_subcommand == enums.SearchStrategy.simulated_annealing: search = heuristic_search.SimulatedAnnealing() else: @@ -129,7 +132,7 @@ def string_csv(string): help="how to run the generated binary from the auto-tuner", required=True) - runs = 5 + runs = 1 building_and_running_group.add_argument("--runs", type=int, metavar="", @@ -141,6 +144,12 @@ def string_csv(string): help="assume that the binary prints its execution time to standard output (rather than measuring the execution time through Python)", default=False) + + building_and_running_group.add_argument("--execution-time-regex", + type=str, + help="regular expression format for execution time", + default=r'^(\d*\.\d+|\d+)$') + # PPCG options ppcg_group = parser.add_argument_group("PPCG arguments") @@ -173,7 +182,7 @@ def string_csv(string): help="consider only values in this range when tuning the tile size (default: %d-%d)" % (tile_size_range[0], tile_size_range[1]), default=tile_size_range) - tile_dimensions = 3 + tile_dimensions = 1 ppcg_group.add_argument("--tile-dimensions", type=int, metavar="", @@ -234,6 +243,16 @@ def string_csv(string): help="do not tune kernel sizes individually, i.e. use a uniform tile size for all kernels and let PPCG decide on suitable block and grid sizes", default=False) + ppcg_group.add_argument("--no-shared-memory", + action="store_false", + help="do not consider shared memory while autotuning", + default=True) + + ppcg_group.add_argument("--no-private-memory", + action="store_false", + help="do not consider private memory while autotuning", + default=True) + ppcg_group.add_argument("--all-isl-options", action=ISLAction, metavar="", @@ -330,10 +349,50 @@ def string_csv(string): default=randoms, help="the number of random tests to generate (default: %d)" % randoms) - parser.parse_args(namespace=config.Arguments) + + parser_exhaustive = search_subparsers.add_parser(enums.SearchStrategy.exhaustive) + + parser_exhaustive.add_argument("--params-from-file", + action="store_true", + help="read the paramters from the explore-params py", + default=False) + + parser_exhaustive.add_argument("--only-powers-of-two", + action="store_true", + help="Search for parameter values that are powers of two", + default=False) + + parser_exhaustive.add_argument("--all-fusion-structures", + action="store_true", + help="explore all fusion structures [max, min] ", + default=False) + parser_exhaustive.add_argument("--parallelize-compilation", + action="store_true", + help="parallelize ppcg compilation and execution of test case", + default=False) + + + num_compile_threads = 1 + parser_exhaustive.add_argument("--num-compile-threads", + type=int, + metavar="", + default=num_compile_threads, + help="number of threads to use for ppcg compilation (default: %d)" % num_compile_threads) + + + + timeout = 500 + parser_exhaustive.add_argument("--timeout-ppcg", + type=int, + metavar="", + default=timeout, + help="timeout for ppcg compilation and testcase execution (default: %d sec)" % num_compile_threads) + + parser.parse_args(namespace=config.Arguments) + if __name__ == "__main__": the_command_line() setup_PPCG_flags() autotune() - \ No newline at end of file + From d9e943f8867140cb1363cd73d7a732604fa043c1 Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 12 Mar 2015 14:35:34 +0100 Subject: [PATCH 02/34] adding template file explore-params.py to specifiy the parameters explored in exhaustive search option --- explore-params.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 explore-params.py diff --git a/explore-params.py b/explore-params.py new file mode 100644 index 0000000..60b7dd7 --- /dev/null +++ b/explore-params.py @@ -0,0 +1,23 @@ +# This file contains the PPCG parameter values that are explored. +# The exploration script considers each combination of the parameter values. + +([ + # Tile sizes + [(16,16), (32,32), (64,64)], + + # Grid sizes + [(16,16), (32,32), (256,256), (1024,1024)], + + # Block sizes + [(1,1), (1,2), (1,4), (1,8), + (16,16), (32,32), (64,64)], + + #private memory + [False], + + #Shared memory + [False, True], + + #Fusion + ['max', 'min'] +]) From 33e1e2683e5511d547208a1f13156794d97aa28e Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 12 Mar 2015 14:38:53 +0100 Subject: [PATCH 03/34] Removed trailing whitespace --- heuristic_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heuristic_search.py b/heuristic_search.py index fa0e4ba..1668fa8 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -490,7 +490,7 @@ def logall(self): pass class SimulatedAnnealing(SearchStrategy): - """Search using simulated annealing""" + """Search using simulated annealing""" def acceptance_probability(self, currentEnergy, newEnergy, temperature): if newEnergy < currentEnergy: From fd13616fbe770acfb152dbbd451cdf4e09370440 Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 12 Mar 2015 14:44:33 +0100 Subject: [PATCH 04/34] changed the default of tile dimensions and number of runs --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 8c1743d..451776a 100755 --- a/main.py +++ b/main.py @@ -132,7 +132,7 @@ def string_csv(string): help="how to run the generated binary from the auto-tuner", required=True) - runs = 1 + runs = 5 building_and_running_group.add_argument("--runs", type=int, metavar="", @@ -182,7 +182,7 @@ def string_csv(string): help="consider only values in this range when tuning the tile size (default: %d-%d)" % (tile_size_range[0], tile_size_range[1]), default=tile_size_range) - tile_dimensions = 1 + tile_dimensions = 3 ppcg_group.add_argument("--tile-dimensions", type=int, metavar="", From 4e64fa0c33e049f0f2d74a5d4700d4996c8c98e1 Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 12 Mar 2015 15:11:23 +0100 Subject: [PATCH 05/34] changed the command line print of enumerated flag --- compiler_flags.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler_flags.py b/compiler_flags.py index 309e519..6434c60 100644 --- a/compiler_flags.py +++ b/compiler_flags.py @@ -47,7 +47,7 @@ def get_command_line_string(self, value): else: return "" else: - return "%s=%s" % (self.name, value.__str__( )) + return "%s %s" % (self.name, value.__str__( )) class Size: """Models a tile, block or grid size""" From 7b38fb66a4d9787097d9d86b4278a162c93064d9 Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 12 Mar 2015 15:13:58 +0100 Subject: [PATCH 06/34] addedd = to the command line print of enumerated flag --- compiler_flags.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler_flags.py b/compiler_flags.py index 6434c60..309e519 100644 --- a/compiler_flags.py +++ b/compiler_flags.py @@ -47,7 +47,7 @@ def get_command_line_string(self, value): else: return "" else: - return "%s %s" % (self.name, value.__str__( )) + return "%s=%s" % (self.name, value.__str__( )) class Size: """Models a tile, block or grid size""" From e224aec869e5aca993209061936945b7c923123a Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 12 Mar 2015 15:31:32 +0100 Subject: [PATCH 07/34] fixed the build command for opencl compilation --- individual.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/individual.py b/individual.py index 6ec05e3..7a61fd4 100644 --- a/individual.py +++ b/individual.py @@ -170,7 +170,10 @@ def ppcg_with_timeout(self, timeout=200): return True def build(self): - build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe' + if config.Arguments.target == enums.Target.cuda: + build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe' + else: + build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' debug.verbose_message("Running '%s'" % build_cmd, __name__) start = timeit.default_timer() proc = subprocess.Popen(build_cmd, shell=True) From 81c67b43cc77495a6f5da4bb87eedb9980aae88d Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 12 Mar 2015 15:47:31 +0100 Subject: [PATCH 08/34] fixed the ppcg command line output file options for opecl --- individual.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/individual.py b/individual.py index 7a61fd4..bcb73a1 100644 --- a/individual.py +++ b/individual.py @@ -143,7 +143,12 @@ def ppcg(self): ' '.join(flag.get_command_line_string(self.ppcg_flags[flag]) for flag in self.ppcg_flags.keys())) os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags - cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name() + + if config.Arguments.target == enums.Targets.cuda: + cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name() + else: + cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()+'_host.c' + debug.verbose_message("Running '%s'" % cmd, __name__) #debug.verbose_message("Running '%s'" % self.ppcg_cmd_line_flags , __name__) start = timeit.default_timer() @@ -170,7 +175,7 @@ def ppcg_with_timeout(self, timeout=200): return True def build(self): - if config.Arguments.target == enums.Target.cuda: + if config.Arguments.target == enums.Targets.cuda: build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe' else: build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' From 9fd7121ff9a8bd7410513fdeba4b1c6e758fb42f Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Thu, 12 Mar 2015 15:53:08 +0100 Subject: [PATCH 09/34] Fail if regex isn't found in program output --- individual.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/individual.py b/individual.py index bcb73a1..fb3503c 100644 --- a/individual.py +++ b/individual.py @@ -218,14 +218,18 @@ def binary(self): if config.Arguments.execution_time_from_binary: if not stdout: raise internal_exceptions.BinaryRunException("Expected the binary to dump its execution time. Found nothing") + nmatchedlines = 0 for line in stdout.split(os.linesep): line = line.strip() matches = time_regex.findall(line) if matches: + nmatchedlines += 1 try: total_time += float(matches[0]) except: raise internal_exceptions.BinaryRunException("Execution time '%s' is not in the required format" % matches[0]) + if nmatchedlines == 0: + raise internal_exceptions.BinaryRunException("Regular expression did not match anything on the program's output") else: total_time += end - start self.status = status From 4eb1cf71d7d5ec3a7f97a78f2d72945174ed9e2e Mon Sep 17 00:00:00 2001 From: chandan Date: Fri, 13 Mar 2015 13:53:16 +0100 Subject: [PATCH 10/34] removed the required flag for run cmd, as it is no longer required to be sepcified --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 451776a..0750c2c 100755 --- a/main.py +++ b/main.py @@ -130,7 +130,7 @@ def string_csv(string): building_and_running_group.add_argument("--run-cmd", metavar="", help="how to run the generated binary from the auto-tuner", - required=True) + required=False) runs = 5 building_and_running_group.add_argument("--runs", From c5b79227eb6aeed681c6ec0cc1d41a067162f44a Mon Sep 17 00:00:00 2001 From: chandan Date: Fri, 13 Mar 2015 18:06:22 +0100 Subject: [PATCH 11/34] added support for pencil runtime libraries while building opencl target --- individual.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/individual.py b/individual.py index bcb73a1..e22e773 100644 --- a/individual.py +++ b/individual.py @@ -178,7 +178,7 @@ def build(self): if config.Arguments.target == enums.Targets.cuda: build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe' else: - build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' + build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' + ' -lprl -lOpenCL' debug.verbose_message("Running '%s'" % build_cmd, __name__) start = timeit.default_timer() proc = subprocess.Popen(build_cmd, shell=True) @@ -199,7 +199,7 @@ def deleteFile(self, fileName): def binary(self): #time_regex = re.compile(r'^(\d*\.\d+|\d+)$') - print config.Arguments.execution_time_regex + #print config.Arguments.execution_time_regex time_regex = re.compile(config.Arguments.execution_time_regex) total_time = 0.0 status = enums.Status.passed From d73406117b0cb5546c314950a0e43201f362037b Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Mon, 16 Mar 2015 16:50:26 +0100 Subject: [PATCH 12/34] Fix RunThread termination condition --- heuristic_search.py | 34 +++++++++++++++++++++++----------- individual.py | 6 ++++++ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/heuristic_search.py b/heuristic_search.py index 1668fa8..d2c88ee 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -316,7 +316,7 @@ def run(self): global run_queue while True: testcase = compile_queue.get() - if testcase.get_ID() == -1: + if isinstance(testcase, individual.EndOfQueue): run_queue.put(testcase) break @@ -326,19 +326,30 @@ def run(self): class RunThread(Thread): + def __init__(self, num_threads): + super(RunThread, self).__init__() + self.num_threads = num_threads + def run(self): global run_queue best_time = float("inf") f = open(config.Arguments.results_file + ".log", 'a') f_iter = open('.lastiter', 'w') while True: + print('***run thread waiting') testcase = run_queue.get() - if testcase.get_ID() == -1: - try: - os.remove('.lastiter') - except: - pass - break + if isinstance(testcase, individual.EndOfQueue): + self.num_threads = self.num_threads - 1 + print('***remaining threads: ' + str(self.num_threads)) + if self.num_threads<=0: + try: + os.remove('.lastiter') + except: + pass + print('***run thread exiting') + break + continue + print('***run thread got job') testcase.run_with_timeout() f_iter.seek(0) f_iter.write(str(testcase.get_ID())) @@ -414,9 +425,11 @@ def createExhaConfigs(self): def pipelineExec(self, combs): num_threads = config.Arguments.num_compile_threads for i in range(num_threads): - CompileThread().start() + t = CompileThread() + t.daemon = True + t.start() - RunThread().start() + RunThread(num_threads).start() cnt = 0 for conf in combs: @@ -426,9 +439,8 @@ def pipelineExec(self, combs): cnt += 1 compile_queue.put(cur) - cur.set_ID(-1) for i in range(num_threads): - compile_queue.put(cur) + compile_queue.put(individual.EndOfQueue()) # So every CompileThread fetches one EndOfQueue element def run(self): diff --git a/individual.py b/individual.py index fb3503c..70d1d96 100644 --- a/individual.py +++ b/individual.py @@ -11,6 +11,12 @@ import internal_exceptions import time +class EndOfQueue: + def __init__(self): + pass + + + def get_fittest(population): fittest = None for individual in population: From 5f38aef736cdc3e1d652824c1458fd071814e58a Mon Sep 17 00:00:00 2001 From: chandan Date: Tue, 17 Mar 2015 14:00:07 +0100 Subject: [PATCH 13/34] added few hesuritics to prune search space --- heuristic_search.py | 30 ++++++++++++++++++++++++++++-- main.py | 6 ++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/heuristic_search.py b/heuristic_search.py index d2c88ee..0683637 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -336,7 +336,7 @@ def run(self): f = open(config.Arguments.results_file + ".log", 'a') f_iter = open('.lastiter', 'w') while True: - print('***run thread waiting') + #print('***run thread waiting') testcase = run_queue.get() if isinstance(testcase, individual.EndOfQueue): self.num_threads = self.num_threads - 1 @@ -349,7 +349,7 @@ def run(self): print('***run thread exiting') break continue - print('***run thread got job') + #print('***run thread got job') testcase.run_with_timeout() f_iter.seek(0) f_iter.write(str(testcase.get_ID())) @@ -442,6 +442,22 @@ def pipelineExec(self, combs): for i in range(num_threads): compile_queue.put(individual.EndOfQueue()) # So every CompileThread fetches one EndOfQueue element + def tile_size_multiple_filter(self, conf): + tile_size = conf[0] + block_size = conf[1] + + mul_factor = 1 + for t, b in zip(tile_size, block_size): + if t < b: + return False + if t % b != 0: + return False + mul_factor *= t/b + + if mul_factor > 36: + return False + + return True def run(self): self.individuals = [] @@ -453,6 +469,16 @@ def run(self): cnt = 0 combs = itertools.product(*paramValues) + + + if config.Arguments.filter_testcases: + #Filter out only test cases based on heusristics such as tile size is multiple of block size etc.. + combs = filter(self.tile_size_multiple_filter, combs) + #Filter out only test cases where shared memory is true + combs = filter(lambda conf: conf[3] == True, combs) + #Filter out only test cases where private memory is true + combs = filter(lambda conf: conf[4] == True, combs) + if config.Arguments.parallelize_compilation: self.pipelineExec(combs) return diff --git a/main.py b/main.py index 0750c2c..8920d58 100755 --- a/main.py +++ b/main.py @@ -367,6 +367,12 @@ def string_csv(string): help="explore all fusion structures [max, min] ", default=False) + + parser_exhaustive.add_argument("--filter-testcases", + action="store_true", + help="few heursitics to reduce search space such as tile size multiple of block size, tile size > block size etc..", + default=False) + parser_exhaustive.add_argument("--parallelize-compilation", action="store_true", help="parallelize ppcg compilation and execution of test case", From cd58e363578e7140c59c6d0d56d173f0f11bd3c9 Mon Sep 17 00:00:00 2001 From: chandan Date: Tue, 17 Mar 2015 17:31:09 +0100 Subject: [PATCH 14/34] added support for last iter --- heuristic_search.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/heuristic_search.py b/heuristic_search.py index 0683637..bc5cdf8 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -422,7 +422,23 @@ def createExhaConfigs(self): paramValues = [tile_sizes, block_sizes, grid_sizes, private_mem, shared_mem, fusion] return paramValues + def get_last_iter(self): + if os.path.isfile(".lastiter"): + print("found last iter") + try: + f_iter = open(".lastiter", 'r+') + start_iter = int(f_iter.readline()) + except: + start_iter = 0 + print("starting from test case = ", start_iter) + else: + start_iter = 0 + + return start_iter + def pipelineExec(self, combs): + + start_iter = self.get_last_iter() num_threads = config.Arguments.num_compile_threads for i in range(num_threads): t = CompileThread() @@ -433,6 +449,9 @@ def pipelineExec(self, combs): cnt = 0 for conf in combs: + if cnt < start_iter: + cnt += 1 + continue print '---- Configuration ' + str(cnt) + ': ' + str(conf) cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4]) cur.set_ID(cnt) @@ -484,10 +503,15 @@ def run(self): return f = open(config.Arguments.results_file + ".log", 'a') + start_iter = self.get_last_iter() + best_time = 0 #print 'Parameter values to be explored: ' + str(paramValues) #print 'Number of configurations: ' + str(self.countConfigs(paramValues)) for conf in combs: + if cnt < start_iter: + cnt += 1 + continue print '---- Configuration ' + str(cnt) + ': ' + str(conf) cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5]) cur.set_ID(cnt) From 7ee2e676df70506e2e11566fe04388fe27a6a82b Mon Sep 17 00:00:00 2001 From: chandan Date: Tue, 17 Mar 2015 18:05:46 +0100 Subject: [PATCH 15/34] fixed the execution time logging bug --- heuristic_search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/heuristic_search.py b/heuristic_search.py index bc5cdf8..dfe0fc7 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -430,6 +430,7 @@ def get_last_iter(self): start_iter = int(f_iter.readline()) except: start_iter = 0 + pass print("starting from test case = ", start_iter) else: start_iter = 0 @@ -503,9 +504,9 @@ def run(self): return f = open(config.Arguments.results_file + ".log", 'a') - start_iter = self.get_last_iter() + start_iter = 0 - best_time = 0 + best_time = float("inf") #print 'Parameter values to be explored: ' + str(paramValues) #print 'Number of configurations: ' + str(self.countConfigs(paramValues)) for conf in combs: From 9df1195d28f9c9d73819938b978bd4caa35556f2 Mon Sep 17 00:00:00 2001 From: chandan Date: Tue, 17 Mar 2015 18:26:48 +0100 Subject: [PATCH 16/34] fixed the undefined variable bug --- heuristic_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heuristic_search.py b/heuristic_search.py index dfe0fc7..ae28dd5 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -531,7 +531,7 @@ def run(self): self.individuals.append(cur) best_time = cur.execution_time best_run = cur - f.write("\n Best iter so far = "+ str(i) + "\n") + f.write("\n Best iter so far = "+ str(cnt) + "\n") f.write(str(best_run)) f.flush() From 3fbf390c90657f90abd2b937263da7519fff06fa Mon Sep 17 00:00:00 2001 From: chandan Date: Wed, 18 Mar 2015 14:50:35 +0100 Subject: [PATCH 17/34] added filter to remove testcases which have inavlid work group size --- heuristic_search.py | 4 ++++ main.py | 9 ++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/heuristic_search.py b/heuristic_search.py index ae28dd5..5da17ee 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -466,6 +466,10 @@ def tile_size_multiple_filter(self, conf): tile_size = conf[0] block_size = conf[1] + work_group_size = reduce(lambda x,y: x*y, block_size) + if work_group_size > config.Arguments.max_work_group_size: + return False + mul_factor = 1 for t, b in zip(tile_size, block_size): if t < b: diff --git a/main.py b/main.py index 8920d58..20a5111 100755 --- a/main.py +++ b/main.py @@ -386,7 +386,14 @@ def string_csv(string): default=num_compile_threads, help="number of threads to use for ppcg compilation (default: %d)" % num_compile_threads) - + + max_work_group_size = 256 + parser_exhaustive.add_argument("--max-work-group-size", + type=int, + metavar="", + default=max_work_group_size, + help="max work group size, test cases with work group size greater than this value will be filtered out (default: %d)" % num_compile_threads) + timeout = 500 parser_exhaustive.add_argument("--timeout-ppcg", From 58ecfaccae2f53041295648fff25083979a2f0f3 Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 19 Mar 2015 15:20:29 +0100 Subject: [PATCH 18/34] added summerasize function for Parallelize execution --- heuristic_search.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/heuristic_search.py b/heuristic_search.py index 5da17ee..2b4f1c3 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -324,11 +324,11 @@ def run(self): testcase.build() run_queue.put(testcase) - class RunThread(Thread): def __init__(self, num_threads): super(RunThread, self).__init__() self.num_threads = num_threads + self.individuals = [] def run(self): global run_queue @@ -344,6 +344,8 @@ def run(self): if self.num_threads<=0: try: os.remove('.lastiter') + self.summarise() + self.logall() except: pass print('***run thread exiting') @@ -355,11 +357,29 @@ def run(self): f_iter.write(str(testcase.get_ID())) if testcase.execution_time < best_time and testcase.execution_time != 0 and testcase.status == enums.Status.passed: + self.individuals.append(testcase) best_time = testcase.execution_time f.write("\n Best iter so far = \n") f.write(str(testcase)) f.flush() + def summarise(self): + print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30)) + try: + fittest = individual.get_fittest(self.individuals) + debug.summary_message("The fittest individual had execution time %f seconds" % (fittest.execution_time)) + debug.summary_message("To replicate, pass the following to PPCG:") + debug.summary_message(fittest.ppcg_cmd_line_flags, False) + except internal_exceptions.NoFittestException: + pass + + def logall(self): + print("%s Log of all runs %s" %('*' * 30, '*' * 30)) + for i in self.individuals: + print(i) + debug.summary_message(i.ppcg_cmd_line_flags, False) + pass + class Exhaustive(SearchStrategy): """Exhaustive search all the values in the specified range or """ """all combinations provided in explore-params.py file""" From 9b6c08217501e905d866def3936e4e19d77864d6 Mon Sep 17 00:00:00 2001 From: chandan Date: Thu, 19 Mar 2015 15:21:35 +0100 Subject: [PATCH 19/34] fixed the block size range bug --- heuristic_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heuristic_search.py b/heuristic_search.py index 2b4f1c3..41aed1d 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -413,7 +413,7 @@ def createExhaConfigs(self): else: block_size_range = range(block_size_lb, block_size_ub) - block_sizes = itertools.product(tile_size_range, repeat=config.Arguments.block_dimensions) + block_sizes = itertools.product(block_size_range, repeat=config.Arguments.block_dimensions) grid_size_lb = config.Arguments.grid_size_range[0] grid_size_ub = config.Arguments.grid_size_range[1] From 46c58663283aa207065047a97aed3c615d3b4e8b Mon Sep 17 00:00:00 2001 From: chandan Date: Fri, 20 Mar 2015 15:06:57 +0100 Subject: [PATCH 20/34] added min work group size fitlering --- heuristic_search.py | 3 +++ main.py | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/heuristic_search.py b/heuristic_search.py index 41aed1d..3649328 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -490,6 +490,9 @@ def tile_size_multiple_filter(self, conf): if work_group_size > config.Arguments.max_work_group_size: return False + if work_group_size < config.Arguments.min_work_group_size: + return False + mul_factor = 1 for t, b in zip(tile_size, block_size): if t < b: diff --git a/main.py b/main.py index 20a5111..2f076e3 100755 --- a/main.py +++ b/main.py @@ -392,9 +392,17 @@ def string_csv(string): type=int, metavar="", default=max_work_group_size, - help="max work group size, test cases with work group size greater than this value will be filtered out (default: %d)" % num_compile_threads) + help="max work group size, test cases with work group size greater than this value will be filtered out (default: %d)" % max_work_group_size) + + min_work_group_size = 1 + parser_exhaustive.add_argument("--min-work-group-size", + type=int, + metavar="", + default=min_work_group_size, + help="min work group size, test cases with work group size lesser than this value will be filtered out (default: %d)" % min_work_group_size) + timeout = 500 parser_exhaustive.add_argument("--timeout-ppcg", type=int, From ccfdfba71c5e66227321510df56bd681212f115b Mon Sep 17 00:00:00 2001 From: chandan Date: Fri, 20 Mar 2015 16:21:43 +0100 Subject: [PATCH 21/34] added execution time check for multiple runs case --- heuristic_search.py | 2 +- individual.py | 29 ++++++++++++++++++++--------- main.py | 8 ++++++++ 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/heuristic_search.py b/heuristic_search.py index 3649328..1e72045 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -352,7 +352,7 @@ def run(self): break continue #print('***run thread got job') - testcase.run_with_timeout() + testcase.run(best_time) f_iter.seek(0) f_iter.write(str(testcase.get_ID())) diff --git a/individual.py b/individual.py index 4f0f47c..ccb4274 100644 --- a/individual.py +++ b/individual.py @@ -1,4 +1,4 @@ -import timeit +umport timeit import os import re import debug @@ -136,13 +136,14 @@ def checkforpause(self): #print("Auto tuning restarted") break - def compile(self, timeout=2): + def compile(self, timeout=float("inf")): self.checkforpause() - sucess=self.ppcg_with_timeout() - if not sucess: - return + self.ppcg() + #sucess=self.ppcg_with_timeout(timeout) + #if not sucess: + # return self.build() - self.run_with_timeout(timeout) + self.binary(timeout) def ppcg(self): self.ppcg_cmd_line_flags = "--target=%s --dump-sizes %s" % (config.Arguments.target, @@ -168,7 +169,7 @@ def ppcg(self): self.size_data = compiler_flags.SizesFlag.parse_PPCG_dump_sizes(stderr) - def ppcg_with_timeout(self, timeout=200): + def ppcg_with_timeout(self, timeout=float("inf")): thread = threading.Thread(target=self.ppcg) thread.start() thread.join(timeout) @@ -203,12 +204,13 @@ def deleteFile(self, fileName): except: pass - def binary(self): + def binary(self, best_execution_time=float("inf")): #time_regex = re.compile(r'^(\d*\.\d+|\d+)$') #print config.Arguments.execution_time_regex time_regex = re.compile(config.Arguments.execution_time_regex) total_time = 0.0 status = enums.Status.passed + num_actual_runs = 0 for run in xrange(1,config.Arguments.runs+1): run_cmd = './'+self.file_name()+'.exe' #run_cmd = config.Arguments.run_cmd @@ -238,9 +240,18 @@ def binary(self): raise internal_exceptions.BinaryRunException("Regular expression did not match anything on the program's output") else: total_time += end - start + + num_actual_runs +=1 + per_var = 1 + config.Arguments.max_exec_time_var/100 + time = per_var * best_execution_time + if total_time > (best_execution_time + per_var * best_execution_time ): + #print "Execution time of cur test case is worst than the best so far, stopping at first run" + break + + self.status = status config.time_binary += total_time - self.execution_time = total_time/config.Arguments.runs + self.execution_time = total_time/num_actual_runs self.deleteFile(self.file_name()+'.exe') self.deleteFile(self.file_name()+'_host.c') diff --git a/main.py b/main.py index 2f076e3..4ab7977 100755 --- a/main.py +++ b/main.py @@ -410,6 +410,14 @@ def string_csv(string): default=timeout, help="timeout for ppcg compilation and testcase execution (default: %d sec)" % num_compile_threads) + + max_exec_time_var = 20 + parser_exhaustive.add_argument("--max-exec-time-var", + type=int, + metavar="", + default=max_exec_time_var, + help="max allowed variance for execution time. If the execution time of a test case is greater that best so far + max-exec-time-var then number of runs is restricted to 1 (default: %d )" % max_exec_time_var) + parser.parse_args(namespace=config.Arguments) if __name__ == "__main__": From 6d210f48567111fe14cd2d1dd4710c4c59e4e2e5 Mon Sep 17 00:00:00 2001 From: chandan Date: Fri, 20 Mar 2015 16:29:34 +0100 Subject: [PATCH 22/34] fixed the bug multiple execution time filtering --- individual.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/individual.py b/individual.py index ccb4274..7ac278f 100644 --- a/individual.py +++ b/individual.py @@ -1,4 +1,4 @@ -umport timeit +import timeit import os import re import debug @@ -244,7 +244,7 @@ def binary(self, best_execution_time=float("inf")): num_actual_runs +=1 per_var = 1 + config.Arguments.max_exec_time_var/100 time = per_var * best_execution_time - if total_time > (best_execution_time + per_var * best_execution_time ): + if total_time > time * num_actual_runs: #print "Execution time of cur test case is worst than the best so far, stopping at first run" break From 944a9b7ba0392deb0e8c31e060697dbb451e918a Mon Sep 17 00:00:00 2001 From: chandan Date: Sun, 22 Mar 2015 00:38:01 +0100 Subject: [PATCH 23/34] fixed the divide by zero error --- individual.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/individual.py b/individual.py index 7ac278f..57feba6 100644 --- a/individual.py +++ b/individual.py @@ -251,7 +251,10 @@ def binary(self, best_execution_time=float("inf")): self.status = status config.time_binary += total_time - self.execution_time = total_time/num_actual_runs + if num_actual_runs != 0: + self.execution_time = total_time/num_actual_runs + else + self.execution_time = total_time self.deleteFile(self.file_name()+'.exe') self.deleteFile(self.file_name()+'_host.c') From e864259c21c6eed5caad3cb8ac282e0cb7b454d4 Mon Sep 17 00:00:00 2001 From: chandan Date: Sun, 22 Mar 2015 13:45:40 +0100 Subject: [PATCH 24/34] added the missing : for else --- individual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/individual.py b/individual.py index 57feba6..ae77e2e 100644 --- a/individual.py +++ b/individual.py @@ -253,7 +253,7 @@ def binary(self, best_execution_time=float("inf")): config.time_binary += total_time if num_actual_runs != 0: self.execution_time = total_time/num_actual_runs - else + else: self.execution_time = total_time self.deleteFile(self.file_name()+'.exe') From 60d3e1263b4484a0035180fdc12c370ed29a0309 Mon Sep 17 00:00:00 2001 From: chandan Date: Mon, 23 Mar 2015 15:43:10 +0100 Subject: [PATCH 25/34] added cmd string modififcation option --- individual.py | 16 +++++++++++++--- main.py | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/individual.py b/individual.py index ae77e2e..9739a56 100644 --- a/individual.py +++ b/individual.py @@ -85,6 +85,9 @@ def get_ID_init(): return Individual.ID def file_name(self): + if config.Arguments.binary_file_name: + return config.Arguments.binary_file_name + return 'testcase'+str(self.ID) #return 'gemm' @@ -151,7 +154,9 @@ def ppcg(self): os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags - if config.Arguments.target == enums.Targets.cuda: + if config.Arguments.cmd_string_complete: + cmd = config.Arguments.ppcg_cmd + elif config.Arguments.target == enums.Targets.cuda: cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name() else: cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()+'_host.c' @@ -182,7 +187,9 @@ def ppcg_with_timeout(self, timeout=float("inf")): return True def build(self): - if config.Arguments.target == enums.Targets.cuda: + if config.Arguments.cmd_string_complete: + build_cmd = config.Arguments.build_cmd + elif config.Arguments.target == enums.Targets.cuda: build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe' else: build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' + ' -lprl -lOpenCL' @@ -212,7 +219,10 @@ def binary(self, best_execution_time=float("inf")): status = enums.Status.passed num_actual_runs = 0 for run in xrange(1,config.Arguments.runs+1): - run_cmd = './'+self.file_name()+'.exe' + if config.Arguments.cmd_string_complete: + run_cmd = config.Arguments.run_cmd + else: + run_cmd = './'+self.file_name()+'.exe '+config.Arguments.run_cmd_input #run_cmd = config.Arguments.run_cmd debug.verbose_message("Run #%d of '%s'" % (run, run_cmd), __name__) start = timeit.default_timer() diff --git a/main.py b/main.py index 4ab7977..a80f5ae 100755 --- a/main.py +++ b/main.py @@ -132,6 +132,19 @@ def string_csv(string): help="how to run the generated binary from the auto-tuner", required=False) + + building_and_running_group.add_argument("--run-cmd-input", + metavar="", + help="input to the generated binary from the auto-tuner", + required=False, + default="") + + + building_and_running_group.add_argument("--cmd-string-complete", + action="store_true", + help="dont modify the cmd string, note the output file nmaes should be part of cmd lines", + default=False) + runs = 5 building_and_running_group.add_argument("--runs", type=int, @@ -144,6 +157,11 @@ def string_csv(string): help="assume that the binary prints its execution time to standard output (rather than measuring the execution time through Python)", default=False) + building_and_running_group.add_argument("--binary-file-name", + metavar="", + help="name of the generated binary from the auto-tuner", + required=False, + default="") building_and_running_group.add_argument("--execution-time-regex", type=str, From 4a5baa8eb98922d16a13b00a2a4fbaa92fadc409 Mon Sep 17 00:00:00 2001 From: Chandan Date: Mon, 23 Mar 2015 21:17:14 +0100 Subject: [PATCH 26/34] fixed the ppcg cmd line bug with cmd string complete option --- individual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/individual.py b/individual.py index 9739a56..38c095c 100644 --- a/individual.py +++ b/individual.py @@ -155,7 +155,7 @@ def ppcg(self): os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags if config.Arguments.cmd_string_complete: - cmd = config.Arguments.ppcg_cmd + cmd = config.Arguments.ppcg_cmd+ ' '+self.ppcg_cmd_line_flags elif config.Arguments.target == enums.Targets.cuda: cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name() else: From 9825823ead6c3a425fbcb9820f0072d6ab500d2c Mon Sep 17 00:00:00 2001 From: chandan Date: Mon, 11 May 2015 15:36:24 +0200 Subject: [PATCH 27/34] fixed the shared_mem private_mem option interchange bug --- explore-params.py | 4 ++-- heuristic_search.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/explore-params.py b/explore-params.py index 60b7dd7..b8c5f93 100644 --- a/explore-params.py +++ b/explore-params.py @@ -12,10 +12,10 @@ [(1,1), (1,2), (1,4), (1,8), (16,16), (32,32), (64,64)], - #private memory + #Shared memory [False], - #Shared memory + #private memory [False, True], #Fusion diff --git a/heuristic_search.py b/heuristic_search.py index 1e72045..194111a 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -439,7 +439,7 @@ def createExhaConfigs(self): else: fusion = ['max'] - paramValues = [tile_sizes, block_sizes, grid_sizes, private_mem, shared_mem, fusion] + paramValues = [tile_sizes, block_sizes, grid_sizes, shared_mem, private_mem, fusion] return paramValues def get_last_iter(self): From 9d2907f59c3faa22b3ccab9c2113701c5d9215e9 Mon Sep 17 00:00:00 2001 From: chandanReddy Date: Thu, 7 Apr 2016 14:55:59 +0200 Subject: [PATCH 28/34] integrated into prl and added multiple kernel tuning support --- enums.py | 1 + heuristic_search.py | 27 ++++++++++++++++++++++++--- individual.py | 18 ++++++++++++++---- main.py | 21 +++++++++++++++++---- 4 files changed, 56 insertions(+), 11 deletions(-) diff --git a/enums.py b/enums.py index 0af2b4c..7c0d7bb 100644 --- a/enums.py +++ b/enums.py @@ -1,6 +1,7 @@ class Targets: cuda = "cuda" opencl = "opencl" + prl = "prl" class Crossover: one_point = "one_point" diff --git a/heuristic_search.py b/heuristic_search.py index 194111a..b206de5 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -13,6 +13,7 @@ import os from Queue import Queue from threading import Thread +import sys class SearchStrategy: """Abstract class for a search strategy""" @@ -508,6 +509,14 @@ def tile_size_multiple_filter(self, conf): def run(self): self.individuals = [] + self.output_stream = open(config.Arguments.results_file, 'w') + for k in config.Arguments.kernels_to_tune: + self.individuals = [] + self.tune_kernel(k) + self.print_summary() + self.output_stream.close() + + def tune_kernel(self, k): if config.Arguments.params_from_file: paramValues = self.readParamValues() @@ -522,9 +531,9 @@ def run(self): #Filter out only test cases based on heusristics such as tile size is multiple of block size etc.. combs = filter(self.tile_size_multiple_filter, combs) #Filter out only test cases where shared memory is true - combs = filter(lambda conf: conf[3] == True, combs) + #combs = filter(lambda conf: conf[3] == True, combs) #Filter out only test cases where private memory is true - combs = filter(lambda conf: conf[4] == True, combs) + #combs = filter(lambda conf: conf[4] == True, combs) if config.Arguments.parallelize_compilation: self.pipelineExec(combs) @@ -541,7 +550,7 @@ def run(self): cnt += 1 continue print '---- Configuration ' + str(cnt) + ': ' + str(conf) - cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5]) + cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5], k) cur.set_ID(cnt) cnt += 1 cur.run(best_time) @@ -579,6 +588,18 @@ def logall(self): debug.summary_message(i.ppcg_cmd_line_flags, False) pass + + def print_summary(self): + old_stdout = sys.stdout + try: + if config.Arguments.results_file is not None: + sys.stdout = self.output_stream + self.summarise() + #self.logall() + finally: + if config.Arguments.results_file is not None: + sys.stdout = old_stdout + class SimulatedAnnealing(SearchStrategy): """Search using simulated annealing""" diff --git a/individual.py b/individual.py index 38c095c..15ca0f4 100644 --- a/individual.py +++ b/individual.py @@ -30,10 +30,11 @@ def get_fittest(population): raise internal_exceptions.NoFittestException("None of the individuals among this population completed successfully, hence there is no fittest individual") return fittest -def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, fusion='max'): +def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, fusion='max', k=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL): individual = Individual() per_kernel_size_info = collections.OrderedDict() - per_kernel_size_info[compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL] = compiler_flags.SizeTuple(tile_size, block_size, grid_size) + per_kernel_size_info[k] = compiler_flags.SizeTuple(tile_size, block_size, grid_size) + individual.kernel_num = k #for flag in compiler_flags.PPCG.optimisation_flags: # print(flag) @@ -106,6 +107,7 @@ def __init__(self): self.status = enums.Status.failed self.execution_time = float("inf") self.num = 0 + self.kernel_num=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL def all_flags(self): return self.ppcg_flags.keys() + self.cc_flags.keys() + self.cxx_flags.keys() + self.nvcc_flags.keys() @@ -214,7 +216,16 @@ def deleteFile(self, fileName): def binary(self, best_execution_time=float("inf")): #time_regex = re.compile(r'^(\d*\.\d+|\d+)$') #print config.Arguments.execution_time_regex - time_regex = re.compile(config.Arguments.execution_time_regex) + if config.Arguments.prl_profiling: + if self.kernel_num == compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL: + re_str = r'compute\s*:\s*(\d*.\d+)ms' + else: + re_str = r'kernel'+str(self.kernel_num)+'\s*:\s*(\d*.\d+)ms' + else: + re_str = config.Arguments.execution_time_regex + + print re_str + time_regex = re.compile(re_str) total_time = 0.0 status = enums.Status.passed num_actual_runs = 0 @@ -258,7 +269,6 @@ def binary(self, best_execution_time=float("inf")): #print "Execution time of cur test case is worst than the best so far, stopping at first run" break - self.status = status config.time_binary += total_time if num_actual_runs != 0: diff --git a/main.py b/main.py index a80f5ae..b0ca718 100755 --- a/main.py +++ b/main.py @@ -9,6 +9,7 @@ import sys def print_summary(search): + return try: if config.Arguments.results_file is not None: old_stdout = sys.stdout @@ -145,7 +146,7 @@ def string_csv(string): help="dont modify the cmd string, note the output file nmaes should be part of cmd lines", default=False) - runs = 5 + runs = 1 building_and_running_group.add_argument("--runs", type=int, metavar="", @@ -168,11 +169,16 @@ def string_csv(string): help="regular expression format for execution time", default=r'^(\d*\.\d+|\d+)$') + + building_and_running_group.add_argument("--prl-profiling", + action="store_true", + help="Using prl profiling, used to extract timing info from prl profiling output", + default=False) # PPCG options ppcg_group = parser.add_argument_group("PPCG arguments") ppcg_group.add_argument("--target", - choices=[enums.Targets.cuda, enums.Targets.opencl], + choices=[enums.Targets.cuda, enums.Targets.opencl, enums.Targets.prl], help="the target to generate code for", default=enums.Targets.opencl) @@ -192,6 +198,13 @@ def string_csv(string): metavar="", help="consider only these values when tuning the shared memory size (default: %s)" % (shared_memory_possibilties), default=shared_memory_possibilties) + + kernels_list = [compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL] + ppcg_group.add_argument("--kernels-to-tune", + type=int_csv, + metavar="", + help="consider only these kernels values when tuning (default: all)", + default=kernels_list) tile_size_range = (2**0, 2**6) ppcg_group.add_argument("--tile-size-range", @@ -389,7 +402,7 @@ def string_csv(string): parser_exhaustive.add_argument("--filter-testcases", action="store_true", help="few heursitics to reduce search space such as tile size multiple of block size, tile size > block size etc..", - default=False) + default=True) parser_exhaustive.add_argument("--parallelize-compilation", action="store_true", @@ -405,7 +418,7 @@ def string_csv(string): help="number of threads to use for ppcg compilation (default: %d)" % num_compile_threads) - max_work_group_size = 256 + max_work_group_size = 1024 parser_exhaustive.add_argument("--max-work-group-size", type=int, metavar="", From a5ed122132c0319a023dc5cf71042861a716611e Mon Sep 17 00:00:00 2001 From: chandanReddy Date: Thu, 7 Apr 2016 16:55:58 +0200 Subject: [PATCH 29/34] added concurrent kernel tuning option, enabled by default --- heuristic_search.py | 35 ++++++++++++++++++++++++++++++++++- individual.py | 30 ++++++++++++++++++++++++++++++ main.py | 5 +++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/heuristic_search.py b/heuristic_search.py index b206de5..1f11bff 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -509,7 +509,14 @@ def tile_size_multiple_filter(self, conf): def run(self): self.individuals = [] + self.multi_kernel = False self.output_stream = open(config.Arguments.results_file, 'w') + if config.Arguments.no_concurrent_kernel_tuning: + self.multi_kernel = True + self.tune_kernel(compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL) + self.print_summary() + return + for k in config.Arguments.kernels_to_tune: self.individuals = [] self.tune_kernel(k) @@ -543,6 +550,12 @@ def tune_kernel(self, k): start_iter = 0 best_time = float("inf") + best_kernel_time = [] + self.best_kernel_run = [] + if self.multi_kernel: + for s in config.Arguments.kernels_to_tune: + best_kernel_time.append(float("inf")) + self.best_kernel_run.append(0) #print 'Parameter values to be explored: ' + str(paramValues) #print 'Number of configurations: ' + str(self.countConfigs(paramValues)) for conf in combs: @@ -563,6 +576,15 @@ def tune_kernel(self, k): if cur.execution_time == 0: continue + if self.multi_kernel: + for k in config.Arguments.kernels_to_tune: + if cur.per_kernel_time[k] < best_kernel_time[k]: + best_kernel_time[k] = cur.per_kernel_time[k] + self.best_kernel_run[k] = cur + f.write("\n Best time so far for kernel "+str(k) + " ID " + str(cnt) + " kernel time = " + str(best_kernel_time[k])) + f.write(str(cur.ppcg_cmd_line_flags)) + f.flush() + if cur.execution_time < best_time and cur.status == enums.Status.passed: self.individuals.append(cur) best_time = cur.execution_time @@ -571,6 +593,14 @@ def tune_kernel(self, k): f.write(str(best_run)) f.flush() + + def summarise_per_kernel(self): + for k in config.Arguments.kernels_to_tune: + print "Best config for kernel " + str(k) + print("had execution time %f ms" % (self.best_kernel_run[k].execution_time)) + print("To replicate, use the following configuration:") + print(self.best_kernel_run[k].ppcg_cmd_line_flags, False) + def summarise(self): print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30)) try: @@ -594,7 +624,10 @@ def print_summary(self): try: if config.Arguments.results_file is not None: sys.stdout = self.output_stream - self.summarise() + if self.multi_kernel: + self.summarise_per_kernel() + else: + self.summarise() #self.logall() finally: if config.Arguments.results_file is not None: diff --git a/individual.py b/individual.py index 15ca0f4..8818d17 100644 --- a/individual.py +++ b/individual.py @@ -108,6 +108,9 @@ def __init__(self): self.execution_time = float("inf") self.num = 0 self.kernel_num=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL + self.per_kernel_time = [] + for k in config.Arguments.kernels_to_tune: + self.per_kernel_time.append(float("inf")) def all_flags(self): return self.ppcg_flags.keys() + self.cc_flags.keys() + self.cxx_flags.keys() + self.nvcc_flags.keys() @@ -213,6 +216,32 @@ def deleteFile(self, fileName): except: pass + def extract_kernel_time(self, kernel_num, stdout): + re_str = r'kernel'+str(kernel_num)+'\s*:\s*(\d*.\d+)ms' + print re_str + time_regex = re.compile(re_str) + total_time = 0.0 + + nmatchedlines = 0 + for line in stdout.split(os.linesep): + line = line.strip() + matches = time_regex.findall(line) + if matches: + nmatchedlines += 1 + try: + total_time += float(matches[0]) + except: + raise internal_exceptions.BinaryRunException("Execution time '%s' is not in the required format" % matches[0]) + if nmatchedlines == 0: + total_time = float("inf") + return total_time + + def update_kernel_times(self, stdout): + if not config.Arguments.prl_profiling: + return + for k in config.Arguments.kernels_to_tune: + self.per_kernel_time[k] = self.extract_kernel_time(k, stdout) + def binary(self, best_execution_time=float("inf")): #time_regex = re.compile(r'^(\d*\.\d+|\d+)$') #print config.Arguments.execution_time_regex @@ -247,6 +276,7 @@ def binary(self, best_execution_time=float("inf")): if config.Arguments.execution_time_from_binary: if not stdout: raise internal_exceptions.BinaryRunException("Expected the binary to dump its execution time. Found nothing") + self.update_kernel_times(stdout) nmatchedlines = 0 for line in stdout.split(os.linesep): line = line.strip() diff --git a/main.py b/main.py index b0ca718..fb197b6 100755 --- a/main.py +++ b/main.py @@ -174,6 +174,11 @@ def string_csv(string): action="store_true", help="Using prl profiling, used to extract timing info from prl profiling output", default=False) + + building_and_running_group.add_argument("--no-concurrent-kernel-tuning", + action="store_false", + help="Do not tune multiple kernels at the same time", + default=True) # PPCG options ppcg_group = parser.add_argument_group("PPCG arguments") From 2807d3f843f47ebb56d786e4d48071f389fca245 Mon Sep 17 00:00:00 2001 From: chandanReddy Date: Mon, 11 Apr 2016 15:53:30 +0200 Subject: [PATCH 30/34] changed the order of grid and block sizes in explore params --- explore-params.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/explore-params.py b/explore-params.py index b8c5f93..dd861ad 100644 --- a/explore-params.py +++ b/explore-params.py @@ -5,13 +5,13 @@ # Tile sizes [(16,16), (32,32), (64,64)], - # Grid sizes - [(16,16), (32,32), (256,256), (1024,1024)], - # Block sizes [(1,1), (1,2), (1,4), (1,8), (16,16), (32,32), (64,64)], + # Grid sizes + [(16,16), (32,32), (256,256), (1024,1024)], + #Shared memory [False], From 2c0f3a9643c93d71857b7c6622f561566146418e Mon Sep 17 00:00:00 2001 From: chandanReddy Date: Mon, 11 Apr 2016 15:59:19 +0200 Subject: [PATCH 31/34] added concurrent kernel tuning support --- heuristic_search.py | 17 ++++++++++++----- individual.py | 1 - 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/heuristic_search.py b/heuristic_search.py index 1f11bff..3e9821b 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -452,6 +452,7 @@ def get_last_iter(self): except: start_iter = 0 pass + f_iter.close() print("starting from test case = ", start_iter) else: start_iter = 0 @@ -523,7 +524,7 @@ def run(self): self.print_summary() self.output_stream.close() - def tune_kernel(self, k): + def tune_kernel(self, ker_num): if config.Arguments.params_from_file: paramValues = self.readParamValues() @@ -546,8 +547,10 @@ def tune_kernel(self, k): self.pipelineExec(combs) return + start_iter = self.get_last_iter() + f = open(config.Arguments.results_file + ".log", 'a') - start_iter = 0 + f_iter = open('.lastiter', 'w') best_time = float("inf") best_kernel_time = [] @@ -563,7 +566,7 @@ def tune_kernel(self, k): cnt += 1 continue print '---- Configuration ' + str(cnt) + ': ' + str(conf) - cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5], k) + cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5], ker_num) cur.set_ID(cnt) cnt += 1 cur.run(best_time) @@ -577,12 +580,13 @@ def tune_kernel(self, k): continue if self.multi_kernel: + #f.write("\n====================================\n") for k in config.Arguments.kernels_to_tune: if cur.per_kernel_time[k] < best_kernel_time[k]: best_kernel_time[k] = cur.per_kernel_time[k] self.best_kernel_run[k] = cur f.write("\n Best time so far for kernel "+str(k) + " ID " + str(cnt) + " kernel time = " + str(best_kernel_time[k])) - f.write(str(cur.ppcg_cmd_line_flags)) + f.write(str(cur.ppcg_cmd_line_flags) + str("\n")) f.flush() if cur.execution_time < best_time and cur.status == enums.Status.passed: @@ -593,11 +597,14 @@ def tune_kernel(self, k): f.write(str(best_run)) f.flush() + f_iter.seek(0) + f_iter.write(str(cur.get_ID())) + def summarise_per_kernel(self): for k in config.Arguments.kernels_to_tune: print "Best config for kernel " + str(k) - print("had execution time %f ms" % (self.best_kernel_run[k].execution_time)) + print("had execution time %f ms" % (self.best_kernel_time[k])) print("To replicate, use the following configuration:") print(self.best_kernel_run[k].ppcg_cmd_line_flags, False) diff --git a/individual.py b/individual.py index 8818d17..2456d09 100644 --- a/individual.py +++ b/individual.py @@ -218,7 +218,6 @@ def deleteFile(self, fileName): def extract_kernel_time(self, kernel_num, stdout): re_str = r'kernel'+str(kernel_num)+'\s*:\s*(\d*.\d+)ms' - print re_str time_regex = re.compile(re_str) total_time = 0.0 From 53491101a168f2762ca55af9644cbf25803469a6 Mon Sep 17 00:00:00 2001 From: Chandan Reddy Date: Thu, 22 Sep 2016 18:15:57 +0200 Subject: [PATCH 32/34] Fixed the run thread dual compilation bug --- heuristic_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/heuristic_search.py b/heuristic_search.py index 194111a..a99ddc6 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -352,7 +352,7 @@ def run(self): break continue #print('***run thread got job') - testcase.run(best_time) + testcase.binary(best_time) f_iter.seek(0) f_iter.write(str(testcase.get_ID())) From 9451ffbdb0c8ab2cadfbc87dc070389ce4e98a02 Mon Sep 17 00:00:00 2001 From: Chandan Reddy Date: Thu, 22 Sep 2016 18:17:58 +0200 Subject: [PATCH 33/34] Removed the call to unused parse sizes --- individual.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/individual.py b/individual.py index 38c095c..c95de12 100644 --- a/individual.py +++ b/individual.py @@ -170,8 +170,6 @@ def ppcg(self): config.time_PPCG += end - start if self.ppcg_proc.returncode: raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.ppcg_cmd) - # Store the sizes used by PPCG - self.size_data = compiler_flags.SizesFlag.parse_PPCG_dump_sizes(stderr) def ppcg_with_timeout(self, timeout=float("inf")): From ab3f6af5700a15ed945869e10f973ed0298de55a Mon Sep 17 00:00:00 2001 From: Chandan Reddy Date: Thu, 22 Sep 2016 18:24:41 +0200 Subject: [PATCH 34/34] Removed fusion option no longer supported by ppc --- explore-params.py | 2 -- heuristic_search.py | 9 ++------- individual.py | 6 +++--- main.py | 5 ----- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/explore-params.py b/explore-params.py index b8c5f93..daaca14 100644 --- a/explore-params.py +++ b/explore-params.py @@ -18,6 +18,4 @@ #private memory [False, True], - #Fusion - ['max', 'min'] ]) diff --git a/heuristic_search.py b/heuristic_search.py index a99ddc6..7cce121 100644 --- a/heuristic_search.py +++ b/heuristic_search.py @@ -434,12 +434,7 @@ def createExhaConfigs(self): else: private_mem = [False] - if config.Arguments.all_fusion_structures: - fusion = ['max', 'min'] - else: - fusion = ['max'] - - paramValues = [tile_sizes, block_sizes, grid_sizes, shared_mem, private_mem, fusion] + paramValues = [tile_sizes, block_sizes, grid_sizes, shared_mem, private_mem] return paramValues def get_last_iter(self): @@ -541,7 +536,7 @@ def run(self): cnt += 1 continue print '---- Configuration ' + str(cnt) + ': ' + str(conf) - cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5]) + cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4]]) cur.set_ID(cnt) cnt += 1 cur.run(best_time) diff --git a/individual.py b/individual.py index c95de12..0f36240 100644 --- a/individual.py +++ b/individual.py @@ -30,7 +30,7 @@ def get_fittest(population): raise internal_exceptions.NoFittestException("None of the individuals among this population completed successfully, hence there is no fittest individual") return fittest -def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, fusion='max'): +def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True): individual = Individual() per_kernel_size_info = collections.OrderedDict() per_kernel_size_info[compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL] = compiler_flags.SizeTuple(tile_size, block_size, grid_size) @@ -54,8 +54,8 @@ def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_ individual.ppcg_flags[flag] = True #Set isl fusion flag - flag = compiler_flags.PPCG.optimisation_flags[6] - individual.ppcg_flags[flag] = fusion + #flag = compiler_flags.PPCG.optimisation_flags[6] + #individual.ppcg_flags[flag] = fusion #string = individual.ppcg_flags[flag].get_command_line_string(1024) #print(string) #print("end") diff --git a/main.py b/main.py index a80f5ae..cc66f2d 100755 --- a/main.py +++ b/main.py @@ -380,11 +380,6 @@ def string_csv(string): help="Search for parameter values that are powers of two", default=False) - parser_exhaustive.add_argument("--all-fusion-structures", - action="store_true", - help="explore all fusion structures [max, min] ", - default=False) - parser_exhaustive.add_argument("--filter-testcases", action="store_true",