From 4ac792875e369be5453184b92bf103c373569f11 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Wed, 11 Mar 2015 19:16:29 +0100
Subject: [PATCH 01/34] adding exhaustive search option

---
 compiler_flags.py   |  14 ++-
 enums.py            |   5 +-
 heuristic_search.py | 221 ++++++++++++++++++++++++++++++++++++++++++--
 individual.py       | 166 ++++++++++++++++++++++++++++-----
 main.py             |  67 +++++++++++++-
 5 files changed, 433 insertions(+), 40 deletions(-)

diff --git a/compiler_flags.py b/compiler_flags.py
index 26c88ca..309e519 100644
--- a/compiler_flags.py
+++ b/compiler_flags.py
@@ -47,7 +47,7 @@ def get_command_line_string(self, value):
             else:
                 return ""
         else:
-            return "%s %s" % (self.name, value.__str__( ))
+            return "%s=%s" % (self.name, value.__str__( ))
     
 class Size:
     """Models a tile, block or grid size"""
@@ -202,7 +202,15 @@ def random_value(self):
                                                                          self.block_size.random_value(),
                                                                          self.grid_size.random_value())
         return per_kernel_size_info
+     
+    def init_value(self, tile_size, block_size, grid_size):
+        """Return a size mapping holding one tile/block/grid triple under the all-kernels key."""
+        mapping = collections.OrderedDict()
+        sizes = SizeTuple(TileSize(tile_size), BlockSize(block_size), GridSize(grid_size))
+        mapping[SizesFlag.ALL_KERNELS_SENTINEL] = sizes
+        return mapping
     
+
     def permute(self, value):
         per_kernel_size_info = collections.OrderedDict()
         for kernel_number, size_tuple in value.iteritems():
@@ -301,8 +309,8 @@ class PPCG:
     flag_map[no_isl_schedule_separate_components]  = EnumerationFlag(no_isl_schedule_separate_components)
     flag_map[no_wrap]                              = EnumerationFlag(no_wrap)
     flag_map[no_scale_tile_loops]                  = EnumerationFlag(no_scale_tile_loops)
-    flag_map[no_shared_memory]                     = EnumerationFlag(no_shared_memory)
-    flag_map[no_private_memory]                    = EnumerationFlag(no_private_memory)
+    flag_map[no_shared_memory]                     = EnumerationFlag(no_shared_memory, [True, False])
+    flag_map[no_private_memory]                    = EnumerationFlag(no_private_memory, [True, False])
     flag_map[no_live_range_reordering]             = EnumerationFlag(no_live_range_reordering)
     
     optimisation_flags = []
diff --git a/enums.py b/enums.py
index c67f0e0..0af2b4c 100644
--- a/enums.py
+++ b/enums.py
@@ -18,8 +18,11 @@ class SearchStrategy:
     ga                  = "ga"
     random              = "random"
     simulated_annealing = "simulated-annealing"
+    exhaustive          = "exhaustive"
 
 class Status:
     passed = "passed"
     failed = "failed"
-    
\ No newline at end of file
+    timeout = "timedout"
+    ppcgtimeout = "ppcg_timeout"
+    
diff --git a/heuristic_search.py b/heuristic_search.py
index caece42..fa0e4ba 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -9,6 +9,10 @@
 import individual
 import collections
 import internal_exceptions
+import itertools
+import os
+from Queue import Queue
+from threading import Thread
 
 class SearchStrategy:
     """Abstract class for a search strategy"""
@@ -20,9 +24,16 @@ def run(self):
     @abc.abstractmethod
     def summarise(self):
         pass
+
+    @abc.abstractmethod
+    def logall(self):
+        pass
     
 class GA(SearchStrategy):
     """Search using a genetic algorithm"""
+
+    def logall(self):
+        return
     
     def set_child_flags(self, child, the_flags, the_flag_values):
         for idx, flag in enumerate(the_flags):
@@ -291,22 +302,212 @@ def summarise(self):
         except internal_exceptions.NoFittestException:
             pass
 
+    def logall(self):
+        for i in self.individuals:
+            debug.summary_message(i.ppcg_cmd_line_flags, False)
+        return
+
+compile_queue = Queue(10)
+run_queue = Queue(10)
+
+class CompileThread(Thread):
+    """Worker thread: compile queued test cases and pass them on to run_queue."""
+
+    def run(self):
+        while True:
+            job = compile_queue.get()
+            stop = job.get_ID() == -1  # ID -1 marks the end of the work list
+            if not stop:
+                job.ppcg()
+                job.build()
+            run_queue.put(job)
+            if stop:
+                break
+
+
+class RunThread(Thread):
+    """Run the test cases from run_queue and log each new best time."""
+
+    def run(self):
+        best_time = float("inf")
+        log_name = config.Arguments.results_file + ".log"
+        with open(log_name, 'a') as f, open('.lastiter', 'w') as f_iter:
+            while True:
+                testcase = run_queue.get()
+                if testcase.get_ID() == -1:
+                    break
+                testcase.run_with_timeout()
+                f_iter.seek(0)
+                f_iter.write(str(testcase.get_ID()))
+                f_iter.flush()  # keep the ID of the last run on disk
+                if testcase.execution_time < best_time and testcase.execution_time != 0 and testcase.status == enums.Status.passed:
+                    best_time = testcase.execution_time
+                    f.write("\n Best iter so far = \n" + str(testcase))
+                    f.flush()
+        try:
+            os.remove('.lastiter')  # only reached after the stop marker
+        except OSError:
+            pass
+
+class Exhaustive(SearchStrategy):
+    """Search every combination of the candidate parameter values.
+
+    The candidates come from the command-line ranges or, when
+    --params-from-file is given, from the file explore-params.py.  In both
+    cases they form six lists, ordered as: tile sizes, block sizes, grid
+    sizes, private memory, shared memory, fusion.
+    """
+
+    def readParamValues(self):
+        """Return the parameter value lists written in explore-params.py."""
+        # NOTE(review): eval() executes whatever the file contains, so only
+        # parameter files from a trusted source must be used.
+        with open('explore-params.py', 'r') as f:
+            return eval(f.read())
+
+    def countConfigs(self, paramValues):
+        """Return the number of combinations (product of the list lengths)."""
+        n = 1
+        for values in paramValues:
+            n *= len(values)
+        return n
+
+    def _size_range(self, bounds):
+        """Return the sizes for a (lower, upper) pair; upper is excluded."""
+        candidates = range(bounds[0], bounds[1])
+        if config.Arguments.only_powers_of_two:
+            # The bounds are exponents in this mode.
+            return [2**i for i in candidates]
+        return candidates
+
+    def createExhaConfigs(self):
+        """Build the six parameter value lists from the command line."""
+        args = config.Arguments
+        tile_sizes = itertools.product(self._size_range(args.tile_size_range),
+                                       repeat=args.tile_dimensions)
+        # Block sizes come from the block range; the tile range was used
+        # here before, which ignored block_size_range.
+        block_sizes = itertools.product(self._size_range(args.block_size_range),
+                                        repeat=args.block_dimensions)
+        grid_sizes = itertools.product(self._size_range(args.grid_size_range),
+                                       repeat=args.grid_dimensions)
+        shared_mem = [True, False] if args.no_shared_memory else [False]
+        private_mem = [True, False] if args.no_private_memory else [False]
+        fusion = ['max', 'min'] if args.all_fusion_structures else ['max']
+        return [tile_sizes, block_sizes, grid_sizes, private_mem, shared_mem, fusion]
+
+    def _make_test_case(self, conf, ID):
+        """Create the test case for one combination and give it the ID."""
+        # create_test_case takes shared_mem before private_mem, the lists
+        # hold private before shared; keywords keep the two from swapping.
+        cur = individual.create_test_case(conf[0], conf[1], conf[2],
+                                          shared_mem=conf[4], private_mem=conf[3],
+                                          fusion=conf[5])
+        cur.set_ID(ID)
+        return cur
+
+    def pipelineExec(self, combs):
+        """Compile the combinations in worker threads, run them in another.
+
+        Returns once the run thread has stopped.
+        """
+        num_threads = config.Arguments.num_compile_threads
+        for _ in range(num_threads):
+            CompileThread().start()
+        runner = RunThread()
+        runner.start()
+
+        for cnt, conf in enumerate(combs):
+            print('---- Configuration %d: %s' % (cnt, conf))
+            compile_queue.put(self._make_test_case(conf, cnt))
+
+        # The stop marker is a separate object with ID -1.  Re-labelling the
+        # last test case instead would change a configuration that is still
+        # queued, and fail when there is no configuration at all.
+        sentinel = individual.Individual()
+        sentinel.set_ID(-1)
+        for _ in range(num_threads):
+            compile_queue.put(sentinel)
+        runner.join()
+
+    def run(self):
+        """Evaluate all combinations.
+
+        Without --parallelize-compilation each new best test case is added
+        to self.individuals and written to the "<results_file>.log" file.
+        """
+        self.individuals = []
+        if config.Arguments.params_from_file:
+            paramValues = self.readParamValues()
+        else:
+            paramValues = self.createExhaConfigs()
+
+        combs = itertools.product(*paramValues)
+        if config.Arguments.parallelize_compilation:
+            self.pipelineExec(combs)
+            return
+
+        # Start from infinity so that the first passing run is the best one;
+        # starting from 0 never accepted any run.
+        best_time = float("inf")
+        with open(config.Arguments.results_file + ".log", 'a') as f:
+            for cnt, conf in enumerate(combs):
+                print('---- Configuration %d: %s' % (cnt, conf))
+                cur = self._make_test_case(conf, cnt)
+                cur.run(config.Arguments.timeout_ppcg)
+                if cur.status == enums.Status.ppcgtimeout:
+                    f.write("\nppcg timeout")
+                    f.write(str(cur))
+                    f.flush()
+                    continue
+
+                if cur.execution_time == 0:
+                    continue
+
+                if cur.execution_time < best_time and cur.status == enums.Status.passed:
+                    self.individuals.append(cur)
+                    best_time = cur.execution_time
+                    f.write("\n Best iter so far = " + str(cnt) + "\n")
+                    f.write(str(cur))
+                    f.flush()
+
+    def summarise(self):
+        """Print the fittest test case of self.individuals, if there is one."""
+        print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30))
+        try:
+            fittest = individual.get_fittest(self.individuals)
+        except internal_exceptions.NoFittestException:
+            return
+        debug.summary_message("The fittest individual had execution time %f seconds" % (fittest.execution_time))
+        debug.summary_message("To replicate, pass the following to PPCG:")
+        debug.summary_message(fittest.ppcg_cmd_line_flags, False)
+
+    def logall(self):
+        """Print every test case kept in self.individuals."""
+        print("%s Log of all runs %s" %('*' * 30, '*' * 30))
+        for i in self.individuals:
+            print(i)
+            debug.summary_message(i.ppcg_cmd_line_flags, False)
+
 class SimulatedAnnealing(SearchStrategy):
-    """Search using simulated annealing"""
-    
-    def acceptance_probability(self, currentEnergy, newEnergy, temperature):
+   """Search using simulated annealing""" 
+
+   def acceptance_probability(self, currentEnergy, newEnergy, temperature):
         if newEnergy < currentEnergy:
             return 1.0
-        return math.exp((currentEnergy - newEnergy) / temperature)
-    
-    def mutate_backend_flags(self, clone_flags, solution_flags):
+        return math.exp((currentEnergy - newEnergy) / temperature) 
+
+   def logall(self):
+        return
+
+   def mutate_backend_flags(self, clone_flags, solution_flags):
         for the_flag in solution_flags.keys():   
             if bool(random.getrandbits(1)):
                 idx    = the_flag.possible_values.index(solution_flags[the_flag])
                 newIdx = (idx + 1) % len(the_flag.possible_values)
                 clone_flags[the_flag] = the_flag.possible_values[newIdx]
     
-    def mutate(self, solution):
+   def mutate(self, solution):
         clone    = copy.deepcopy(solution)
         clone.ID = individual.Individual.get_ID()
         for the_flag in solution.ppcg_flags.keys():   
@@ -324,7 +525,7 @@ def mutate(self, solution):
         self.mutate_backend_flags(clone.nvcc_flags, solution.nvcc_flags)
         return clone
     
-    def run(self):        
+   def run(self):        
         debug.verbose_message("Creating initial solution", __name__)
         current = individual.create_random()
         current.run()   
@@ -344,8 +545,8 @@ def run(self):
                     if current.execution_time < self.fittest.execution_time:
                         self.fittest = current
     
-    def summarise(self):
+   def summarise(self):
         debug.summary_message("The final individual had execution time %f seconds" % (self.fittest.execution_time)) 
         debug.summary_message("To replicate, pass the following to PPCG:")
         debug.summary_message(self.fittest.ppcg_cmd_line_flags, False)
-        
\ No newline at end of file
+        
diff --git a/individual.py b/individual.py
index b5d23e3..6ec05e3 100644
--- a/individual.py
+++ b/individual.py
@@ -7,7 +7,9 @@
 import enums
 import collections
 import subprocess
+import threading
 import internal_exceptions
+import time
 
 def get_fittest(population):
     fittest = None
@@ -22,9 +24,42 @@ def get_fittest(population):
         raise internal_exceptions.NoFittestException("None of the individuals among this population completed successfully, hence there is no fittest individual")
     return fittest
 
+def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, fusion='max'):
+    """Create an Individual whose PPCG flags describe one fixed configuration.
+
+    The tile, block and grid size are stored as one SizeTuple under the
+    all-kernels key.  The flags are picked by their position in
+    compiler_flags.PPCG.optimisation_flags: index 4 receives the sizes,
+    index 0 is set to True when shared_mem is false, index 7 is set to True
+    when private_mem is false, and index 6 receives fusion.
+
+    NOTE(review): presumably these positions are the sizes, no-shared-memory,
+    no-private-memory and fusion flags -- confirm against the code that
+    fills optimisation_flags.
+    """
+    # TODO: Get a better way of getting size_data_flag
+    flags = compiler_flags.PPCG.optimisation_flags
+    test_case = Individual()
+
+    sizes = collections.OrderedDict()
+    sizes[compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL] = compiler_flags.SizeTuple(tile_size, block_size, grid_size)
+    test_case.ppcg_flags[flags[4]] = sizes
+
+    if not shared_mem:
+        test_case.ppcg_flags[flags[0]] = True
+
+    if not private_mem:
+        test_case.ppcg_flags[flags[7]] = True
+
+    # Set isl fusion flag
+    test_case.ppcg_flags[flags[6]] = fusion
+    return test_case
+
+
 def create_random():
     individual = Individual()   
     for flag in compiler_flags.PPCG.optimisation_flags:
+        print(flag)
         individual.ppcg_flags[flag] = flag.random_value()
     for flag in compiler_flags.CC.optimisation_flags:
         individual.cc_flags[flag] = flag.random_value()
@@ -39,17 +74,29 @@ class Individual:
     
     ID = 0
     @staticmethod
-    def get_ID():
+    def get_ID_init():
         Individual.ID += 1
         return Individual.ID
     
+    def file_name(self):
+        return 'testcase'+str(self.ID)
+        #return 'gemm'
+
+    def set_ID(self, num):
+        self.ID = num  
+
+    def get_ID(self):
+        return self.ID 
+    
     def __init__(self):
-        self.ID               = Individual.get_ID()
+        self.ID               = Individual.get_ID_init()
         self.ppcg_flags       = collections.OrderedDict()
         self.cc_flags         = collections.OrderedDict()
         self.cxx_flags        = collections.OrderedDict()
         self.nvcc_flags       = collections.OrderedDict()
         self.status           = enums.Status.failed
+        self.execution_time   = float("inf") 
+        self.num = 0
         
     def all_flags(self):
         return self.ppcg_flags.keys() + self.cc_flags.keys() + self.cxx_flags.keys() + self.nvcc_flags.keys()
@@ -57,12 +104,15 @@ def all_flags(self):
     def all_flag_values(self):
         return self.ppcg_flags.values() + self.cc_flags.values() + self.cxx_flags.values() + self.nvcc_flags.values()
             
-    def run(self):
+    def run(self, timeout):
         try:
-            self.compile()
+            self.compile(timeout)
             if self.status == enums.Status.passed:
                 # Fitness is inversely proportional to execution time
-                self.fitness = 1/self.execution_time 
+                if self.execution_time == 0:
+                    self.fitness = float("inf")
+                else:
+                    self.fitness = 1/self.execution_time 
                 debug.verbose_message("Individual %d: execution time = %f, fitness = %f" \
                                       % (self.ID, self.execution_time, self.fitness), __name__) 
             else:
@@ -70,49 +120,91 @@ def run(self):
         except internal_exceptions.FailedCompilationException as e:
             debug.exit_message(e)
             
-    def compile(self):
-        self.ppcg()
+            
+    def checkforpause(self):
+        """Block for as long as a file named '.pause' exists.
+
+        The file is polled every 20 seconds; a message is printed per poll.
+        """
+        while os.path.isfile('.pause'):
+            print("Auto tuning paused, remove .pause to restart")
+            time.sleep(20)
+
+    def compile(self, timeout=2):
+        self.checkforpause()
+        sucess=self.ppcg_with_timeout()
+        if not sucess:
+            return
         self.build()
-        self.binary()
+        self.run_with_timeout(timeout)
 
     def ppcg(self):
         self.ppcg_cmd_line_flags = "--target=%s --dump-sizes %s" % (config.Arguments.target, 
                                                                     ' '.join(flag.get_command_line_string(self.ppcg_flags[flag]) for flag in self.ppcg_flags.keys()))
         
         os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags
-        debug.verbose_message("Running '%s'" % config.Arguments.ppcg_cmd, __name__)
+        cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()
+        debug.verbose_message("Running '%s'" % cmd, __name__)
+        #debug.verbose_message("Running '%s'" % self.ppcg_cmd_line_flags , __name__)
         start  = timeit.default_timer()
-        proc   = subprocess.Popen(config.Arguments.ppcg_cmd, shell=True, stderr=subprocess.PIPE)  
-        stderr = proc.communicate()[1]
+        self.ppcg_proc   = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)  
+        stderr = self.ppcg_proc.communicate()[1]
         end    = timeit.default_timer()
         config.time_PPCG += end - start
-        if proc.returncode:
+        if self.ppcg_proc.returncode:
             raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.ppcg_cmd)         
         # Store the sizes used by PPCG
         self.size_data = compiler_flags.SizesFlag.parse_PPCG_dump_sizes(stderr)
         
+
+    def ppcg_with_timeout(self, timeout=200):
+        thread = threading.Thread(target=self.ppcg)
+        thread.start()
+        thread.join(timeout)
+        if thread.is_alive():
+            print("Timeout: terminating the ppcg ")
+            self.ppcg_proc.terminate()
+            thread.join(timeout)
+            self.status = enums.Status.ppcgtimeout
+            return False
+        return True
+
     def build(self):
-        debug.verbose_message("Running '%s'" % config.Arguments.build_cmd, __name__)
+        build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe'
+        debug.verbose_message("Running '%s'" % build_cmd, __name__)
         start  = timeit.default_timer()
-        proc   = subprocess.Popen(config.Arguments.build_cmd, shell=True)  
+        proc   = subprocess.Popen(build_cmd, shell=True)  
         stderr = proc.communicate()[1]     
         end    = timeit.default_timer()
         config.time_backend += end - start
         if proc.returncode:
             raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.build_cmd)
+
+
     
+    def deleteFile(self, fileName):
+        """Remove fileName if possible; a missing or undeletable file is ignored."""
+        try:
+            os.remove(fileName)
+        except OSError:
+            pass
+
     def binary(self):
-        time_regex = re.compile(r'^(\d*\.\d+|\d+)$')
+        #time_regex = re.compile(r'^(\d*\.\d+|\d+)$')
+        print config.Arguments.execution_time_regex
+        time_regex = re.compile(config.Arguments.execution_time_regex)
         total_time = 0.0
         status     = enums.Status.passed
         for run in xrange(1,config.Arguments.runs+1):
-            debug.verbose_message("Run #%d of '%s'" % (run, config.Arguments.run_cmd), __name__)
+            run_cmd = './'+self.file_name()+'.exe'
+            #run_cmd = config.Arguments.run_cmd
+            debug.verbose_message("Run #%d of '%s'" % (run, run_cmd), __name__)
             start = timeit.default_timer()
-            proc  = subprocess.Popen(config.Arguments.run_cmd, shell=True, stdout=subprocess.PIPE)    
-            stdout, stderr = proc.communicate()
+            self.proc  = subprocess.Popen(run_cmd, shell=True, stdout=subprocess.PIPE)    
+            stdout, stderr = self.proc.communicate()
             end   = timeit.default_timer()
-            if proc.returncode:
-                status = enums.Status.failed
+            if self.proc.returncode:
+                sper_kernel_size_infotatus = enums.Status.failed
                 debug.warning_message("FAILED: '%s'" % config.Arguments.run_cmd)
                 continue
             if config.Arguments.execution_time_from_binary:
@@ -131,7 +223,37 @@ def binary(self):
         self.status = status
         config.time_binary += total_time
         self.execution_time = total_time/config.Arguments.runs
+
+        self.deleteFile(self.file_name()+'.exe')
+        self.deleteFile(self.file_name()+'_host.c')
+        self.deleteFile(self.file_name()+'_host_kernel.cl')
+        self.deleteFile(self.file_name()+'_host.cu')
+        self.deleteFile(self.file_name()+'_kernel.cu')
+        self.deleteFile(self.file_name()+'_kernel.hu')
+        self.deleteFile(self.file_name()+'_host_kernel.hu')
+        self.deleteFile(self.file_name()+'_host_kernel.h')
+        self.deleteFile(self.file_name())
+
+ 
+    def run_with_timeout(self, timeout=2):
+        """Run self.binary in a thread; stop the process when it exceeds the timeout."""
+        print("executing task " + str(self.ID))
+        # The timeout_ppcg option wins; the argument is the fallback when unset.
+        timeout = getattr(config.Arguments, 'timeout_ppcg', timeout)
+        try:
+            thread = threading.Thread(target=self.binary)
+            thread.start()
+            thread.join(timeout)
+            if thread.is_alive():
+                print("Timeout: terminating the procs")
+                self.proc.terminate()
+                thread.join()
+                self.status = enums.Status.timeout
+        except Exception:
+            print("Exception running"+str(self.ID))
+            self.status = enums.Status.timeout
         
+               
     def __str__(self):
-        return "ID %d: fitness %f" % (self.ID, self.fitness)
-    
\ No newline at end of file
+        return "ID %4d: execution time = %3f, ppcg = %s, status = %s" % (self.ID, self.execution_time, self.ppcg_cmd_line_flags, self.status)
+    
diff --git a/main.py b/main.py
index dddba88..8c1743d 100755
--- a/main.py
+++ b/main.py
@@ -16,6 +16,7 @@ def print_summary(search):
             sys.stdout    = output_stream
         config.summarise_timing()
         search.summarise()
+        search.logall()
     finally:
         if config.Arguments.results_file is not None:
             output_stream.close()
@@ -26,6 +27,8 @@ def autotune():
         search = heuristic_search.GA()
     elif config.Arguments.autotune_subcommand == enums.SearchStrategy.random:
         search = heuristic_search.Random()
+    elif config.Arguments.autotune_subcommand == enums.SearchStrategy.exhaustive:
+        search = heuristic_search.Exhaustive()
     elif config.Arguments.autotune_subcommand == enums.SearchStrategy.simulated_annealing:
         search = heuristic_search.SimulatedAnnealing()
     else:
@@ -129,7 +132,7 @@ def string_csv(string):
                                             help="how to run the generated binary from the auto-tuner",
                                             required=True)
     
-    runs = 5
+    runs = 1
     building_and_running_group.add_argument("--runs",
                                             type=int,
                                             metavar="<int>",
@@ -141,6 +144,12 @@ def string_csv(string):
                                             help="assume that the binary prints its execution time to standard output (rather than measuring the execution time through Python)",
                                             default=False)
     
+    
+    building_and_running_group.add_argument("--execution-time-regex",
+                            type=str,
+                            help="regular expression format for execution time",
+                            default=r'^(\d*\.\d+|\d+)$')
+    
     # PPCG options
     ppcg_group = parser.add_argument_group("PPCG arguments")
     
@@ -173,7 +182,7 @@ def string_csv(string):
                             help="consider only values in this range when tuning the tile size (default: %d-%d)" % (tile_size_range[0], tile_size_range[1]),
                             default=tile_size_range)
     
-    tile_dimensions = 3
+    tile_dimensions = 1
     ppcg_group.add_argument("--tile-dimensions",
                             type=int,
                             metavar="<int>",
@@ -234,6 +243,16 @@ def string_csv(string):
                             help="do not tune kernel sizes individually, i.e. use a uniform tile size for all kernels and let PPCG decide on suitable block and grid sizes",
                             default=False)
     
+    ppcg_group.add_argument("--no-shared-memory",
+                            action="store_false",
+                            help="do not consider shared memory while autotuning",
+                            default=True)
+    
+    ppcg_group.add_argument("--no-private-memory",
+                            action="store_false",
+                            help="do not consider private memory  while autotuning",
+                            default=True)
+
     ppcg_group.add_argument("--all-isl-options",
                             action=ISLAction,
                             metavar="",
@@ -330,10 +349,50 @@ def string_csv(string):
                                default=randoms,
                                help="the number of random tests to generate (default: %d)" % randoms)
     
-    parser.parse_args(namespace=config.Arguments)
+    
+    parser_exhaustive = search_subparsers.add_parser(enums.SearchStrategy.exhaustive)
+
+    parser_exhaustive.add_argument("--params-from-file",
+                         action="store_true",
+                         help="read the paramters from the explore-params py",
+                         default=False)
+
+    parser_exhaustive.add_argument("--only-powers-of-two",
+                         action="store_true",
+                         help="Search for parameter values that are powers of two",
+                         default=False)
+
+    parser_exhaustive.add_argument("--all-fusion-structures",
+                         action="store_true",
+                         help="explore all fusion structures [max, min] ",
+                         default=False)
 
+    parser_exhaustive.add_argument("--parallelize-compilation",
+                         action="store_true",
+                         help="parallelize ppcg compilation and execution of test case",
+                         default=False)
+    
+    
+    num_compile_threads = 1
+    parser_exhaustive.add_argument("--num-compile-threads",
+                               type=int,
+                               metavar="<int>",
+                               default=num_compile_threads,
+                               help="number of threads to use for ppcg compilation (default: %d)" % num_compile_threads)
+    
+
+    
+    timeout = 500
+    parser_exhaustive.add_argument("--timeout-ppcg",
+                               type=int,
+                               metavar="<int>",
+                               default=timeout,
+                               help="timeout for ppcg compilation and testcase execution (default: %d sec)" % num_compile_threads)
+    
+    parser.parse_args(namespace=config.Arguments)
+  
 if __name__ == "__main__":
     the_command_line()
     setup_PPCG_flags()
     autotune()    
-        
\ No newline at end of file
+        

From d9e943f8867140cb1363cd73d7a732604fa043c1 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 12 Mar 2015 14:35:34 +0100
Subject: [PATCH 02/34] adding template file explore-params.py to specify the
 parameters explored in exhaustive search option

---
 explore-params.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 explore-params.py

diff --git a/explore-params.py b/explore-params.py
new file mode 100644
index 0000000..60b7dd7
--- /dev/null
+++ b/explore-params.py
@@ -0,0 +1,23 @@
+# This file contains the PPCG parameter values that are explored.
+# The exploration script considers each combination of the parameter values.
+
+([
+  # Tile sizes
+  [(16,16), (32,32), (64,64)],
+
+  # Grid sizes
+  [(16,16), (32,32), (256,256), (1024,1024)],
+
+  # Block sizes
+  [(1,1), (1,2), (1,4), (1,8),
+   (16,16), (32,32), (64,64)],
+
+  #private memory
+  [False],
+
+  #Shared memory
+  [False, True],
+
+  #Fusion
+  ['max', 'min']
+])

From 33e1e2683e5511d547208a1f13156794d97aa28e Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 12 Mar 2015 14:38:53 +0100
Subject: [PATCH 03/34] Removed trailing whitespace

---
 heuristic_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index fa0e4ba..1668fa8 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -490,7 +490,7 @@ def logall(self):
         pass
 
 class SimulatedAnnealing(SearchStrategy):
-   """Search using simulated annealing""" 
+   """Search using simulated annealing"""
 
    def acceptance_probability(self, currentEnergy, newEnergy, temperature):
         if newEnergy < currentEnergy:

From fd13616fbe770acfb152dbbd451cdf4e09370440 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 12 Mar 2015 14:44:33 +0100
Subject: [PATCH 04/34] changed the default of tile dimensions and number of
 runs

---
 main.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index 8c1743d..451776a 100755
--- a/main.py
+++ b/main.py
@@ -132,7 +132,7 @@ def string_csv(string):
                                             help="how to run the generated binary from the auto-tuner",
                                             required=True)
     
-    runs = 1
+    runs = 5
     building_and_running_group.add_argument("--runs",
                                             type=int,
                                             metavar="<int>",
@@ -182,7 +182,7 @@ def string_csv(string):
                             help="consider only values in this range when tuning the tile size (default: %d-%d)" % (tile_size_range[0], tile_size_range[1]),
                             default=tile_size_range)
     
-    tile_dimensions = 1
+    tile_dimensions = 3
     ppcg_group.add_argument("--tile-dimensions",
                             type=int,
                             metavar="<int>",

From 4e64fa0c33e049f0f2d74a5d4700d4996c8c98e1 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 12 Mar 2015 15:11:23 +0100
Subject: [PATCH 05/34] changed the command line print of enumerated flag

---
 compiler_flags.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler_flags.py b/compiler_flags.py
index 309e519..6434c60 100644
--- a/compiler_flags.py
+++ b/compiler_flags.py
@@ -47,7 +47,7 @@ def get_command_line_string(self, value):
             else:
                 return ""
         else:
-            return "%s=%s" % (self.name, value.__str__( ))
+            return "%s %s" % (self.name, value.__str__( ))
     
 class Size:
     """Models a tile, block or grid size"""

From 7b38fb66a4d9787097d9d86b4278a162c93064d9 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 12 Mar 2015 15:13:58 +0100
Subject: [PATCH 06/34] added = to the command line print of enumerated flag

---
 compiler_flags.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler_flags.py b/compiler_flags.py
index 6434c60..309e519 100644
--- a/compiler_flags.py
+++ b/compiler_flags.py
@@ -47,7 +47,7 @@ def get_command_line_string(self, value):
             else:
                 return ""
         else:
-            return "%s %s" % (self.name, value.__str__( ))
+            return "%s=%s" % (self.name, value.__str__( ))
     
 class Size:
     """Models a tile, block or grid size"""

From e224aec869e5aca993209061936945b7c923123a Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 12 Mar 2015 15:31:32 +0100
Subject: [PATCH 07/34] fixed the build command for opencl compilation

---
 individual.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/individual.py b/individual.py
index 6ec05e3..7a61fd4 100644
--- a/individual.py
+++ b/individual.py
@@ -170,7 +170,10 @@ def ppcg_with_timeout(self, timeout=200):
         return True
 
     def build(self):
-        build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe'
+        if config.Arguments.target == enums.Target.cuda:
+            build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe'
+        else:
+            build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe'
         debug.verbose_message("Running '%s'" % build_cmd, __name__)
         start  = timeit.default_timer()
         proc   = subprocess.Popen(build_cmd, shell=True)  

From 81c67b43cc77495a6f5da4bb87eedb9980aae88d Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 12 Mar 2015 15:47:31 +0100
Subject: [PATCH 08/34] fixed the ppcg command line output file options for
 opencl

---
 individual.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/individual.py b/individual.py
index 7a61fd4..bcb73a1 100644
--- a/individual.py
+++ b/individual.py
@@ -143,7 +143,12 @@ def ppcg(self):
                                                                     ' '.join(flag.get_command_line_string(self.ppcg_flags[flag]) for flag in self.ppcg_flags.keys()))
         
         os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags
-        cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()
+
+        if config.Arguments.target == enums.Targets.cuda:
+            cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()
+        else:
+            cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()+'_host.c'
+
         debug.verbose_message("Running '%s'" % cmd, __name__)
         #debug.verbose_message("Running '%s'" % self.ppcg_cmd_line_flags , __name__)
         start  = timeit.default_timer()
@@ -170,7 +175,7 @@ def ppcg_with_timeout(self, timeout=200):
         return True
 
     def build(self):
-        if config.Arguments.target == enums.Target.cuda:
+        if config.Arguments.target == enums.Targets.cuda:
             build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe'
         else:
             build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe'

From 9fd7121ff9a8bd7410513fdeba4b1c6e758fb42f Mon Sep 17 00:00:00 2001
From: Michael Kruse <autotuner@meinersbur.de>
Date: Thu, 12 Mar 2015 15:53:08 +0100
Subject: [PATCH 09/34] Fail if regex isn't found in program output

---
 individual.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/individual.py b/individual.py
index bcb73a1..fb3503c 100644
--- a/individual.py
+++ b/individual.py
@@ -218,14 +218,18 @@ def binary(self):
             if config.Arguments.execution_time_from_binary:
                 if not stdout:
                     raise internal_exceptions.BinaryRunException("Expected the binary to dump its execution time. Found nothing")
+                nmatchedlines = 0
                 for line in stdout.split(os.linesep):
                     line    = line.strip()
                     matches = time_regex.findall(line)
                     if matches:
+                        nmatchedlines += 1
                         try:
                             total_time += float(matches[0])
                         except:
                             raise internal_exceptions.BinaryRunException("Execution time '%s' is not in the required format" % matches[0])
+                if nmatchedlines == 0:
+                    raise internal_exceptions.BinaryRunException("Regular expression did not match anything on the program's output")
             else:
                 total_time += end - start
         self.status = status

From 4eb1cf71d7d5ec3a7f97a78f2d72945174ed9e2e Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Fri, 13 Mar 2015 13:53:16 +0100
Subject: [PATCH 10/34] removed the required flag for run cmd, as it is no
 longer required to be specified

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 451776a..0750c2c 100755
--- a/main.py
+++ b/main.py
@@ -130,7 +130,7 @@ def string_csv(string):
     building_and_running_group.add_argument("--run-cmd",
                                             metavar="<STRING>",
                                             help="how to run the generated binary from the auto-tuner",
-                                            required=True)
+                                            required=False)
     
     runs = 5
     building_and_running_group.add_argument("--runs",

From c5b79227eb6aeed681c6ec0cc1d41a067162f44a Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Fri, 13 Mar 2015 18:06:22 +0100
Subject: [PATCH 11/34] added support for pencil runtime libraries while
 building opencl target

---
 individual.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/individual.py b/individual.py
index bcb73a1..e22e773 100644
--- a/individual.py
+++ b/individual.py
@@ -178,7 +178,7 @@ def build(self):
         if config.Arguments.target == enums.Targets.cuda:
             build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe'
         else:
-            build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe'
+            build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' + ' -lprl -lOpenCL'
         debug.verbose_message("Running '%s'" % build_cmd, __name__)
         start  = timeit.default_timer()
         proc   = subprocess.Popen(build_cmd, shell=True)  
@@ -199,7 +199,7 @@ def deleteFile(self, fileName):
 
     def binary(self):
         #time_regex = re.compile(r'^(\d*\.\d+|\d+)$')
-        print config.Arguments.execution_time_regex
+        #print config.Arguments.execution_time_regex
         time_regex = re.compile(config.Arguments.execution_time_regex)
         total_time = 0.0
         status     = enums.Status.passed

From d73406117b0cb5546c314950a0e43201f362037b Mon Sep 17 00:00:00 2001
From: Michael Kruse <autotuner@meinersbur.de>
Date: Mon, 16 Mar 2015 16:50:26 +0100
Subject: [PATCH 12/34] Fix RunThread termination condition

---
 heuristic_search.py | 34 +++++++++++++++++++++++-----------
 individual.py       |  6 ++++++
 2 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index 1668fa8..d2c88ee 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -316,7 +316,7 @@ def run(self):
         global run_queue
         while True:
             testcase = compile_queue.get()
-            if testcase.get_ID() == -1:
+            if isinstance(testcase, individual.EndOfQueue):
                 run_queue.put(testcase)
                 break
 
@@ -326,19 +326,30 @@ def run(self):
 
 
 class RunThread(Thread):
+    def __init__(self, num_threads):
+        super(RunThread, self).__init__()
+        self.num_threads = num_threads
+
     def run(self):
         global run_queue
         best_time = float("inf")
         f = open(config.Arguments.results_file + ".log", 'a')
         f_iter = open('.lastiter', 'w')
         while True:
+            print('***run thread waiting')
             testcase = run_queue.get()
-            if testcase.get_ID() == -1:
-                try:
-                    os.remove('.lastiter')
-                except:
-                    pass
-                break
+            if isinstance(testcase, individual.EndOfQueue):
+                self.num_threads = self.num_threads - 1
+                print('***remaining threads: ' + str(self.num_threads))
+                if self.num_threads<=0:
+                    try:
+                       os.remove('.lastiter')
+                    except:
+                       pass
+                    print('***run thread exiting')
+                    break
+                continue
+            print('***run thread got job')
             testcase.run_with_timeout()
             f_iter.seek(0)
             f_iter.write(str(testcase.get_ID()))
@@ -414,9 +425,11 @@ def createExhaConfigs(self):
     def pipelineExec(self, combs):
         num_threads = config.Arguments.num_compile_threads
         for i in range(num_threads):
-            CompileThread().start()
+            t = CompileThread()
+            t.daemon = True
+            t.start()
 
-        RunThread().start()
+        RunThread(num_threads).start()
 
         cnt = 0
         for conf in combs:
@@ -426,9 +439,8 @@ def pipelineExec(self, combs):
             cnt += 1
             compile_queue.put(cur)
 
-        cur.set_ID(-1)
         for i in range(num_threads):
-            compile_queue.put(cur)
+            compile_queue.put(individual.EndOfQueue()) # So every CompileThread fetches one EndOfQueue element
        
 
     def run(self):
diff --git a/individual.py b/individual.py
index fb3503c..70d1d96 100644
--- a/individual.py
+++ b/individual.py
@@ -11,6 +11,12 @@
 import internal_exceptions
 import time
 
+class EndOfQueue:
+    def __init__(self):
+        pass
+
+
+
 def get_fittest(population):
     fittest = None
     for individual in population:

From 5f38aef736cdc3e1d652824c1458fd071814e58a Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Tue, 17 Mar 2015 14:00:07 +0100
Subject: [PATCH 13/34] added few heuristics to prune search space

---
 heuristic_search.py | 30 ++++++++++++++++++++++++++++--
 main.py             |  6 ++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index d2c88ee..0683637 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -336,7 +336,7 @@ def run(self):
         f = open(config.Arguments.results_file + ".log", 'a')
         f_iter = open('.lastiter', 'w')
         while True:
-            print('***run thread waiting')
+            #print('***run thread waiting')
             testcase = run_queue.get()
             if isinstance(testcase, individual.EndOfQueue):
                 self.num_threads = self.num_threads - 1
@@ -349,7 +349,7 @@ def run(self):
                     print('***run thread exiting')
                     break
                 continue
-            print('***run thread got job')
+            #print('***run thread got job')
             testcase.run_with_timeout()
             f_iter.seek(0)
             f_iter.write(str(testcase.get_ID()))
@@ -442,6 +442,22 @@ def pipelineExec(self, combs):
         for i in range(num_threads):
             compile_queue.put(individual.EndOfQueue()) # So every CompileThread fetches one EndOfQueue element
        
+    def tile_size_multiple_filter(self, conf):
+        tile_size = conf[0]
+        block_size = conf[1]
+
+        mul_factor = 1
+        for t, b in zip(tile_size, block_size):
+            if t < b:
+                return False
+            if t % b != 0:
+                return False
+            mul_factor *= t/b
+
+        if mul_factor > 36:
+            return False
+
+        return True
 
     def run(self):
         self.individuals = []
@@ -453,6 +469,16 @@ def run(self):
 
         cnt = 0
         combs = itertools.product(*paramValues)
+
+
+        if config.Arguments.filter_testcases:
+            #Filter out only test cases based on heusristics such as tile size is multiple of block size etc.. 
+            combs = filter(self.tile_size_multiple_filter, combs)
+            #Filter out only test cases where shared memory is true
+            combs = filter(lambda conf: conf[3] == True, combs)
+            #Filter out only test cases where private memory is true
+            combs = filter(lambda conf: conf[4] == True, combs)
+
         if config.Arguments.parallelize_compilation:
             self.pipelineExec(combs)
             return
diff --git a/main.py b/main.py
index 0750c2c..8920d58 100755
--- a/main.py
+++ b/main.py
@@ -367,6 +367,12 @@ def string_csv(string):
                          help="explore all fusion structures [max, min] ",
                          default=False)
 
+
+    parser_exhaustive.add_argument("--filter-testcases",
+                         action="store_true",
+                         help="few heursitics to reduce search space such as tile size multiple of block size, tile size > block size etc..",
+                         default=False)
+
     parser_exhaustive.add_argument("--parallelize-compilation",
                          action="store_true",
                          help="parallelize ppcg compilation and execution of test case",

From cd58e363578e7140c59c6d0d56d173f0f11bd3c9 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Tue, 17 Mar 2015 17:31:09 +0100
Subject: [PATCH 14/34] added support for last iter

---
 heuristic_search.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/heuristic_search.py b/heuristic_search.py
index 0683637..bc5cdf8 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -422,7 +422,23 @@ def createExhaConfigs(self):
         paramValues = [tile_sizes, block_sizes, grid_sizes, private_mem, shared_mem, fusion]
         return paramValues
 
+    def get_last_iter(self):
+        if os.path.isfile(".lastiter"):
+            print("found last iter")
+            try:
+                f_iter = open(".lastiter", 'r+')
+                start_iter = int(f_iter.readline())
+            except:
+                start_iter = 0
+            print("starting from test case = ", start_iter)
+        else:
+            start_iter = 0
+
+        return start_iter
+
     def pipelineExec(self, combs):
+
+        start_iter = self.get_last_iter()
         num_threads = config.Arguments.num_compile_threads
         for i in range(num_threads):
             t = CompileThread()
@@ -433,6 +449,9 @@ def pipelineExec(self, combs):
 
         cnt = 0
         for conf in combs:
+            if cnt < start_iter:
+                cnt += 1
+                continue
             print '---- Configuration ' + str(cnt) + ': ' + str(conf)
             cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4])
             cur.set_ID(cnt)
@@ -484,10 +503,15 @@ def run(self):
             return
 
         f = open(config.Arguments.results_file + ".log", 'a')
+        start_iter = self.get_last_iter()
+
         best_time = 0
         #print 'Parameter values to be explored: ' + str(paramValues)
         #print 'Number of configurations: ' + str(self.countConfigs(paramValues))
         for conf in combs:
+            if cnt < start_iter:
+                cnt += 1
+                continue
             print '---- Configuration ' + str(cnt) + ': ' + str(conf)
             cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5])
             cur.set_ID(cnt)

From 7ee2e676df70506e2e11566fe04388fe27a6a82b Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Tue, 17 Mar 2015 18:05:46 +0100
Subject: [PATCH 15/34] fixed the execution time logging bug

---
 heuristic_search.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index bc5cdf8..dfe0fc7 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -430,6 +430,7 @@ def get_last_iter(self):
                 start_iter = int(f_iter.readline())
             except:
                 start_iter = 0
+                pass
             print("starting from test case = ", start_iter)
         else:
             start_iter = 0
@@ -503,9 +504,9 @@ def run(self):
             return
 
         f = open(config.Arguments.results_file + ".log", 'a')
-        start_iter = self.get_last_iter()
+        start_iter = 0
 
-        best_time = 0
+        best_time = float("inf")
         #print 'Parameter values to be explored: ' + str(paramValues)
         #print 'Number of configurations: ' + str(self.countConfigs(paramValues))
         for conf in combs:

From 9df1195d28f9c9d73819938b978bd4caa35556f2 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Tue, 17 Mar 2015 18:26:48 +0100
Subject: [PATCH 16/34] fixed the undefined variable bug

---
 heuristic_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index dfe0fc7..ae28dd5 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -531,7 +531,7 @@ def run(self):
                 self.individuals.append(cur)
                 best_time = cur.execution_time
                 best_run = cur
-                f.write("\n Best iter so far = "+ str(i) + "\n")
+                f.write("\n Best iter so far = "+ str(cnt) + "\n")
                 f.write(str(best_run))
                 f.flush()
 

From 3fbf390c90657f90abd2b937263da7519fff06fa Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Wed, 18 Mar 2015 14:50:35 +0100
Subject: [PATCH 17/34] added filter to remove testcases which have invalid
 work group size

---
 heuristic_search.py | 4 ++++
 main.py             | 9 ++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index ae28dd5..5da17ee 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -466,6 +466,10 @@ def tile_size_multiple_filter(self, conf):
         tile_size = conf[0]
         block_size = conf[1]
 
+        work_group_size = reduce(lambda x,y: x*y, block_size)
+        if work_group_size > config.Arguments.max_work_group_size:
+            return False
+
         mul_factor = 1
         for t, b in zip(tile_size, block_size):
             if t < b:
diff --git a/main.py b/main.py
index 8920d58..20a5111 100755
--- a/main.py
+++ b/main.py
@@ -386,7 +386,14 @@ def string_csv(string):
                                default=num_compile_threads,
                                help="number of threads to use for ppcg compilation (default: %d)" % num_compile_threads)
     
-
+    
+    max_work_group_size = 256 
+    parser_exhaustive.add_argument("--max-work-group-size",
+                               type=int,
+                               metavar="<int>",
+                               default=max_work_group_size,
+                               help="max work group size, test cases with work group size greater than this value will be filtered out (default: %d)" % num_compile_threads)
+    
     
     timeout = 500
     parser_exhaustive.add_argument("--timeout-ppcg",

From 58ecfaccae2f53041295648fff25083979a2f0f3 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 19 Mar 2015 15:20:29 +0100
Subject: [PATCH 18/34] added summarise function for parallelized execution

---
 heuristic_search.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index 5da17ee..2b4f1c3 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -324,11 +324,11 @@ def run(self):
             testcase.build()
             run_queue.put(testcase)
 
-
 class RunThread(Thread):
     def __init__(self, num_threads):
         super(RunThread, self).__init__()
         self.num_threads = num_threads
+        self.individuals = []
 
     def run(self):
         global run_queue
@@ -344,6 +344,8 @@ def run(self):
                 if self.num_threads<=0:
                     try:
                        os.remove('.lastiter')
+                       self.summarise()
+                       self.logall()
                     except:
                        pass
                     print('***run thread exiting')
@@ -355,11 +357,29 @@ def run(self):
             f_iter.write(str(testcase.get_ID()))
 
             if testcase.execution_time < best_time and testcase.execution_time != 0 and testcase.status == enums.Status.passed: 
+                self.individuals.append(testcase)
                 best_time = testcase.execution_time
                 f.write("\n Best iter so far = \n")
                 f.write(str(testcase))
                 f.flush()
 
+    def summarise(self):
+        print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30))
+        try:
+            fittest = individual.get_fittest(self.individuals)
+            debug.summary_message("The fittest individual had execution time %f seconds" % (fittest.execution_time)) 
+            debug.summary_message("To replicate, pass the following to PPCG:")
+            debug.summary_message(fittest.ppcg_cmd_line_flags, False)
+        except internal_exceptions.NoFittestException:
+           pass
+
+    def logall(self):
+        print("%s Log of all runs %s" %('*' * 30, '*' * 30))
+        for i in self.individuals:
+            print(i)
+            debug.summary_message(i.ppcg_cmd_line_flags, False)
+        pass
+
 class Exhaustive(SearchStrategy):
     """Exhaustive search all the values in the specified range or """
     """all combinations provided in explore-params.py file"""

From 9b6c08217501e905d866def3936e4e19d77864d6 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Thu, 19 Mar 2015 15:21:35 +0100
Subject: [PATCH 19/34] fixed the block size range bug

---
 heuristic_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index 2b4f1c3..41aed1d 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -413,7 +413,7 @@ def createExhaConfigs(self):
         else:
             block_size_range = range(block_size_lb, block_size_ub)
 
-        block_sizes = itertools.product(tile_size_range, repeat=config.Arguments.block_dimensions)
+        block_sizes = itertools.product(block_size_range, repeat=config.Arguments.block_dimensions)
 
         grid_size_lb = config.Arguments.grid_size_range[0] 
         grid_size_ub = config.Arguments.grid_size_range[1] 

From 46c58663283aa207065047a97aed3c615d3b4e8b Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Fri, 20 Mar 2015 15:06:57 +0100
Subject: [PATCH 20/34] added min work group size filtering

---
 heuristic_search.py |  3 +++
 main.py             | 10 +++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index 41aed1d..3649328 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -490,6 +490,9 @@ def tile_size_multiple_filter(self, conf):
         if work_group_size > config.Arguments.max_work_group_size:
             return False
 
+        if work_group_size < config.Arguments.min_work_group_size:
+            return False
+
         mul_factor = 1
         for t, b in zip(tile_size, block_size):
             if t < b:
diff --git a/main.py b/main.py
index 20a5111..2f076e3 100755
--- a/main.py
+++ b/main.py
@@ -392,9 +392,17 @@ def string_csv(string):
                                type=int,
                                metavar="<int>",
                                default=max_work_group_size,
-                               help="max work group size, test cases with work group size greater than this value will be filtered out (default: %d)" % num_compile_threads)
+                               help="max work group size, test cases with work group size greater than this value will be filtered out (default: %d)" % max_work_group_size)
     
     
+    
+    min_work_group_size = 1 
+    parser_exhaustive.add_argument("--min-work-group-size",
+                               type=int,
+                               metavar="<int>",
+                               default=min_work_group_size,
+                               help="min work group size, test cases with work group size lesser than this value will be filtered out (default: %d)" % min_work_group_size)
+    
     timeout = 500
     parser_exhaustive.add_argument("--timeout-ppcg",
                                type=int,

From ccfdfba71c5e66227321510df56bd681212f115b Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Fri, 20 Mar 2015 16:21:43 +0100
Subject: [PATCH 21/34] added execution time check for multiple runs case

---
 heuristic_search.py |  2 +-
 individual.py       | 29 ++++++++++++++++++++---------
 main.py             |  8 ++++++++
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index 3649328..1e72045 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -352,7 +352,7 @@ def run(self):
                     break
                 continue
             #print('***run thread got job')
-            testcase.run_with_timeout()
+            testcase.run(best_time)
             f_iter.seek(0)
             f_iter.write(str(testcase.get_ID()))
 
diff --git a/individual.py b/individual.py
index 4f0f47c..ccb4274 100644
--- a/individual.py
+++ b/individual.py
@@ -1,4 +1,4 @@
-import timeit
+umport timeit
 import os
 import re
 import debug
@@ -136,13 +136,14 @@ def checkforpause(self):
                 #print("Auto tuning restarted")
                 break
 
-    def compile(self, timeout=2):
+    def compile(self, timeout=float("inf")):
         self.checkforpause()
-        sucess=self.ppcg_with_timeout()
-        if not sucess:
-            return
+        self.ppcg()
+        #sucess=self.ppcg_with_timeout(timeout)
+        #if not sucess:
+        #    return
         self.build()
-        self.run_with_timeout(timeout)
+        self.binary(timeout)
 
     def ppcg(self):
         self.ppcg_cmd_line_flags = "--target=%s --dump-sizes %s" % (config.Arguments.target, 
@@ -168,7 +169,7 @@ def ppcg(self):
         self.size_data = compiler_flags.SizesFlag.parse_PPCG_dump_sizes(stderr)
         
 
-    def ppcg_with_timeout(self, timeout=200):
+    def ppcg_with_timeout(self, timeout=float("inf")):
         thread = threading.Thread(target=self.ppcg)
         thread.start()
         thread.join(timeout)
@@ -203,12 +204,13 @@ def deleteFile(self, fileName):
         except:
             pass
 
-    def binary(self):
+    def binary(self, best_execution_time=float("inf")):
         #time_regex = re.compile(r'^(\d*\.\d+|\d+)$')
         #print config.Arguments.execution_time_regex
         time_regex = re.compile(config.Arguments.execution_time_regex)
         total_time = 0.0
         status     = enums.Status.passed
+        num_actual_runs = 0
         for run in xrange(1,config.Arguments.runs+1):
             run_cmd = './'+self.file_name()+'.exe'
             #run_cmd = config.Arguments.run_cmd
@@ -238,9 +240,18 @@ def binary(self):
                     raise internal_exceptions.BinaryRunException("Regular expression did not match anything on the program's output")
             else:
                 total_time += end - start
+
+            num_actual_runs +=1
+            per_var = 1 + config.Arguments.max_exec_time_var/100
+            time = per_var  * best_execution_time
+            if total_time > (best_execution_time + per_var  * best_execution_time ):
+                #print "Execution time of cur test case is worst than the best so far, stopping at first run" 
+                break
+
+            
         self.status = status
         config.time_binary += total_time
-        self.execution_time = total_time/config.Arguments.runs
+        self.execution_time = total_time/num_actual_runs
 
         self.deleteFile(self.file_name()+'.exe')
         self.deleteFile(self.file_name()+'_host.c')
diff --git a/main.py b/main.py
index 2f076e3..4ab7977 100755
--- a/main.py
+++ b/main.py
@@ -410,6 +410,14 @@ def string_csv(string):
                                default=timeout,
                                help="timeout for ppcg compilation and testcase execution (default: %d sec)" % num_compile_threads)
     
+    
+    max_exec_time_var = 20 
+    parser_exhaustive.add_argument("--max-exec-time-var",
+                               type=int,
+                               metavar="<int>",
+                               default=max_exec_time_var,
+                               help="max allowed variance for execution time. If the execution time of a test case is greater that best so far + max-exec-time-var then number of runs is restricted to 1 (default: %d )" % max_exec_time_var)
+    
     parser.parse_args(namespace=config.Arguments)
   
 if __name__ == "__main__":

From 6d210f48567111fe14cd2d1dd4710c4c59e4e2e5 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Fri, 20 Mar 2015 16:29:34 +0100
Subject: [PATCH 22/34] fixed the bug in multiple execution time filtering

---
 individual.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/individual.py b/individual.py
index ccb4274..7ac278f 100644
--- a/individual.py
+++ b/individual.py
@@ -1,4 +1,4 @@
-umport timeit
+import timeit
 import os
 import re
 import debug
@@ -244,7 +244,7 @@ def binary(self, best_execution_time=float("inf")):
             num_actual_runs +=1
             per_var = 1 + config.Arguments.max_exec_time_var/100
             time = per_var  * best_execution_time
-            if total_time > (best_execution_time + per_var  * best_execution_time ):
+            if total_time > time * num_actual_runs:
                 #print "Execution time of cur test case is worst than the best so far, stopping at first run" 
                 break
 

From 944a9b7ba0392deb0e8c31e060697dbb451e918a Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Sun, 22 Mar 2015 00:38:01 +0100
Subject: [PATCH 23/34] fixed the divide by zero error

---
 individual.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/individual.py b/individual.py
index 7ac278f..57feba6 100644
--- a/individual.py
+++ b/individual.py
@@ -251,7 +251,10 @@ def binary(self, best_execution_time=float("inf")):
             
         self.status = status
         config.time_binary += total_time
-        self.execution_time = total_time/num_actual_runs
+        if num_actual_runs != 0:
+            self.execution_time = total_time/num_actual_runs
+        else 
+            self.execution_time = total_time
 
         self.deleteFile(self.file_name()+'.exe')
         self.deleteFile(self.file_name()+'_host.c')

From e864259c21c6eed5caad3cb8ac282e0cb7b454d4 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Sun, 22 Mar 2015 13:45:40 +0100
Subject: [PATCH 24/34] added the missing : for else

---
 individual.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/individual.py b/individual.py
index 57feba6..ae77e2e 100644
--- a/individual.py
+++ b/individual.py
@@ -253,7 +253,7 @@ def binary(self, best_execution_time=float("inf")):
         config.time_binary += total_time
         if num_actual_runs != 0:
             self.execution_time = total_time/num_actual_runs
-        else 
+        else:
             self.execution_time = total_time
 
         self.deleteFile(self.file_name()+'.exe')

From 60d3e1263b4484a0035180fdc12c370ed29a0309 Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Mon, 23 Mar 2015 15:43:10 +0100
Subject: [PATCH 25/34] added cmd string modification option

---
 individual.py | 16 +++++++++++++---
 main.py       | 18 ++++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/individual.py b/individual.py
index ae77e2e..9739a56 100644
--- a/individual.py
+++ b/individual.py
@@ -85,6 +85,9 @@ def get_ID_init():
         return Individual.ID
     
     def file_name(self):
+        if config.Arguments.binary_file_name:
+            return config.Arguments.binary_file_name
+
         return 'testcase'+str(self.ID)
         #return 'gemm'
 
@@ -151,7 +154,9 @@ def ppcg(self):
         
         os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags
 
-        if config.Arguments.target == enums.Targets.cuda:
+        if config.Arguments.cmd_string_complete:
+            cmd = config.Arguments.ppcg_cmd
+        elif config.Arguments.target == enums.Targets.cuda:
             cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()
         else:
             cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()+'_host.c'
@@ -182,7 +187,9 @@ def ppcg_with_timeout(self, timeout=float("inf")):
         return True
 
     def build(self):
-        if config.Arguments.target == enums.Targets.cuda:
+        if config.Arguments.cmd_string_complete:
+            build_cmd = config.Arguments.build_cmd
+        elif config.Arguments.target == enums.Targets.cuda:
             build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe'
         else:
             build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' + ' -lprl -lOpenCL'
@@ -212,7 +219,10 @@ def binary(self, best_execution_time=float("inf")):
         status     = enums.Status.passed
         num_actual_runs = 0
         for run in xrange(1,config.Arguments.runs+1):
-            run_cmd = './'+self.file_name()+'.exe'
+            if config.Arguments.cmd_string_complete:
+                run_cmd = config.Arguments.run_cmd
+            else:
+                run_cmd = './'+self.file_name()+'.exe '+config.Arguments.run_cmd_input
             #run_cmd = config.Arguments.run_cmd
             debug.verbose_message("Run #%d of '%s'" % (run, run_cmd), __name__)
             start = timeit.default_timer()
diff --git a/main.py b/main.py
index 4ab7977..a80f5ae 100755
--- a/main.py
+++ b/main.py
@@ -132,6 +132,19 @@ def string_csv(string):
                                             help="how to run the generated binary from the auto-tuner",
                                             required=False)
     
+    
+    building_and_running_group.add_argument("--run-cmd-input",
+                                            metavar="<STRING>",
+                                            help="input to the generated binary from the auto-tuner",
+                                            required=False,
+                                            default="")
+
+    
+    building_and_running_group.add_argument("--cmd-string-complete",
+                                            action="store_true",
+                                            help="dont modify the cmd string, note the output file nmaes should be part of cmd lines",
+                                            default=False)
+
     runs = 5
     building_and_running_group.add_argument("--runs",
                                             type=int,
@@ -144,6 +157,11 @@ def string_csv(string):
                                             help="assume that the binary prints its execution time to standard output (rather than measuring the execution time through Python)",
                                             default=False)
     
+    building_and_running_group.add_argument("--binary-file-name",
+                                            metavar="<STRING>",
+                                            help="name of the generated binary from the auto-tuner",
+                                            required=False,
+                                            default="")
     
     building_and_running_group.add_argument("--execution-time-regex",
                             type=str,

From 4a5baa8eb98922d16a13b00a2a4fbaa92fadc409 Mon Sep 17 00:00:00 2001
From: Chandan <Chandan1986g@gmail.com>
Date: Mon, 23 Mar 2015 21:17:14 +0100
Subject: [PATCH 26/34] fixed the ppcg cmd line bug with cmd string complete
 option

---
 individual.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/individual.py b/individual.py
index 9739a56..38c095c 100644
--- a/individual.py
+++ b/individual.py
@@ -155,7 +155,7 @@ def ppcg(self):
         os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags
 
         if config.Arguments.cmd_string_complete:
-            cmd = config.Arguments.ppcg_cmd
+            cmd = config.Arguments.ppcg_cmd+ ' '+self.ppcg_cmd_line_flags
         elif config.Arguments.target == enums.Targets.cuda:
             cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()
         else:

From 9825823ead6c3a425fbcb9820f0072d6ab500d2c Mon Sep 17 00:00:00 2001
From: chandan <chandan@fermi.(none)>
Date: Mon, 11 May 2015 15:36:24 +0200
Subject: [PATCH 27/34] fixed the shared_mem private_mem option interchange bug

---
 explore-params.py   | 4 ++--
 heuristic_search.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/explore-params.py b/explore-params.py
index 60b7dd7..b8c5f93 100644
--- a/explore-params.py
+++ b/explore-params.py
@@ -12,10 +12,10 @@
   [(1,1), (1,2), (1,4), (1,8),
    (16,16), (32,32), (64,64)],
 
-  #private memory
+  #Shared memory
   [False],
 
-  #Shared memory
+  #private memory
   [False, True],
 
   #Fusion
diff --git a/heuristic_search.py b/heuristic_search.py
index 1e72045..194111a 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -439,7 +439,7 @@ def createExhaConfigs(self):
         else:
             fusion = ['max']
 
-        paramValues = [tile_sizes, block_sizes, grid_sizes, private_mem, shared_mem, fusion]
+        paramValues = [tile_sizes, block_sizes, grid_sizes, shared_mem, private_mem, fusion]
         return paramValues
 
     def get_last_iter(self):

From 9d2907f59c3faa22b3ccab9c2113701c5d9215e9 Mon Sep 17 00:00:00 2001
From: chandanReddy <chandan1986g@gmail.com>
Date: Thu, 7 Apr 2016 14:55:59 +0200
Subject: [PATCH 28/34] integrated into prl and added multiple kernel tuning
 support

---
 enums.py            |  1 +
 heuristic_search.py | 27 ++++++++++++++++++++++++---
 individual.py       | 18 ++++++++++++++----
 main.py             | 21 +++++++++++++++++----
 4 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/enums.py b/enums.py
index 0af2b4c..7c0d7bb 100644
--- a/enums.py
+++ b/enums.py
@@ -1,6 +1,7 @@
 class Targets:
     cuda   = "cuda"
     opencl = "opencl"
+    prl    = "prl"
     
 class Crossover:
     one_point = "one_point"
diff --git a/heuristic_search.py b/heuristic_search.py
index 194111a..b206de5 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -13,6 +13,7 @@
 import os
 from Queue import Queue
 from threading import Thread
+import sys
 
 class SearchStrategy:
     """Abstract class for a search strategy"""
@@ -508,6 +509,14 @@ def tile_size_multiple_filter(self, conf):
 
     def run(self):
         self.individuals = []
+        self.output_stream = open(config.Arguments.results_file, 'w')
+        for k in config.Arguments.kernels_to_tune:
+            self.individuals = []
+            self.tune_kernel(k)
+            self.print_summary()
+        self.output_stream.close()
+
+    def tune_kernel(self, k):
 
         if config.Arguments.params_from_file:
             paramValues = self.readParamValues()
@@ -522,9 +531,9 @@ def run(self):
             #Filter out only test cases based on heusristics such as tile size is multiple of block size etc.. 
             combs = filter(self.tile_size_multiple_filter, combs)
             #Filter out only test cases where shared memory is true
-            combs = filter(lambda conf: conf[3] == True, combs)
+            #combs = filter(lambda conf: conf[3] == True, combs)
             #Filter out only test cases where private memory is true
-            combs = filter(lambda conf: conf[4] == True, combs)
+            #combs = filter(lambda conf: conf[4] == True, combs)
 
         if config.Arguments.parallelize_compilation:
             self.pipelineExec(combs)
@@ -541,7 +550,7 @@ def run(self):
                 cnt += 1
                 continue
             print '---- Configuration ' + str(cnt) + ': ' + str(conf)
-            cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5])
+            cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5], k)
             cur.set_ID(cnt)
             cnt += 1
             cur.run(best_time)
@@ -579,6 +588,18 @@ def logall(self):
             debug.summary_message(i.ppcg_cmd_line_flags, False)
         pass
 
+
+    def print_summary(self):
+        old_stdout    = sys.stdout
+        try:
+            if config.Arguments.results_file is not None:
+                sys.stdout    = self.output_stream
+                self.summarise()
+                #self.logall()
+        finally:
+            if config.Arguments.results_file is not None:
+                sys.stdout = old_stdout
+
 class SimulatedAnnealing(SearchStrategy):
    """Search using simulated annealing"""
 
diff --git a/individual.py b/individual.py
index 38c095c..15ca0f4 100644
--- a/individual.py
+++ b/individual.py
@@ -30,10 +30,11 @@ def get_fittest(population):
         raise internal_exceptions.NoFittestException("None of the individuals among this population completed successfully, hence there is no fittest individual")
     return fittest
 
-def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, fusion='max'):
+def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, fusion='max', k=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL):
     individual = Individual()   
     per_kernel_size_info = collections.OrderedDict()
-    per_kernel_size_info[compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL] = compiler_flags.SizeTuple(tile_size, block_size, grid_size)
+    per_kernel_size_info[k] = compiler_flags.SizeTuple(tile_size, block_size, grid_size)
+    individual.kernel_num = k
 
     #for flag in compiler_flags.PPCG.optimisation_flags:
     #    print(flag)
@@ -106,6 +107,7 @@ def __init__(self):
         self.status           = enums.Status.failed
         self.execution_time   = float("inf") 
         self.num = 0
+        self.kernel_num=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL
         
     def all_flags(self):
         return self.ppcg_flags.keys() + self.cc_flags.keys() + self.cxx_flags.keys() + self.nvcc_flags.keys()
@@ -214,7 +216,16 @@ def deleteFile(self, fileName):
     def binary(self, best_execution_time=float("inf")):
         #time_regex = re.compile(r'^(\d*\.\d+|\d+)$')
         #print config.Arguments.execution_time_regex
-        time_regex = re.compile(config.Arguments.execution_time_regex)
+        if config.Arguments.prl_profiling:
+            if self.kernel_num == compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL:
+                re_str = r'compute\s*:\s*(\d*.\d+)ms'
+            else:
+                re_str = r'kernel'+str(self.kernel_num)+'\s*:\s*(\d*.\d+)ms'
+        else:
+            re_str = config.Arguments.execution_time_regex
+
+        print re_str
+        time_regex = re.compile(re_str)
         total_time = 0.0
         status     = enums.Status.passed
         num_actual_runs = 0
@@ -258,7 +269,6 @@ def binary(self, best_execution_time=float("inf")):
                 #print "Execution time of cur test case is worst than the best so far, stopping at first run" 
                 break
 
-            
         self.status = status
         config.time_binary += total_time
         if num_actual_runs != 0:
diff --git a/main.py b/main.py
index a80f5ae..b0ca718 100755
--- a/main.py
+++ b/main.py
@@ -9,6 +9,7 @@
 import sys
 
 def print_summary(search):
+    return 
     try:
         if config.Arguments.results_file is not None:
             old_stdout    = sys.stdout
@@ -145,7 +146,7 @@ def string_csv(string):
                                             help="dont modify the cmd string, note the output file nmaes should be part of cmd lines",
                                             default=False)
 
-    runs = 5
+    runs = 1
     building_and_running_group.add_argument("--runs",
                                             type=int,
                                             metavar="<int>",
@@ -168,11 +169,16 @@ def string_csv(string):
                             help="regular expression format for execution time",
                             default=r'^(\d*\.\d+|\d+)$')
     
+    
+    building_and_running_group.add_argument("--prl-profiling",
+                                            action="store_true",
+                                            help="Using prl profiling, used to extract timing info from prl profiling output",
+                                            default=False)
     # PPCG options
     ppcg_group = parser.add_argument_group("PPCG arguments")
     
     ppcg_group.add_argument("--target",
-                            choices=[enums.Targets.cuda, enums.Targets.opencl],
+                            choices=[enums.Targets.cuda, enums.Targets.opencl, enums.Targets.prl],
                             help="the target to generate code for",
                             default=enums.Targets.opencl)
     
@@ -192,6 +198,13 @@ def string_csv(string):
                             metavar="<LIST>",
                             help="consider only these values when tuning the shared memory size (default: %s)" % (shared_memory_possibilties),
                             default=shared_memory_possibilties)
+     
+    kernels_list = [compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL]
+    ppcg_group.add_argument("--kernels-to-tune",
+                            type=int_csv,
+                            metavar="<LIST>",
+                            help="consider only these kernels values when tuning (default: all)",
+                            default=kernels_list)
     
     tile_size_range = (2**0, 2**6)
     ppcg_group.add_argument("--tile-size-range",
@@ -389,7 +402,7 @@ def string_csv(string):
     parser_exhaustive.add_argument("--filter-testcases",
                          action="store_true",
                          help="few heursitics to reduce search space such as tile size multiple of block size, tile size > block size etc..",
-                         default=False)
+                         default=True)
 
     parser_exhaustive.add_argument("--parallelize-compilation",
                          action="store_true",
@@ -405,7 +418,7 @@ def string_csv(string):
                                help="number of threads to use for ppcg compilation (default: %d)" % num_compile_threads)
     
     
-    max_work_group_size = 256 
+    max_work_group_size = 1024
     parser_exhaustive.add_argument("--max-work-group-size",
                                type=int,
                                metavar="<int>",

From a5ed122132c0319a023dc5cf71042861a716611e Mon Sep 17 00:00:00 2001
From: chandanReddy <chandan1986g@gmail.com>
Date: Thu, 7 Apr 2016 16:55:58 +0200
Subject: [PATCH 29/34] added concurrent kernel tuning option, enabled by
 default

---
 heuristic_search.py | 35 ++++++++++++++++++++++++++++++++++-
 individual.py       | 30 ++++++++++++++++++++++++++++++
 main.py             |  5 +++++
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index b206de5..1f11bff 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -509,7 +509,14 @@ def tile_size_multiple_filter(self, conf):
 
     def run(self):
         self.individuals = []
+        self.multi_kernel = False
         self.output_stream = open(config.Arguments.results_file, 'w')
+        if config.Arguments.no_concurrent_kernel_tuning:
+            self.multi_kernel = True
+            self.tune_kernel(compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL)
+            self.print_summary()
+            return
+            
         for k in config.Arguments.kernels_to_tune:
             self.individuals = []
             self.tune_kernel(k)
@@ -543,6 +550,12 @@ def tune_kernel(self, k):
         start_iter = 0
 
         best_time = float("inf")
+        best_kernel_time = [] 
+        self.best_kernel_run = []
+        if self.multi_kernel:
+            for s in config.Arguments.kernels_to_tune:
+                best_kernel_time.append(float("inf"))
+                self.best_kernel_run.append(0)
         #print 'Parameter values to be explored: ' + str(paramValues)
         #print 'Number of configurations: ' + str(self.countConfigs(paramValues))
         for conf in combs:
@@ -563,6 +576,15 @@ def tune_kernel(self, k):
             if cur.execution_time == 0:
                 continue
 
+            if self.multi_kernel:
+                for k in config.Arguments.kernels_to_tune:
+                    if cur.per_kernel_time[k] < best_kernel_time[k]:
+                        best_kernel_time[k] = cur.per_kernel_time[k]
+                        self.best_kernel_run[k] = cur
+                        f.write("\n Best time so far for kernel "+str(k) + " ID " +  str(cnt) + " kernel time = " + str(best_kernel_time[k]))
+                        f.write(str(cur.ppcg_cmd_line_flags))
+                        f.flush()
+
             if cur.execution_time < best_time and cur.status == enums.Status.passed:
                 self.individuals.append(cur)
                 best_time = cur.execution_time
@@ -571,6 +593,14 @@ def tune_kernel(self, k):
                 f.write(str(best_run))
                 f.flush()
 
+            
+    def summarise_per_kernel(self):
+        for k in config.Arguments.kernels_to_tune:
+            print "Best config for kernel " + str(k)
+            print("had execution time %f ms" % (self.best_kernel_run[k].execution_time)) 
+            print("To replicate, use the following configuration:")
+            print(self.best_kernel_run[k].ppcg_cmd_line_flags, False)
+
     def summarise(self):
         print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30))
         try:
@@ -594,7 +624,10 @@ def print_summary(self):
         try:
             if config.Arguments.results_file is not None:
                 sys.stdout    = self.output_stream
-                self.summarise()
+                if self.multi_kernel:
+                    self.summarise_per_kernel()
+                else:
+                    self.summarise()
                 #self.logall()
         finally:
             if config.Arguments.results_file is not None:
diff --git a/individual.py b/individual.py
index 15ca0f4..8818d17 100644
--- a/individual.py
+++ b/individual.py
@@ -108,6 +108,9 @@ def __init__(self):
         self.execution_time   = float("inf") 
         self.num = 0
         self.kernel_num=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL
+        self.per_kernel_time = [] 
+        for k in config.Arguments.kernels_to_tune:
+            self.per_kernel_time.append(float("inf"))
         
     def all_flags(self):
         return self.ppcg_flags.keys() + self.cc_flags.keys() + self.cxx_flags.keys() + self.nvcc_flags.keys()
@@ -213,6 +216,32 @@ def deleteFile(self, fileName):
         except:
             pass
 
+    def extract_kernel_time(self, kernel_num, stdout):
+        re_str = r'kernel'+str(kernel_num)+'\s*:\s*(\d*.\d+)ms'
+        print re_str
+        time_regex = re.compile(re_str)
+        total_time = 0.0
+
+        nmatchedlines = 0
+        for line in stdout.split(os.linesep):
+            line    = line.strip()
+            matches = time_regex.findall(line)
+            if matches:
+                nmatchedlines += 1
+                try:
+                    total_time += float(matches[0])
+                except:
+                    raise internal_exceptions.BinaryRunException("Execution time '%s' is not in the required format" % matches[0])
+        if nmatchedlines == 0:
+            total_time = float("inf")
+        return total_time
+
+    def update_kernel_times(self, stdout):
+        if not config.Arguments.prl_profiling:
+            return
+        for k in config.Arguments.kernels_to_tune:
+            self.per_kernel_time[k] = self.extract_kernel_time(k, stdout)
+
     def binary(self, best_execution_time=float("inf")):
         #time_regex = re.compile(r'^(\d*\.\d+|\d+)$')
         #print config.Arguments.execution_time_regex
@@ -247,6 +276,7 @@ def binary(self, best_execution_time=float("inf")):
             if config.Arguments.execution_time_from_binary:
                 if not stdout:
                     raise internal_exceptions.BinaryRunException("Expected the binary to dump its execution time. Found nothing")
+                self.update_kernel_times(stdout)
                 nmatchedlines = 0
                 for line in stdout.split(os.linesep):
                     line    = line.strip()
diff --git a/main.py b/main.py
index b0ca718..fb197b6 100755
--- a/main.py
+++ b/main.py
@@ -174,6 +174,11 @@ def string_csv(string):
                                             action="store_true",
                                             help="Using prl profiling, used to extract timing info from prl profiling output",
                                             default=False)
+
+    building_and_running_group.add_argument("--no-concurrent-kernel-tuning",
+                                            action="store_false",
+                                            help="Do not tune multiple kernels at the same time",
+                                            default=True)
     # PPCG options
     ppcg_group = parser.add_argument_group("PPCG arguments")
     

From 2807d3f843f47ebb56d786e4d48071f389fca245 Mon Sep 17 00:00:00 2001
From: chandanReddy <chandan1986g@gmail.com>
Date: Mon, 11 Apr 2016 15:53:30 +0200
Subject: [PATCH 30/34] changed the order of grid and block sizes in explore
 params

---
 explore-params.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/explore-params.py b/explore-params.py
index b8c5f93..dd861ad 100644
--- a/explore-params.py
+++ b/explore-params.py
@@ -5,13 +5,13 @@
   # Tile sizes
   [(16,16), (32,32), (64,64)],
 
-  # Grid sizes
-  [(16,16), (32,32), (256,256), (1024,1024)],
-
   # Block sizes
   [(1,1), (1,2), (1,4), (1,8),
    (16,16), (32,32), (64,64)],
 
+  # Grid sizes
+  [(16,16), (32,32), (256,256), (1024,1024)],
+
   #Shared memory
   [False],
 

From 2c0f3a9643c93d71857b7c6622f561566146418e Mon Sep 17 00:00:00 2001
From: chandanReddy <chandan1986g@gmail.com>
Date: Mon, 11 Apr 2016 15:59:19 +0200
Subject: [PATCH 31/34] added concurrent kernel tuning support

---
 heuristic_search.py | 17 ++++++++++++-----
 individual.py       |  1 -
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index 1f11bff..3e9821b 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -452,6 +452,7 @@ def get_last_iter(self):
             except:
                 start_iter = 0
                 pass
+            f_iter.close()
             print("starting from test case = ", start_iter)
         else:
             start_iter = 0
@@ -523,7 +524,7 @@ def run(self):
             self.print_summary()
         self.output_stream.close()
 
-    def tune_kernel(self, k):
+    def tune_kernel(self, ker_num):
 
         if config.Arguments.params_from_file:
             paramValues = self.readParamValues()
@@ -546,8 +547,10 @@ def tune_kernel(self, k):
             self.pipelineExec(combs)
             return
 
+        start_iter = self.get_last_iter() 
+
         f = open(config.Arguments.results_file + ".log", 'a')
-        start_iter = 0
+        f_iter = open('.lastiter', 'w')
 
         best_time = float("inf")
         best_kernel_time = [] 
@@ -563,7 +566,7 @@ def tune_kernel(self, k):
                 cnt += 1
                 continue
             print '---- Configuration ' + str(cnt) + ': ' + str(conf)
-            cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5], k)
+            cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5], ker_num)
             cur.set_ID(cnt)
             cnt += 1
             cur.run(best_time)
@@ -577,12 +580,13 @@ def tune_kernel(self, k):
                 continue
 
             if self.multi_kernel:
+                #f.write("\n====================================\n")
                 for k in config.Arguments.kernels_to_tune:
                     if cur.per_kernel_time[k] < best_kernel_time[k]:
                         best_kernel_time[k] = cur.per_kernel_time[k]
                         self.best_kernel_run[k] = cur
                         f.write("\n Best time so far for kernel "+str(k) + " ID " +  str(cnt) + " kernel time = " + str(best_kernel_time[k]))
-                        f.write(str(cur.ppcg_cmd_line_flags))
+                        f.write(str(cur.ppcg_cmd_line_flags) + str("\n"))
                         f.flush()
 
             if cur.execution_time < best_time and cur.status == enums.Status.passed:
@@ -593,11 +597,14 @@ def tune_kernel(self, k):
                 f.write(str(best_run))
                 f.flush()
 
+            f_iter.seek(0)
+            f_iter.write(str(cur.get_ID()))
+
             
     def summarise_per_kernel(self):
         for k in config.Arguments.kernels_to_tune:
             print "Best config for kernel " + str(k)
-            print("had execution time %f ms" % (self.best_kernel_run[k].execution_time)) 
+            print("had execution time %f ms" % (self.best_kernel_time[k])) 
             print("To replicate, use the following configuration:")
             print(self.best_kernel_run[k].ppcg_cmd_line_flags, False)
 
diff --git a/individual.py b/individual.py
index 8818d17..2456d09 100644
--- a/individual.py
+++ b/individual.py
@@ -218,7 +218,6 @@ def deleteFile(self, fileName):
 
     def extract_kernel_time(self, kernel_num, stdout):
         re_str = r'kernel'+str(kernel_num)+'\s*:\s*(\d*.\d+)ms'
-        print re_str
         time_regex = re.compile(re_str)
         total_time = 0.0
 

From 53491101a168f2762ca55af9644cbf25803469a6 Mon Sep 17 00:00:00 2001
From: Chandan Reddy <chandan1986g@gmail.com>
Date: Thu, 22 Sep 2016 18:15:57 +0200
Subject: [PATCH 32/34] Fixed the run thread dual compilation bug

---
 heuristic_search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heuristic_search.py b/heuristic_search.py
index 194111a..a99ddc6 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -352,7 +352,7 @@ def run(self):
                     break
                 continue
             #print('***run thread got job')
-            testcase.run(best_time)
+            testcase.binary(best_time)
             f_iter.seek(0)
             f_iter.write(str(testcase.get_ID()))
 

From 9451ffbdb0c8ab2cadfbc87dc070389ce4e98a02 Mon Sep 17 00:00:00 2001
From: Chandan Reddy <chandan1986g@gmail.com>
Date: Thu, 22 Sep 2016 18:17:58 +0200
Subject: [PATCH 33/34] Removed the call to unused parse sizes

---
 individual.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/individual.py b/individual.py
index 38c095c..c95de12 100644
--- a/individual.py
+++ b/individual.py
@@ -170,8 +170,6 @@ def ppcg(self):
         config.time_PPCG += end - start
         if self.ppcg_proc.returncode:
             raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.ppcg_cmd)         
-        # Store the sizes used by PPCG
-        self.size_data = compiler_flags.SizesFlag.parse_PPCG_dump_sizes(stderr)
         
 
     def ppcg_with_timeout(self, timeout=float("inf")):

From ab3f6af5700a15ed945869e10f973ed0298de55a Mon Sep 17 00:00:00 2001
From: Chandan Reddy <chandan1986g@gmail.com>
Date: Thu, 22 Sep 2016 18:24:41 +0200
Subject: [PATCH 34/34] Removed fusion option no longer supported by ppcg

---
 explore-params.py   | 2 --
 heuristic_search.py | 9 ++-------
 individual.py       | 6 +++---
 main.py             | 5 -----
 4 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/explore-params.py b/explore-params.py
index b8c5f93..daaca14 100644
--- a/explore-params.py
+++ b/explore-params.py
@@ -18,6 +18,4 @@
   #private memory
   [False, True],
 
-  #Fusion
-  ['max', 'min']
 ])
diff --git a/heuristic_search.py b/heuristic_search.py
index a99ddc6..7cce121 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -434,12 +434,7 @@ def createExhaConfigs(self):
         else:
             private_mem = [False]
 
-        if config.Arguments.all_fusion_structures:
-            fusion = ['max', 'min']
-        else:
-            fusion = ['max']
-
-        paramValues = [tile_sizes, block_sizes, grid_sizes, shared_mem, private_mem, fusion]
+        paramValues = [tile_sizes, block_sizes, grid_sizes, shared_mem, private_mem]
         return paramValues
 
     def get_last_iter(self):
@@ -541,7 +536,7 @@ def run(self):
                 cnt += 1
                 continue
             print '---- Configuration ' + str(cnt) + ': ' + str(conf)
-            cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], conf[5])
+            cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4])
             cur.set_ID(cnt)
             cnt += 1
             cur.run(best_time)
diff --git a/individual.py b/individual.py
index c95de12..0f36240 100644
--- a/individual.py
+++ b/individual.py
@@ -30,7 +30,7 @@ def get_fittest(population):
         raise internal_exceptions.NoFittestException("None of the individuals among this population completed successfully, hence there is no fittest individual")
     return fittest
 
-def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, fusion='max'):
+def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True):
     individual = Individual()   
     per_kernel_size_info = collections.OrderedDict()
     per_kernel_size_info[compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL] = compiler_flags.SizeTuple(tile_size, block_size, grid_size)
@@ -54,8 +54,8 @@ def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_
         individual.ppcg_flags[flag] = True 
 
     #Set isl fusion flag
-    flag = compiler_flags.PPCG.optimisation_flags[6]
-    individual.ppcg_flags[flag] = fusion
+    #flag = compiler_flags.PPCG.optimisation_flags[6]
+    #individual.ppcg_flags[flag] = fusion
     #string  = individual.ppcg_flags[flag].get_command_line_string(1024)
     #print(string)
     #print("end")
diff --git a/main.py b/main.py
index a80f5ae..cc66f2d 100755
--- a/main.py
+++ b/main.py
@@ -380,11 +380,6 @@ def string_csv(string):
                          help="Search for parameter values that are powers of two",
                          default=False)
 
-    parser_exhaustive.add_argument("--all-fusion-structures",
-                         action="store_true",
-                         help="explore all fusion structures [max, min] ",
-                         default=False)
-
 
     parser_exhaustive.add_argument("--filter-testcases",
                          action="store_true",