diff --git a/compiler_flags.py b/compiler_flags.py
index 26c88ca..309e519 100644
--- a/compiler_flags.py
+++ b/compiler_flags.py
@@ -47,7 +47,7 @@ def get_command_line_string(self, value):
             else:
                 return ""
         else:
-            return "%s %s" % (self.name, value.__str__( ))
+            return "%s=%s" % (self.name, value.__str__( ))
     
 class Size:
     """Models a tile, block or grid size"""
@@ -202,7 +202,15 @@ def random_value(self):
                                                                          self.block_size.random_value(),
                                                                          self.grid_size.random_value())
         return per_kernel_size_info
+     
+    def init_value(self, tile_size, block_size, grid_size):
+        """Return an OrderedDict holding the given sizes under the ALL_KERNELS_SENTINEL key."""
+        sizes = SizeTuple(TileSize(tile_size), BlockSize(block_size), GridSize(grid_size))
+        per_kernel_size_info = collections.OrderedDict()
+        per_kernel_size_info[SizesFlag.ALL_KERNELS_SENTINEL] = sizes
+        return per_kernel_size_info
     
+
     def permute(self, value):
         per_kernel_size_info = collections.OrderedDict()
         for kernel_number, size_tuple in value.iteritems():
@@ -301,8 +309,8 @@ class PPCG:
     flag_map[no_isl_schedule_separate_components]  = EnumerationFlag(no_isl_schedule_separate_components)
     flag_map[no_wrap]                              = EnumerationFlag(no_wrap)
     flag_map[no_scale_tile_loops]                  = EnumerationFlag(no_scale_tile_loops)
-    flag_map[no_shared_memory]                     = EnumerationFlag(no_shared_memory)
-    flag_map[no_private_memory]                    = EnumerationFlag(no_private_memory)
+    flag_map[no_shared_memory]                     = EnumerationFlag(no_shared_memory, [True, False])
+    flag_map[no_private_memory]                    = EnumerationFlag(no_private_memory, [True, False])
     flag_map[no_live_range_reordering]             = EnumerationFlag(no_live_range_reordering)
     
     optimisation_flags = []
diff --git a/enums.py b/enums.py
index c67f0e0..7c0d7bb 100644
--- a/enums.py
+++ b/enums.py
@@ -1,6 +1,7 @@
 class Targets:
     cuda   = "cuda"
     opencl = "opencl"
+    prl    = "prl"
     
 class Crossover:
     one_point = "one_point"
@@ -18,8 +19,11 @@ class SearchStrategy:
     ga                  = "ga"
     random              = "random"
     simulated_annealing = "simulated-annealing"
+    exhaustive          = "exhaustive"
 
 class Status:
     passed = "passed"
     failed = "failed"
-    
\ No newline at end of file
+    timeout = "timedout"
+    ppcgtimeout = "ppcg_timeout"
+    
diff --git a/explore-params.py b/explore-params.py
new file mode 100644
index 0000000..be92d6b
--- /dev/null
+++ b/explore-params.py
@@ -0,0 +1,21 @@
+# This file contains the PPCG parameter values that are explored.
+# The exploration script considers each combination of the parameter values.
+
+([
+  # Tile sizes
+  [(16,16), (32,32), (64,64)],
+
+  # Block sizes
+  [(1,1), (1,2), (1,4), (1,8),
+   (16,16), (32,32), (64,64)],
+
+  # Grid sizes
+  [(16,16), (32,32), (256,256), (1024,1024)],
+
+  # Shared memory
+  [False],
+
+  # Private memory
+  [False, True],
+
+])
diff --git a/heuristic_search.py b/heuristic_search.py
index caece42..3994bfa 100644
--- a/heuristic_search.py
+++ b/heuristic_search.py
@@ -9,6 +9,11 @@
 import individual
 import collections
 import internal_exceptions
+import itertools
+import os
+from Queue import Queue
+from threading import Thread
+import sys
 
 class SearchStrategy:
     """Abstract class for a search strategy"""
@@ -20,9 +25,16 @@ def run(self):
     @abc.abstractmethod
     def summarise(self):
         pass
+
+    @abc.abstractmethod
+    def logall(self):
+        pass
     
 class GA(SearchStrategy):
     """Search using a genetic algorithm"""
+
+    def logall(self):
+        return
     
     def set_child_flags(self, child, the_flags, the_flag_values):
         for idx, flag in enumerate(the_flags):
@@ -291,22 +303,357 @@ def summarise(self):
         except internal_exceptions.NoFittestException:
             pass
 
+    def logall(self):
+        for i in self.individuals:
+            debug.summary_message(i.ppcg_cmd_line_flags, False)
+        return
+
+compile_queue = Queue(10)
+run_queue = Queue(10)
+
+class CompileThread(Thread):
+    """Worker thread: runs PPCG and the build for each queued test case, then passes it to run_queue."""
+    def run(self):
+        # An EndOfQueue marker is forwarded too, after which this thread stops.
+        while True:
+            job = compile_queue.get()
+            finished = isinstance(job, individual.EndOfQueue)
+            if not finished:
+                job.ppcg()
+                job.build()
+            run_queue.put(job)
+            if finished:
+                break
+
+class RunThread(Thread):
+    def __init__(self, num_threads):
+        super(RunThread, self).__init__()
+        self.num_threads = num_threads
+        self.individuals = []
+
+    def run(self):
+        global run_queue
+        best_time = float("inf")
+        f = open(config.Arguments.results_file + ".log", 'a')
+        f_iter = open('.lastiter', 'w')
+        while True:
+            #print('***run thread waiting')
+            testcase = run_queue.get()
+            if isinstance(testcase, individual.EndOfQueue):
+                self.num_threads = self.num_threads - 1
+                print('***remaining threads: ' + str(self.num_threads))
+                if self.num_threads<=0:
+                    try:
+                       os.remove('.lastiter')
+                       self.summarise()
+                       self.logall()
+                    except:
+                       pass
+                    print('***run thread exiting')
+                    break
+                continue
+            #print('***run thread got job')
+            testcase.binary(best_time)
+            f_iter.seek(0)
+            f_iter.write(str(testcase.get_ID()))
+
+            if testcase.execution_time < best_time and testcase.execution_time != 0 and testcase.status == enums.Status.passed: 
+                self.individuals.append(testcase)
+                best_time = testcase.execution_time
+                f.write("\n Best iter so far = \n")
+                f.write(str(testcase))
+                f.flush()
+
+    def summarise(self):
+        print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30))
+        try:
+            fittest = individual.get_fittest(self.individuals)
+            debug.summary_message("The fittest individual had execution time %f seconds" % (fittest.execution_time)) 
+            debug.summary_message("To replicate, pass the following to PPCG:")
+            debug.summary_message(fittest.ppcg_cmd_line_flags, False)
+        except internal_exceptions.NoFittestException:
+           pass
+
+    def logall(self):
+        print("%s Log of all runs %s" %('*' * 30, '*' * 30))
+        for i in self.individuals:
+            print(i)
+            debug.summary_message(i.ppcg_cmd_line_flags, False)
+        pass
+
+class Exhaustive(SearchStrategy):
+    """Exhaustive search all the values in the specified range or """
+    """all combinations provided in explore-params.py file"""
+
+    def readParamValues(self):
+        """Return the value obtained by evaluating explore-params.py (the parameter value lists)."""
+        # NOTE(review): eval() runs arbitrary code from that file -- only use trusted parameter files.
+        with open('explore-params.py', 'r') as f:
+            return eval(f.read())
+
+    def countConfigs(self, paramValues):
+        """Return the product of the lengths of the value lists in paramValues (1 when it is empty)."""
+        sizes = [len(values) for values in paramValues]
+        total = reduce(lambda acc, size: acc * size, sizes, 1)
+        return total
+
+    def createExhaConfigs(self):
+        tile_size_lb = config.Arguments.tile_size_range[0] 
+        tile_size_ub = config.Arguments.tile_size_range[1]
+        if config.Arguments.only_powers_of_two:
+            tile_size_range = [2**i for i in range(tile_size_lb, tile_size_ub)]
+        else:
+            tile_size_range = range(tile_size_lb, tile_size_ub)
+
+        tile_sizes = itertools.product(tile_size_range, repeat=config.Arguments.tile_dimensions)
+        
+        block_size_lb = config.Arguments.block_size_range[0] 
+        block_size_ub = config.Arguments.block_size_range[1] 
+        if config.Arguments.only_powers_of_two:
+            block_size_range = [2**i for i in range(block_size_lb, block_size_ub)]
+        else:
+            block_size_range = range(block_size_lb, block_size_ub)
+
+        block_sizes = itertools.product(block_size_range, repeat=config.Arguments.block_dimensions)
+
+        grid_size_lb = config.Arguments.grid_size_range[0] 
+        grid_size_ub = config.Arguments.grid_size_range[1] 
+        if config.Arguments.only_powers_of_two:
+            grid_size_range = [2**i for i in range(grid_size_lb, grid_size_ub)]
+        else:
+            grid_size_range = range(grid_size_lb, grid_size_ub)
+
+        grid_sizes = itertools.product(grid_size_range, repeat=config.Arguments.grid_dimensions)
+
+        if config.Arguments.no_shared_memory:
+            shared_mem = [True, False]
+        else:
+            shared_mem = [False]
+
+        if config.Arguments.no_private_memory:
+            private_mem = [True, False]
+        else:
+            private_mem = [False]
+
+        paramValues = [tile_sizes, block_sizes, grid_sizes, shared_mem, private_mem]
+        return paramValues
+
+    def get_last_iter(self):
+        """Return the test-case number stored in '.lastiter', or 0 if absent/unreadable.
+
+        Uses 'with' so a failed open() cannot leave an unbound handle to close.
+        """
+        if not os.path.isfile(".lastiter"):
+            return 0
+        print("found last iter")
+        try:
+            with open(".lastiter", 'r+') as f_iter:
+                start_iter = int(f_iter.readline())
+        except (IOError, OSError, ValueError):
+            start_iter = 0
+        print("starting from test case = " + str(start_iter))
+        return start_iter
+
+    def pipelineExec(self, combs):
+
+        start_iter = self.get_last_iter()
+        num_threads = config.Arguments.num_compile_threads
+        for i in range(num_threads):
+            t = CompileThread()
+            t.daemon = True
+            t.start()
+
+        RunThread(num_threads).start()
+
+        cnt = 0
+        for conf in combs:
+            if cnt < start_iter:
+                cnt += 1
+                continue
+            print '---- Configuration ' + str(cnt) + ': ' + str(conf)
+            cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4])
+            cur.set_ID(cnt)
+            cnt += 1
+            compile_queue.put(cur)
+
+        for i in range(num_threads):
+            compile_queue.put(individual.EndOfQueue()) # So every CompileThread fetches one EndOfQueue element
+       
+    def tile_size_multiple_filter(self, conf):
+        """Return True when configuration `conf` passes the size heuristics.
+
+        Rejected are block sizes whose product lies outside
+        [min_work_group_size, max_work_group_size], tile extents that are smaller
+        than or not a multiple of the matching block extent, and tile/block
+        ratios whose product exceeds 36.
+        """
+        tile_size, block_size = conf[0], conf[1]
+
+        work_group_size = reduce(lambda x,y: x*y, block_size)
+        if work_group_size > config.Arguments.max_work_group_size:
+            return False
+        if work_group_size < config.Arguments.min_work_group_size:
+            return False
+
+        mul_factor = 1
+        for t, b in zip(tile_size, block_size):
+            if t < b or t % b != 0:
+                return False
+            mul_factor *= t/b
+
+        return mul_factor <= 36
+
+    def run(self):
+        """Tune all kernels together or each kernel in kernels_to_tune, writing summaries to the results file."""
+        self.individuals = []
+        self.multi_kernel = bool(config.Arguments.no_concurrent_kernel_tuning)
+        self.output_stream = open(config.Arguments.results_file, 'w')
+        # 'with' closes the results file on every exit path, including the early return below.
+        with self.output_stream:
+            if self.multi_kernel:
+                self.tune_kernel(compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL)
+                self.print_summary()
+                return
+            for k in config.Arguments.kernels_to_tune:
+                self.individuals = []
+                self.tune_kernel(k)
+                self.print_summary()
+
+    def tune_kernel(self, ker_num):
+        """Run every candidate configuration for kernel `ker_num` and log the best ones.
+
+        Candidates come from explore-params.py (params_from_file) or from the
+        configured size ranges. The ID of each completed test case is written to
+        '.lastiter', which get_last_iter() reads to skip earlier test cases.
+        """
+        if config.Arguments.params_from_file:
+            paramValues = self.readParamValues()
+        else:
+            paramValues = self.createExhaConfigs()
+        cnt = 0
+        combs = itertools.product(*paramValues)
+
+        if config.Arguments.filter_testcases:
+            # Keep only test cases that pass the heuristics (e.g. tile size is a multiple of block size).
+            combs = filter(self.tile_size_multiple_filter, combs)
+
+        if config.Arguments.parallelize_compilation:
+            self.pipelineExec(combs)
+            return
+
+        start_iter = self.get_last_iter() 
+        f = open(config.Arguments.results_file + ".log", 'a')
+        f_iter = open('.lastiter', 'w')
+        best_time = float("inf")
+        # Initialised up front: a PPCG timeout may be logged before any run has passed.
+        best_run = None
+        # Stored on self because summarise_per_kernel() reads self.best_kernel_time.
+        self.best_kernel_time = []
+        self.best_kernel_run = []
+        if self.multi_kernel:
+            for s in config.Arguments.kernels_to_tune:
+                self.best_kernel_time.append(float("inf"))
+                self.best_kernel_run.append(0)
+        try:
+            for conf in combs:
+                if cnt < start_iter:
+                    cnt += 1
+                    continue
+                print '---- Configuration ' + str(cnt) + ': ' + str(conf)
+                cur = individual.create_test_case(conf[0], conf[1], conf[2], conf[3], conf[4], ker_num)
+                cur.set_ID(cnt)
+                cnt += 1
+                cur.run(best_time)
+                if cur.status == enums.Status.ppcgtimeout :
+                    f.write("\nppcg timeout")
+                    f.write(str(best_run))
+                    f.flush()
+                    continue
+                if cur.execution_time == 0:
+                    continue
+
+                if self.multi_kernel:
+                    for k in config.Arguments.kernels_to_tune:
+                        if cur.per_kernel_time[k] < self.best_kernel_time[k]:
+                            self.best_kernel_time[k] = cur.per_kernel_time[k]
+                            self.best_kernel_run[k] = cur
+                            f.write("\n Best time so far for kernel "+str(k) + " ID " +  str(cnt) + " kernel time = " + str(self.best_kernel_time[k]))
+                            f.write(str(cur.ppcg_cmd_line_flags) + str("\n"))
+                            f.flush()
+
+                if cur.execution_time < best_time and cur.status == enums.Status.passed:
+                    self.individuals.append(cur)
+                    best_time = cur.execution_time
+                    best_run = cur
+                    f.write("\n Best iter so far = "+ str(cnt) + "\n")
+                    f.write(str(best_run))
+                    f.flush()
+                f_iter.seek(0)
+                f_iter.write(str(cur.get_ID()))
+                # Flush so the resume point survives an abrupt interruption.
+                f_iter.flush()
+        finally:
+            f.close()
+            f_iter.close()
+
+            
+    def summarise_per_kernel(self):
+        """Print, for each tuned kernel, its best time (read from the best run itself) and the flags to replicate it."""
+        for k in config.Arguments.kernels_to_tune:
+            best = self.best_kernel_run[k]
+            print("Best config for kernel %s\nhad execution time %f ms" % (k, best.per_kernel_time[k]))
+            print("To replicate, use the following configuration:\n%s" % best.ppcg_cmd_line_flags)
+
+    def summarise(self):
+        print("%s Summary of %s %s" % ('*' * 30, __name__, '*' * 30))
+        try:
+            fittest = individual.get_fittest(self.individuals)
+            debug.summary_message("The fittest individual had execution time %f seconds" % (fittest.execution_time)) 
+            debug.summary_message("To replicate, pass the following to PPCG:")
+            debug.summary_message(fittest.ppcg_cmd_line_flags, False)
+        except internal_exceptions.NoFittestException:
+           pass
+
+    def logall(self):
+        print("%s Log of all runs %s" %('*' * 30, '*' * 30))
+        for i in self.individuals:
+            print(i)
+            debug.summary_message(i.ppcg_cmd_line_flags, False)
+        pass
+
+
+    def print_summary(self):
+        old_stdout    = sys.stdout
+        try:
+            if config.Arguments.results_file is not None:
+                sys.stdout    = self.output_stream
+                if self.multi_kernel:
+                    self.summarise_per_kernel()
+                else:
+                    self.summarise()
+                #self.logall()
+        finally:
+            if config.Arguments.results_file is not None:
+                sys.stdout = old_stdout
+
 class SimulatedAnnealing(SearchStrategy):
-    """Search using simulated annealing"""
-    
-    def acceptance_probability(self, currentEnergy, newEnergy, temperature):
+   """Search using simulated annealing"""
+
+   def acceptance_probability(self, currentEnergy, newEnergy, temperature):
         if newEnergy < currentEnergy:
             return 1.0
-        return math.exp((currentEnergy - newEnergy) / temperature)
-    
-    def mutate_backend_flags(self, clone_flags, solution_flags):
+        return math.exp((currentEnergy - newEnergy) / temperature) 
+
+   def logall(self):
+        return
+
+   def mutate_backend_flags(self, clone_flags, solution_flags):
         for the_flag in solution_flags.keys():   
             if bool(random.getrandbits(1)):
                 idx    = the_flag.possible_values.index(solution_flags[the_flag])
                 newIdx = (idx + 1) % len(the_flag.possible_values)
                 clone_flags[the_flag] = the_flag.possible_values[newIdx]
     
-    def mutate(self, solution):
+   def mutate(self, solution):
         clone    = copy.deepcopy(solution)
         clone.ID = individual.Individual.get_ID()
         for the_flag in solution.ppcg_flags.keys():   
@@ -324,7 +671,7 @@ def mutate(self, solution):
         self.mutate_backend_flags(clone.nvcc_flags, solution.nvcc_flags)
         return clone
     
-    def run(self):        
+   def run(self):        
         debug.verbose_message("Creating initial solution", __name__)
         current = individual.create_random()
         current.run()   
@@ -344,8 +691,8 @@ def run(self):
                     if current.execution_time < self.fittest.execution_time:
                         self.fittest = current
     
-    def summarise(self):
+   def summarise(self):
         debug.summary_message("The final individual had execution time %f seconds" % (self.fittest.execution_time)) 
         debug.summary_message("To replicate, pass the following to PPCG:")
         debug.summary_message(self.fittest.ppcg_cmd_line_flags, False)
-        
\ No newline at end of file
+        
diff --git a/individual.py b/individual.py
index b5d23e3..e8b08ee 100644
--- a/individual.py
+++ b/individual.py
@@ -7,7 +7,15 @@
 import enums
 import collections
 import subprocess
+import threading
 import internal_exceptions
+import time
+
+class EndOfQueue:
+    def __init__(self):
+        pass
+
+
 
 def get_fittest(population):
     fittest = None
@@ -22,9 +30,43 @@ def get_fittest(population):
         raise internal_exceptions.NoFittestException("None of the individuals among this population completed successfully, hence there is no fittest individual")
     return fittest
 
+def create_test_case(tile_size, block_size, grid_size, shared_mem=True, private_mem=True, k=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL):
+    """Build an Individual whose PPCG flags encode one explicit configuration.
+
+    tile_size, block_size, grid_size -- sizes stored in a SizeTuple under kernel key `k`
+    shared_mem, private_mem -- when false, the flag at index 0 / 7 of
+        PPCG.optimisation_flags is set to True
+    k -- kernel key for the sizes; defaults to SizesFlag.ALL_KERNELS_SENTINEL
+
+    Returns the new Individual; no cc, cxx or nvcc flags are set here.
+    """
+    individual = Individual()   
+    per_kernel_size_info = collections.OrderedDict()
+    per_kernel_size_info[k] = compiler_flags.SizeTuple(tile_size, block_size, grid_size)
+    individual.kernel_num = k
+
+    # TODO: look the flags up by name instead of by position in optimisation_flags.
+    # NOTE(review): index 4 is presumably the sizes flag -- confirm against compiler_flags.PPCG.
+    flag = compiler_flags.PPCG.optimisation_flags[4]
+    individual.ppcg_flags[flag] = per_kernel_size_info 
+
+    # NOTE(review): index 0 is presumably the no-shared-memory flag -- confirm.
+    if not shared_mem:
+        flag = compiler_flags.PPCG.optimisation_flags[0]
+        individual.ppcg_flags[flag] = True 
+
+    # NOTE(review): index 7 is presumably the no-private-memory flag -- confirm.
+    if not private_mem:
+        flag = compiler_flags.PPCG.optimisation_flags[7]
+        individual.ppcg_flags[flag] = True 
+
+    return individual
+
+
 def create_random():
     individual = Individual()   
     for flag in compiler_flags.PPCG.optimisation_flags:
+        print(flag)
         individual.ppcg_flags[flag] = flag.random_value()
     for flag in compiler_flags.CC.optimisation_flags:
         individual.cc_flags[flag] = flag.random_value()
@@ -39,17 +81,36 @@ class Individual:
     
     ID = 0
     @staticmethod
-    def get_ID():
+    def get_ID_init():
         Individual.ID += 1
         return Individual.ID
     
+    def file_name(self):
+        if config.Arguments.binary_file_name:
+            return config.Arguments.binary_file_name
+
+        return 'testcase'+str(self.ID)
+        #return 'gemm'
+
+    def set_ID(self, num):
+        self.ID = num  
+
+    def get_ID(self):
+        return self.ID 
+    
     def __init__(self):
-        self.ID               = Individual.get_ID()
+        self.ID               = Individual.get_ID_init()
         self.ppcg_flags       = collections.OrderedDict()
         self.cc_flags         = collections.OrderedDict()
         self.cxx_flags        = collections.OrderedDict()
         self.nvcc_flags       = collections.OrderedDict()
         self.status           = enums.Status.failed
+        self.execution_time   = float("inf") 
+        self.num = 0
+        self.kernel_num=compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL
+        self.per_kernel_time = [] 
+        for k in config.Arguments.kernels_to_tune:
+            self.per_kernel_time.append(float("inf"))
         
     def all_flags(self):
         return self.ppcg_flags.keys() + self.cc_flags.keys() + self.cxx_flags.keys() + self.nvcc_flags.keys()
@@ -57,12 +118,15 @@ def all_flags(self):
     def all_flag_values(self):
         return self.ppcg_flags.values() + self.cc_flags.values() + self.cxx_flags.values() + self.nvcc_flags.values()
             
-    def run(self):
+    def run(self, timeout=float("inf")):  # default keeps existing zero-argument run() callers working
         try:
-            self.compile()
+            self.compile(timeout)
             if self.status == enums.Status.passed:
                 # Fitness is inversely proportional to execution time
-                self.fitness = 1/self.execution_time 
+                if self.execution_time == 0:
+                    self.fitness = float("inf")
+                else:
+                    self.fitness = 1/self.execution_time 
                 debug.verbose_message("Individual %d: execution time = %f, fitness = %f" \
                                       % (self.ID, self.execution_time, self.fitness), __name__) 
             else:
@@ -70,68 +134,205 @@ def run(self):
         except internal_exceptions.FailedCompilationException as e:
             debug.exit_message(e)
             
-    def compile(self):
+            
+    def checkforpause(self):
+        """Block while a '.pause' file exists in the current directory.
+
+        Polls every 20 seconds, printing a reminder, and returns once the file is gone.
+        """
+        while os.path.isfile('.pause'):
+            print("Auto tuning paused, remove .pause to restart")
+            time.sleep(20)
+
+    def compile(self, timeout=float("inf")):
+        self.checkforpause()
         self.ppcg()
+        #sucess=self.ppcg_with_timeout(timeout)
+        #if not sucess:
+        #    return
         self.build()
-        self.binary()
+        self.binary(timeout)
 
     def ppcg(self):
         self.ppcg_cmd_line_flags = "--target=%s --dump-sizes %s" % (config.Arguments.target, 
                                                                     ' '.join(flag.get_command_line_string(self.ppcg_flags[flag]) for flag in self.ppcg_flags.keys()))
         
         os.environ["AUTOTUNER_PPCG_FLAGS"] = self.ppcg_cmd_line_flags
-        debug.verbose_message("Running '%s'" % config.Arguments.ppcg_cmd, __name__)
+
+        if config.Arguments.cmd_string_complete:
+            cmd = config.Arguments.ppcg_cmd+ ' '+self.ppcg_cmd_line_flags
+        elif config.Arguments.target == enums.Targets.cuda:
+            cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()
+        else:
+            cmd = config.Arguments.ppcg_cmd + ' '+self.ppcg_cmd_line_flags+' -o '+self.file_name()+'_host.c'
+
+        debug.verbose_message("Running '%s'" % cmd, __name__)
+        #debug.verbose_message("Running '%s'" % self.ppcg_cmd_line_flags , __name__)
         start  = timeit.default_timer()
-        proc   = subprocess.Popen(config.Arguments.ppcg_cmd, shell=True, stderr=subprocess.PIPE)  
-        stderr = proc.communicate()[1]
+        self.ppcg_proc   = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)  
+        stderr = self.ppcg_proc.communicate()[1]
         end    = timeit.default_timer()
         config.time_PPCG += end - start
-        if proc.returncode:
+        if self.ppcg_proc.returncode:
             raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.ppcg_cmd)         
-        # Store the sizes used by PPCG
-        self.size_data = compiler_flags.SizesFlag.parse_PPCG_dump_sizes(stderr)
         
+
+    def ppcg_with_timeout(self, timeout=float("inf")):
+        thread = threading.Thread(target=self.ppcg)
+        thread.start()
+        thread.join(timeout)
+        if thread.is_alive():
+            print("Timeout: terminating the ppcg ")
+            self.ppcg_proc.terminate()
+            thread.join(timeout)
+            self.status = enums.Status.ppcgtimeout
+            return False
+        return True
+
     def build(self):
-        debug.verbose_message("Running '%s'" % config.Arguments.build_cmd, __name__)
+        if config.Arguments.cmd_string_complete:
+            build_cmd = config.Arguments.build_cmd
+        elif config.Arguments.target == enums.Targets.cuda:
+            build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.cu ' + self.file_name()+ '_kernel.cu '+ '-o '+ self.file_name()+'.exe'
+        else:
+            build_cmd = config.Arguments.build_cmd + ' ' + self.file_name()+ '_host.c ' + '-o '+ self.file_name()+'.exe' + ' -lprl -lOpenCL'
+        debug.verbose_message("Running '%s'" % build_cmd, __name__)
         start  = timeit.default_timer()
-        proc   = subprocess.Popen(config.Arguments.build_cmd, shell=True)  
+        proc   = subprocess.Popen(build_cmd, shell=True)  
         stderr = proc.communicate()[1]     
         end    = timeit.default_timer()
         config.time_backend += end - start
         if proc.returncode:
             raise internal_exceptions.FailedCompilationException("FAILED: '%s'" % config.Arguments.build_cmd)
+
+
     
-    def binary(self):
-        time_regex = re.compile(r'^(\d*\.\d+|\d+)$')
+    def deleteFile(self, fileName):
+        """Remove fileName if possible; cleanup is best effort, so OS errors (e.g. missing file) are ignored."""
+        try:
+            os.remove(fileName)
+        except OSError:
+            pass
+
+    def extract_kernel_time(self, kernel_num, stdout):
+        """Return the summed 'kernel<N> : <t>ms' times found in stdout, or inf when none are reported."""
+        # The decimal point is escaped: a bare '.' matched any character and rejected one-digit integers.
+        time_regex = re.compile(r'kernel' + str(kernel_num) + r'\s*:\s*(\d*\.\d+|\d+)ms')
+        total_time = 0.0
+
+        nmatchedlines = 0
+        for line in stdout.split(os.linesep):
+            matches = time_regex.findall(line.strip())
+            if matches:
+                nmatchedlines += 1
+                try:
+                    total_time += float(matches[0])
+                except ValueError:
+                    raise internal_exceptions.BinaryRunException("Execution time '%s' is not in the required format" % matches[0])
+        if nmatchedlines == 0:
+            total_time = float("inf")
+        return total_time
+
+    def update_kernel_times(self, stdout):
+        if not config.Arguments.prl_profiling:
+            return
+        for k in config.Arguments.kernels_to_tune:
+            self.per_kernel_time[k] = self.extract_kernel_time(k, stdout)
+
+    def binary(self, best_execution_time=float("inf")):
+        #time_regex = re.compile(r'^(\d*\.\d+|\d+)$')
+        #print config.Arguments.execution_time_regex
+        if config.Arguments.prl_profiling:
+            if self.kernel_num == compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL:
+                re_str = r'compute\s*:\s*(\d*\.\d+|\d+)ms'
+            else:
+                re_str = r'kernel'+str(self.kernel_num)+r'\s*:\s*(\d*\.\d+|\d+)ms'
+        else:
+            re_str = config.Arguments.execution_time_regex
+
+        print re_str
+        time_regex = re.compile(re_str)
         total_time = 0.0
         status     = enums.Status.passed
+        num_actual_runs = 0
         for run in xrange(1,config.Arguments.runs+1):
-            debug.verbose_message("Run #%d of '%s'" % (run, config.Arguments.run_cmd), __name__)
+            if config.Arguments.cmd_string_complete:
+                run_cmd = config.Arguments.run_cmd
+            else:
+                run_cmd = './'+self.file_name()+'.exe '+config.Arguments.run_cmd_input
+            #run_cmd = config.Arguments.run_cmd
+            debug.verbose_message("Run #%d of '%s'" % (run, run_cmd), __name__)
             start = timeit.default_timer()
-            proc  = subprocess.Popen(config.Arguments.run_cmd, shell=True, stdout=subprocess.PIPE)    
-            stdout, stderr = proc.communicate()
+            self.proc  = subprocess.Popen(run_cmd, shell=True, stdout=subprocess.PIPE)    
+            stdout, stderr = self.proc.communicate()
             end   = timeit.default_timer()
-            if proc.returncode:
-                status = enums.Status.failed
+            if self.proc.returncode:
+                status = enums.Status.failed
                 debug.warning_message("FAILED: '%s'" % config.Arguments.run_cmd)
                 continue
             if config.Arguments.execution_time_from_binary:
                 if not stdout:
                     raise internal_exceptions.BinaryRunException("Expected the binary to dump its execution time. Found nothing")
+                self.update_kernel_times(stdout)
+                nmatchedlines = 0
                 for line in stdout.split(os.linesep):
                     line    = line.strip()
                     matches = time_regex.findall(line)
                     if matches:
+                        nmatchedlines += 1
                         try:
                             total_time += float(matches[0])
                         except:
                             raise internal_exceptions.BinaryRunException("Execution time '%s' is not in the required format" % matches[0])
+                if nmatchedlines == 0:
+                    raise internal_exceptions.BinaryRunException("Regular expression did not match anything on the program's output")
             else:
                 total_time += end - start
+
+            num_actual_runs +=1
+            # Divide by 100.0: under Python 2 an integer percentage below 100 would otherwise truncate to 0.
+            per_var = 1 + config.Arguments.max_exec_time_var/100.0
+            # Stop early once the mean run time exceeds the best time plus the allowed variation.
+            if total_time > per_var * best_execution_time * num_actual_runs:
+                break
+
         self.status = status
         config.time_binary += total_time
-        self.execution_time = total_time/config.Arguments.runs
+        if num_actual_runs != 0:
+            self.execution_time = total_time/num_actual_runs
+        else:
+            self.execution_time = total_time
+
+        self.deleteFile(self.file_name()+'.exe')
+        self.deleteFile(self.file_name()+'_host.c')
+        self.deleteFile(self.file_name()+'_host_kernel.cl')
+        self.deleteFile(self.file_name()+'_host.cu')
+        self.deleteFile(self.file_name()+'_kernel.cu')
+        self.deleteFile(self.file_name()+'_kernel.hu')
+        self.deleteFile(self.file_name()+'_host_kernel.hu')
+        self.deleteFile(self.file_name()+'_host_kernel.h')
+        self.deleteFile(self.file_name())
+
+ 
+    def run_with_timeout(self, timeout=2):
+        print "executing task " + str(self.ID)
+        timeout = config.Arguments.timeout_ppcg
+        try:
+            thread = threading.Thread(target=self.binary)
+            thread.start()
+            thread.join(timeout)
+            if thread.is_alive():
+                print("Timeout: terminating the procs")
+                self.proc.terminate()
+                thread.join()
+                self.status = enums.Status.timeout
+        except:
+            print("Exception running"+str(self.ID))
+            self.status = enums.Status.timeout
+
+        return
         
+               
     def __str__(self):
-        return "ID %d: fitness %f" % (self.ID, self.fitness)
-    
\ No newline at end of file
+        return "ID %4d: execution time = %3f, ppcg = %s, status = %s" % (self.ID, self.execution_time, self.ppcg_cmd_line_flags, self.status)
+    
diff --git a/main.py b/main.py
index dddba88..5e04a9e 100755
--- a/main.py
+++ b/main.py
@@ -9,6 +9,7 @@
 import sys
 
 def print_summary(search):
+    return 
     try:
         if config.Arguments.results_file is not None:
             old_stdout    = sys.stdout
@@ -16,6 +17,7 @@ def print_summary(search):
             sys.stdout    = output_stream
         config.summarise_timing()
         search.summarise()
+        search.logall()
     finally:
         if config.Arguments.results_file is not None:
             output_stream.close()
@@ -26,6 +28,8 @@ def autotune():
         search = heuristic_search.GA()
     elif config.Arguments.autotune_subcommand == enums.SearchStrategy.random:
         search = heuristic_search.Random()
+    elif config.Arguments.autotune_subcommand == enums.SearchStrategy.exhaustive:
+        search = heuristic_search.Exhaustive()
     elif config.Arguments.autotune_subcommand == enums.SearchStrategy.simulated_annealing:
         search = heuristic_search.SimulatedAnnealing()
     else:
@@ -127,9 +131,22 @@ def string_csv(string):
     building_and_running_group.add_argument("--run-cmd",
                                             metavar="<STRING>",
                                             help="how to run the generated binary from the auto-tuner",
-                                            required=True)
+                                            required=False)
+    
     
-    runs = 5
+    building_and_running_group.add_argument("--run-cmd-input",
+                                            metavar="<STRING>",
+                                            help="input to the generated binary from the auto-tuner",
+                                            required=False,
+                                            default="")
+
+    
+    building_and_running_group.add_argument("--cmd-string-complete",
+                                            action="store_true",
+                                            help="don't modify the cmd string; note that the output file names should be part of the cmd line",
+                                            default=False)
+
+    runs = 1
     building_and_running_group.add_argument("--runs",
                                             type=int,
                                             metavar="<int>",
@@ -141,11 +158,32 @@ def string_csv(string):
                                             help="assume that the binary prints its execution time to standard output (rather than measuring the execution time through Python)",
                                             default=False)
     
+    building_and_running_group.add_argument("--binary-file-name",
+                                            metavar="<STRING>",
+                                            help="name of the generated binary from the auto-tuner",
+                                            required=False,
+                                            default="")
+    
+    building_and_running_group.add_argument("--execution-time-regex",
+                            type=str,
+                            help="regular expression format for execution time",
+                            default=r'^(\d*\.\d+|\d+)$')
+    
+    
+    building_and_running_group.add_argument("--prl-profiling",
+                                            action="store_true",
+                                            help="Using prl profiling, used to extract timing info from prl profiling output",
+                                            default=False)
+
+    building_and_running_group.add_argument("--no-concurrent-kernel-tuning",
+                                            action="store_false",
+                                            help="Do not tune multiple kernels at the same time",
+                                            default=True)
     # PPCG options
     ppcg_group = parser.add_argument_group("PPCG arguments")
     
     ppcg_group.add_argument("--target",
-                            choices=[enums.Targets.cuda, enums.Targets.opencl],
+                            choices=[enums.Targets.cuda, enums.Targets.opencl, enums.Targets.prl],
                             help="the target to generate code for",
                             default=enums.Targets.opencl)
     
@@ -165,6 +203,13 @@ def string_csv(string):
                             metavar="<LIST>",
                             help="consider only these values when tuning the shared memory size (default: %s)" % (shared_memory_possibilties),
                             default=shared_memory_possibilties)
+     
+    kernels_list = [compiler_flags.SizesFlag.ALL_KERNELS_SENTINEL]
+    ppcg_group.add_argument("--kernels-to-tune",
+                            type=int_csv,
+                            metavar="<LIST>",
+                            help="consider only these kernels values when tuning (default: all)",
+                            default=kernels_list)
     
     tile_size_range = (2**0, 2**6)
     ppcg_group.add_argument("--tile-size-range",
@@ -234,6 +279,16 @@ def string_csv(string):
                             help="do not tune kernel sizes individually, i.e. use a uniform tile size for all kernels and let PPCG decide on suitable block and grid sizes",
                             default=False)
     
+    ppcg_group.add_argument("--no-shared-memory",
+                            action="store_false",
+                            help="do not consider shared memory while autotuning",
+                            default=True)
+    
+    ppcg_group.add_argument("--no-private-memory",
+                            action="store_false",
+                            help="do not consider private memory  while autotuning",
+                            default=True)
+
     ppcg_group.add_argument("--all-isl-options",
                             action=ISLAction,
                             metavar="",
@@ -330,10 +385,74 @@ def string_csv(string):
                                default=randoms,
                                help="the number of random tests to generate (default: %d)" % randoms)
     
-    parser.parse_args(namespace=config.Arguments)
+    
+    parser_exhaustive = search_subparsers.add_parser(enums.SearchStrategy.exhaustive)
+
+    parser_exhaustive.add_argument("--params-from-file",
+                         action="store_true",
+                         help="read the parameters from the explore-params py",
+                         default=False)
+
+    parser_exhaustive.add_argument("--only-powers-of-two",
+                         action="store_true",
+                         help="Search for parameter values that are powers of two",
+                         default=False)
+
 
+    parser_exhaustive.add_argument("--filter-testcases",
+                         action="store_true",
+                         help="a few heuristics to reduce the search space, such as tile size multiple of block size, tile size > block size, etc.",
+                         default=True)
+
+    parser_exhaustive.add_argument("--parallelize-compilation",
+                         action="store_true",
+                         help="parallelize ppcg compilation and execution of test case",
+                         default=False)
+    
+    
+    num_compile_threads = 1
+    parser_exhaustive.add_argument("--num-compile-threads",
+                               type=int,
+                               metavar="<int>",
+                               default=num_compile_threads,
+                               help="number of threads to use for ppcg compilation (default: %d)" % num_compile_threads)
+    
+    
+    max_work_group_size = 1024
+    parser_exhaustive.add_argument("--max-work-group-size",
+                               type=int,
+                               metavar="<int>",
+                               default=max_work_group_size,
+                               help="max work group size, test cases with work group size greater than this value will be filtered out (default: %d)" % max_work_group_size)
+    
+    
+    
+    min_work_group_size = 1 
+    parser_exhaustive.add_argument("--min-work-group-size",
+                               type=int,
+                               metavar="<int>",
+                               default=min_work_group_size,
+                               help="min work group size, test cases with work group size lesser than this value will be filtered out (default: %d)" % min_work_group_size)
+    
+    timeout = 500
+    parser_exhaustive.add_argument("--timeout-ppcg",
+                               type=int,
+                               metavar="<int>",
+                               default=timeout,
+                               help="timeout for ppcg compilation and testcase execution (default: %d sec)" % timeout)
+    
+    
+    max_exec_time_var = 20 
+    parser_exhaustive.add_argument("--max-exec-time-var",
+                               type=int,
+                               metavar="<int>",
+                               default=max_exec_time_var,
+                               help="max allowed variance for execution time. If the execution time of a test case is greater than best so far + max-exec-time-var then number of runs is restricted to 1 (default: %d )" % max_exec_time_var)
+    
+    parser.parse_args(namespace=config.Arguments)
+  
 if __name__ == "__main__":
     the_command_line()
     setup_PPCG_flags()
     autotune()    
-        
\ No newline at end of file
+