diff --git a/api/common/benchmark.py b/api/common/benchmark.py index 1cc7e713bd..4482fde66f 100644 --- a/api/common/benchmark.py +++ b/api/common/benchmark.py @@ -122,7 +122,12 @@ def layers(self, api_name, module_name=None, **kwargs): def append_gradients(self, targets, inputs): pass - def get_running_stats(self, use_gpu, config, runtimes, walltimes=None): + def get_running_stats(self, + use_gpu, + config, + runtimes, + walltimes=None, + repeat=None): try: module_name = "torch" if self._framework == "pytorch" else self._framework module = importlib.import_module(module_name) @@ -143,9 +148,17 @@ def get_running_stats(self, use_gpu, config, runtimes, walltimes=None): if walltimes is not None: stats["wall_time"] = walltimes - flop, byte = self.compute_flop_and_byte(config) - if flop is not None: - stats["flop"] = flop - if byte is not None: - stats["byte"] = byte + if repeat is not None: + stats["repeat"] = repeat + + try: + flop, byte = self.compute_flop_and_byte(config) + if flop is not None: + stats["flop"] = flop + if byte is not None: + stats["byte"] = byte + except Exception: + print("Failed to call compute_flop_and_byte for %s." 
% + (self._framework)) + return stats diff --git a/api/common/launch.py b/api/common/launch.py index cdd05027c4..d9a8135b5b 100644 --- a/api/common/launch.py +++ b/api/common/launch.py @@ -474,6 +474,8 @@ def _set_args(args, arg, value): if task == "speed": args.benchmark_script_args.append(" --gpu_time ") args.benchmark_script_args.append(str(output_time)) + _set_args(args.benchmark_script_args, + "--get_status_without_running", "True") if task == "scheduling": args.benchmark_script_args.append(" --scheduling_times ") args.benchmark_script_args.append("\"" + str(output_time) + "\"") diff --git a/api/common/main.py b/api/common/main.py index 4f4abfe7b9..c1b8a073e7 100644 --- a/api/common/main.py +++ b/api/common/main.py @@ -135,8 +135,13 @@ def parse_args(): "task should be paddle, tensorflow, tf, pytorch, torch, both") if args.get_status_without_running: - assert args.task == "scheduling", "task must be 'scheduling' if get_status_without_running is True." - assert args.scheduling_times != "{}", "scheduling_times can't be {} if task is 'scheduling' and get_status_without_running is True." + assert args.task in [ + "speed", "scheduling" + ], "task must be 'speed' or 'scheduling' if get_status_without_running is True." + if args.task == "speed": + assert args.gpu_time != 0, "gpu_time can't be 0 if task is 'speed' and get_status_without_running is True." + if args.task == "scheduling": + assert args.scheduling_times != "{}", "scheduling_times can't be {} if task is 'scheduling' and get_status_without_running is True." if args.task == "accuracy": args.repeat = 1 diff --git a/api/common/paddle_op_benchmark.py b/api/common/paddle_op_benchmark.py index 827d0b55aa..3ad931aeb6 100644 --- a/api/common/paddle_op_benchmark.py +++ b/api/common/paddle_op_benchmark.py @@ -442,7 +442,8 @@ def _run_main_iter(step=1): # "_run_main_iter" needs to be executed firstly because # parameter "self._backward" needs to be update. 
if get_status_without_running: - stats = self.get_running_stats(use_gpu, config, None) + stats = self.get_running_stats( + use_gpu, config, runtimes=None, repeat=repeat) return None, stats runtimes = [] diff --git a/api/common/utils.py b/api/common/utils.py index 34f027fa39..6435bd5a68 100644 --- a/api/common/utils.py +++ b/api/common/utils.py @@ -270,33 +270,27 @@ def check_outputs(output_list, sys.exit(1) -def print_benchmark_result(result, - task="speed", - log_level=0, - config_params=None): - assert isinstance(result, dict), "Input result should be a dict." +def _print_runtime(log_level, runtimes, walltimes): + if runtimes is None: + return - status = collections.OrderedDict() - status["framework"] = result["framework"] - status["version"] = result["version"] - status["name"] = result["name"] - status["device"] = result["device"] - status["backward"] = result["backward"] + # print all times + repeat = len(runtimes) + seg_range = [0, 0] + if log_level == 0: + seg_range = [0, repeat] + elif log_level == 1 and repeat > 20: + seg_range = [10, repeat - 10] + for i in range(repeat): + if i < seg_range[0] or i >= seg_range[1]: + walltime = walltimes[i] if walltimes is not None else 0 + print("Iter %4d, Runtime: %.5f ms, Walltime: %.5f ms" % + (i, runtimes[i], walltime)) - scheduling_times = result.get("scheduling_times", "{}") - if task == "scheduling" and scheduling_times is not None: - status["scheduling"] = eval(scheduling_times) - runtimes = result.get("total", None) +def _compute_average_runtime(runtimes, walltimes): if runtimes is None: - status["parameters"] = config_params - print(json.dumps(status)) - return - - walltimes = result.get("wall_time", None) - gpu_time = result.get("gpu_time", None) - stable = result.get("stable", None) - diff = result.get("diff", None) + return 0, 0, 0, 0 repeat = len(runtimes) for i in range(repeat): @@ -320,47 +314,70 @@ def print_benchmark_result(result, avg_walltime = np.average(np.sort(walltimes)[begin:end]) else: 
avg_walltime = 0 + return begin, end, avg_runtime, avg_walltime - # print all times - seg_range = [0, 0] - if log_level == 0: - seg_range = [0, repeat] - elif log_level == 1 and repeat > 20: - seg_range = [10, repeat - 10] - for i in range(len(runtimes)): - if i < seg_range[0] or i >= seg_range[1]: - walltime = walltimes[i] if walltimes is not None else 0 - print("Iter %4d, Runtime: %.5f ms, Walltime: %.5f ms" % - (i, runtimes[i], walltime)) - if avg_runtime - avg_walltime > 0.001: - total = avg_runtime - avg_walltime - else: - print( - "Average runtime (%.5f ms) is less than average walltime (%.5f ms)." - % (avg_runtime, avg_walltime)) - total = 0.001 +def print_benchmark_result(result, + task="speed", + log_level=0, + config_params=None): + assert isinstance(result, dict), "Input result should be a dict." + + status = collections.OrderedDict() + status["framework"] = result["framework"] + status["version"] = result["version"] + status["name"] = result["name"] + status["device"] = result["device"] + status["backward"] = result["backward"] + scheduling_times = result.get("scheduling_times", "{}") + if task == "scheduling" and scheduling_times is not None: + status["scheduling"] = eval(scheduling_times) + status["parameters"] = config_params + print(json.dumps(status)) + return + + stable = result.get("stable", None) + diff = result.get("diff", None) if stable is not None and diff is not None: status["precision"] = collections.OrderedDict() status["precision"]["stable"] = stable status["precision"]["diff"] = diff - status["speed"] = collections.OrderedDict() - status["speed"]["repeat"] = repeat - status["speed"]["begin"] = begin - status["speed"]["end"] = end - status["speed"]["total"] = total - status["speed"]["wall_time"] = avg_walltime - status["speed"]["total_include_wall_time"] = avg_runtime - if gpu_time is not None: - avg_gpu_time = gpu_time / repeat - status["speed"]["gpu_time"] = avg_gpu_time - - flop = result.get("flop", None) - byte = result.get("byte", 
None) - if flop is not None and abs(avg_gpu_time) > 1E-6: - status["speed"]["gflops"] = float(flop) * 1E-6 / avg_gpu_time - if byte is not None and abs(avg_gpu_time) > 1E-6: - status["speed"]["gbs"] = float(byte) * 1E-6 / avg_gpu_time + + if task == "speed": + runtimes = result.get("total", None) + walltimes = result.get("wall_time", None) + gpu_time = result.get("gpu_time", None) + + repeat = len(runtimes) if runtimes is not None else result.get( + "repeat", 1) + begin, end, avg_runtime, avg_walltime = _compute_average_runtime( + runtimes, walltimes) + _print_runtime(log_level, runtimes, walltimes) + if avg_runtime - avg_walltime > 0.001: + total = avg_runtime - avg_walltime + else: + print( + "Average runtime (%.5f ms) is less than average walltime (%.5f ms)." + % (avg_runtime, avg_walltime)) + total = 0.001 + + status["speed"] = collections.OrderedDict() + status["speed"]["repeat"] = repeat + status["speed"]["begin"] = begin + status["speed"]["end"] = end + status["speed"]["total"] = total + status["speed"]["wall_time"] = avg_walltime + status["speed"]["total_include_wall_time"] = avg_runtime + if gpu_time is not None: + avg_gpu_time = gpu_time / repeat + status["speed"]["gpu_time"] = avg_gpu_time + + flop = result.get("flop", None) + byte = result.get("byte", None) + if flop is not None and abs(avg_gpu_time) > 1E-6: + status["speed"]["gflops"] = float(flop) * 1E-6 / avg_gpu_time + if byte is not None and abs(avg_gpu_time) > 1E-6: + status["speed"]["gbs"] = float(byte) * 1E-6 / avg_gpu_time status["parameters"] = config_params print(json.dumps(status))