From 23a99413eb078cb22e0e559a3ecdbacf08956b21 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Wed, 29 Oct 2025 18:58:15 +0800 Subject: [PATCH 01/21] Add exclusion of zeros in diagnose_matrix --- src/deepxtrace/diagnose.py | 138 +++++++++++++++++++++++++++---------- 1 file changed, 101 insertions(+), 37 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index e75051b..e6036f9 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -117,6 +117,7 @@ class Diagnose: DEEPEP_DIAGNOSE_THRESHOLD_COL: determine threshold for abnormal columns. Default 3.0. DEEPEP_DIAGNOSE_THRESHOLD_ROW: determine threshold for abnormal rows. Default 3.0. DEEPEP_DIAGNOSE_THRESHOLD_POINT: determine threshold for abnormal individual points. Default 5.0. + DEEPEP_DIAGNOSE_EXCLUDING_ZEROS: controls whether excluding zeors in diagnose_matrix. Default 0. """ @@ -161,6 +162,7 @@ def __init__( os.getenv( "DEEPEP_DIAGNOSE_THRESHOLD_POINT", 5.0)) + self.excluing_zeros = int(os.getenv("DEEPEP_DIAGNOSE_EXCLUDING_ZEROS", 0)) # Initialize the diagnose self.group = group @@ -306,7 +308,7 @@ def _setup_logger_internal( @staticmethod def diagnose_matrix( mat, thres_col=3.0, thres_row=3.0, thres_point=5.0, - suppress_points_in_strong_rowscols=True + suppress_points_in_strong_rowscols=True, excluing_zeros=0 ): """ Detect abnormal columns, rows, and individual points in a 2D wait-time matrix. @@ -325,43 +327,105 @@ def diagnose_matrix( "abnormal_points": List[List[int, int, float, float]] # abnormal points, [row, col, value, normalized_value] } """ - # 1. Check for abnormal columns - col_means = mat.mean(axis=0) - # z_col = (col_means - col_means.mean()) / (col_means.std() + 1e-8) - z_col = col_means / (col_means.mean() + 1e-8) - abnormal_cols = [ - [j, col_means[j], z_col[j]] - for j in np.where(z_col > thres_col)[0] - ] - - # 2. Check for abnormal rows - row_means = mat.mean(axis=1) - # z_row = (row_means - row_means.mean()) / (row_means.std() + 1e-8) - z_row = row_means / (row_means.mean() + 1e-8) - abnormal_rows = [ - [i, row_means[i], z_row[i]] - for i in np.where(z_row > thres_row)[0] - ] - - # 3. Check for abnormal single points - # z_all = (mat - mat.mean()) / (mat.std() + 1e-8) - z_all = mat / (mat.mean() + 1e-8) - # Get all positions with z-score > threshold - abnormal_points = [ - [i, j, mat[i, j], z_all[i, j]] - for i in range(mat.shape[0]) - for j in range(mat.shape[1]) - if z_all[i, j] > thres_point - ] - # Optionally remove points that are in already detected abnormal rows - # or columns - if suppress_points_in_strong_rowscols: - strong_rows = [row[0] for row in abnormal_rows] - strong_cols = [col[0] for col in abnormal_cols] + if (excluing_zeros == 0): + # 1. Check for abnormal columns (including zeros) + col_means = mat.mean(axis=0) + # z_col = (col_means - col_means.mean()) / (col_means.std() + 1e-8) + z_col = col_means / (col_means.mean() + 1e-8) + abnormal_cols = [ + [j, col_means[j], z_col[j]] + for j in np.where(z_col > thres_col)[0] + ] + + # 2. Check for abnormal rows (including zeros) + row_means = mat.mean(axis=1) + # z_row = (row_means - row_means.mean()) / (row_means.std() + 1e-8) + z_row = row_means / (row_means.mean() + 1e-8) + abnormal_rows = [ + [i, row_means[i], z_row[i]] + for i in np.where(z_row > thres_row)[0] + ] + + # 3. Check for abnormal single points (including zeros) + # z_all = (mat - mat.mean()) / (mat.std() + 1e-8) + z_all = mat / (mat.mean() + 1e-8) + # Get all positions with z-score > threshold + abnormal_points = [ + [i, j, mat[i, j], z_all[i, j]] + for i in range(mat.shape[0]) + for j in range(mat.shape[1]) + if z_all[i, j] > thres_point + ] + # Optionally remove points that are in already detected abnormal rows or columns + if suppress_points_in_strong_rowscols: + strong_rows = [row[0] for row in abnormal_rows] + strong_cols = [col[0] for col in abnormal_cols] + abnormal_points = [ + [i, j, v, z] for [i, j, v, z] in abnormal_points + if i not in strong_rows and j not in strong_cols + ] + else: + # 1. Check for abnormal columns (excluding zeros in columns) + col_means = np.array([ + mat[:, j][mat[:, j] != 0].mean() # Calculate mean of non-zero values in column + if np.any(mat[:, j] != 0) # If column contains non-zero values + else 0 # Else set to 0 (avoid empty column errors) + for j in range(mat.shape[1]) + ]) + # Calculate normalized values (exclude all-zero columns) + valid_cols = np.where(col_means != 0)[0] # Indices of columns with non-zero mean + z_col = np.zeros_like(col_means) # Initialize all-zero array + if len(valid_cols) > 0: + z_col[valid_cols] = col_means[valid_cols] / (col_means[valid_cols].mean() + 1e-8) + # Detect abnormal columns (only non-zero columns) + abnormal_cols = [ + [j, col_means[j], z_col[j]] + for j in valid_cols + if z_col[j] > thres_col + ] + + # 2. Check for abnormal rows (excluding zeros in rows) + row_means = np.array([ + mat[i, :][mat[i, :] != 0].mean() # Calculate mean of non-zero values in row + if np.any(mat[i, :] != 0) # If row contains non-zero values + else 0 # Else set to 0 (avoid empty row errors) + for i in range(mat.shape[0]) + ]) + # Calculate normalized values (exclude all-zero rows) + valid_rows = np.where(row_means != 0)[0] # Indices of rows with non-zero mean + z_row = np.zeros_like(row_means) # Initialize all-zero array + if len(valid_rows) > 0: + z_row[valid_rows] = row_means[valid_rows] / (row_means[valid_rows].mean() + 1e-8) + # Detect abnormal rows (only non-zero rows) + abnormal_rows = [ + [i, row_means[i], z_row[i]] + for i in valid_rows + if z_row[i] > thres_row + ] + + # 3. Check for abnormal single points (excluding zeros) + mask = mat != 0 # Create mask for non-zero values + z_all = np.zeros_like(mat, dtype=float) # Initialize all-zero array + if np.any(mask): # If non-zero values exist + # Calculate mean of non-zero values (global) + nonzero_mean = mat[mask].mean() + z_all[mask] = mat[mask] / (nonzero_mean + 1e-8) # Normalize only non-zero values + # Detect abnormal points (non-zero values with z-score > threshold) abnormal_points = [ - [i, j, v, z] for [i, j, v, z] in abnormal_points - if i not in strong_rows and j not in strong_cols + [i, j, mat[i, j], z_all[i, j]] + for i in range(mat.shape[0]) + for j in range(mat.shape[1]) + if mask[i, j] and z_all[i, j] > thres_point # Ensure non-zero and abnormal ] + # Optionally remove points in already detected abnormal rows/columns + if suppress_points_in_strong_rowscols: + strong_rows = {row[0] for row in abnormal_rows} # Use set for faster lookup + strong_cols = {col[0] for col in abnormal_cols} + abnormal_points = [ + [i, j, v, z] for [i, j, v, z] in abnormal_points + if i not in strong_rows and j not in strong_cols + ] + # 4. Return for automatic processing return { "abnormal_cols": abnormal_cols, @@ -436,7 +500,7 @@ def _gather_diagnose_stats_internal( stats_arr = torch.stack(self.gather_tensor, dim=0).numpy() for i, name in enumerate(["Dispatch", "Combine"]): res = Diagnose.diagnose_matrix( - stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point) + stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point, excluing_zeros=self.excluing_zeros) results.append(res) self.logger.info( f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") From 329c1d45a7f012feb620885b4c7b834be94e1cbc Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Wed, 29 Oct 2025 19:04:07 +0800 Subject: [PATCH 02/21] Update src/deepxtrace/diagnose.py Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com> --- src/deepxtrace/diagnose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index e6036f9..72316c6 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -117,7 +117,7 @@ class Diagnose: DEEPEP_DIAGNOSE_THRESHOLD_COL: determine threshold for abnormal columns. Default 3.0. DEEPEP_DIAGNOSE_THRESHOLD_ROW: determine threshold for abnormal rows. Default 3.0. DEEPEP_DIAGNOSE_THRESHOLD_POINT: determine threshold for abnormal individual points. Default 5.0. - DEEPEP_DIAGNOSE_EXCLUDING_ZEROS: controls whether excluding zeors in diagnose_matrix. Default 0. + DEEPEP_DIAGNOSE_EXCLUDING_ZEROS: controls whether excluding zeros in diagnose_matrix. Default 0. """ From dbd190cd71a222580a7e69baa0bbfd8f5c5e154c Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Wed, 29 Oct 2025 19:04:33 +0800 Subject: [PATCH 03/21] Update src/deepxtrace/diagnose.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/deepxtrace/diagnose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 72316c6..e9973be 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -162,7 +162,7 @@ def __init__( os.getenv( "DEEPEP_DIAGNOSE_THRESHOLD_POINT", 5.0)) - self.excluing_zeros = int(os.getenv("DEEPEP_DIAGNOSE_EXCLUDING_ZEROS", 0)) + self.excluding_zeros = int(os.getenv("DEEPEP_DIAGNOSE_EXCLUDING_ZEROS", 0)) # Initialize the diagnose self.group = group From 14e9cd249289a94183c11e6fba533f708993382f Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Wed, 29 Oct 2025 19:45:10 +0800 Subject: [PATCH 04/21] Update src/deepxtrace/diagnose.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- src/deepxtrace/diagnose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index e9973be..435f741 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -327,7 +327,7 @@ def diagnose_matrix( "abnormal_points": List[List[int, int, float, float]] # abnormal points, [row, col, value, normalized_value] } """ - if (excluing_zeros == 0): + if excluing_zeros == 0: # 1. Check for abnormal columns (including zeros) col_means = mat.mean(axis=0) # z_col = (col_means - col_means.mean()) / (col_means.std() + 1e-8) From 5ec3877e399804522bf20fab589e200ad2a42d37 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 10:44:33 +0800 Subject: [PATCH 05/21] autopep8 Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 89 +++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 30 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 435f741..2f39821 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -162,7 +162,8 @@ def __init__( os.getenv( "DEEPEP_DIAGNOSE_THRESHOLD_POINT", 5.0)) - self.excluding_zeros = int(os.getenv("DEEPEP_DIAGNOSE_EXCLUDING_ZEROS", 0)) + self.excluding_zeros = int( + os.getenv("DEEPEP_DIAGNOSE_EXCLUDING_ZEROS", 0)) # Initialize the diagnose self.group = group @@ -336,7 +337,7 @@ def diagnose_matrix( [j, col_means[j], z_col[j]] for j in np.where(z_col > thres_col)[0] ] - + # 2. Check for abnormal rows (including zeros) row_means = mat.mean(axis=1) # z_row = (row_means - row_means.mean()) / (row_means.std() + 1e-8) @@ -345,7 +346,7 @@ def diagnose_matrix( [i, row_means[i], z_row[i]] for i in np.where(z_row > thres_row)[0] ] - + # 3. Check for abnormal single points (including zeros) # z_all = (mat - mat.mean()) / (mat.std() + 1e-8) z_all = mat / (mat.mean() + 1e-8) @@ -356,70 +357,88 @@ def diagnose_matrix( for j in range(mat.shape[1]) if z_all[i, j] > thres_point ] - # Optionally remove points that are in already detected abnormal rows or columns + # Optionally remove points that are in already detected abnormal + # rows or columns if suppress_points_in_strong_rowscols: strong_rows = [row[0] for row in abnormal_rows] strong_cols = [col[0] for col in abnormal_cols] abnormal_points = [ [i, j, v, z] for [i, j, v, z] in abnormal_points if i not in strong_rows and j not in strong_cols - ] - else: + ] + else: # 1. Check for abnormal columns (excluding zeros in columns) col_means = np.array([ - mat[:, j][mat[:, j] != 0].mean() # Calculate mean of non-zero values in column - if np.any(mat[:, j] != 0) # If column contains non-zero values - else 0 # Else set to 0 (avoid empty column errors) + # Calculate mean of non-zero values in column + mat[:, j][mat[:, j] != 0].mean() + # If column contains non-zero values + if np.any(mat[:, j] != 0) + # Else set to 0 (avoid empty column errors) + else 0 for j in range(mat.shape[1]) ]) # Calculate normalized values (exclude all-zero columns) - valid_cols = np.where(col_means != 0)[0] # Indices of columns with non-zero mean - z_col = np.zeros_like(col_means) # Initialize all-zero array + # Indices of columns with non-zero mean + valid_cols = np.where(col_means != 0)[0] + # Initialize all-zero array + z_col = np.zeros_like(col_means) if len(valid_cols) > 0: - z_col[valid_cols] = col_means[valid_cols] / (col_means[valid_cols].mean() + 1e-8) + z_col[valid_cols] = col_means[valid_cols] / \ + (col_means[valid_cols].mean() + 1e-8) # Detect abnormal columns (only non-zero columns) abnormal_cols = [ [j, col_means[j], z_col[j]] for j in valid_cols if z_col[j] > thres_col ] - + # 2. Check for abnormal rows (excluding zeros in rows) row_means = np.array([ - mat[i, :][mat[i, :] != 0].mean() # Calculate mean of non-zero values in row - if np.any(mat[i, :] != 0) # If row contains non-zero values - else 0 # Else set to 0 (avoid empty row errors) + # Calculate mean of non-zero values in row + mat[i, :][mat[i, :] != 0].mean() + # If row contains non-zero values + if np.any(mat[i, :] != 0) + # Else set to 0 (avoid empty row errors) + else 0 for i in range(mat.shape[0]) ]) # Calculate normalized values (exclude all-zero rows) - valid_rows = np.where(row_means != 0)[0] # Indices of rows with non-zero mean - z_row = np.zeros_like(row_means) # Initialize all-zero array + # Indices of rows with non-zero mean + valid_rows = np.where(row_means != 0)[0] + # Initialize all-zero array + z_row = np.zeros_like(row_means) if len(valid_rows) > 0: - z_row[valid_rows] = row_means[valid_rows] / (row_means[valid_rows].mean() + 1e-8) + z_row[valid_rows] = row_means[valid_rows] / \ + (row_means[valid_rows].mean() + 1e-8) # Detect abnormal rows (only non-zero rows) abnormal_rows = [ [i, row_means[i], z_row[i]] for i in valid_rows if z_row[i] > thres_row - ] - + ] + # 3. Check for abnormal single points (excluding zeros) mask = mat != 0 # Create mask for non-zero values - z_all = np.zeros_like(mat, dtype=float) # Initialize all-zero array + # Initialize all-zero array + z_all = np.zeros_like(mat, dtype=float) if np.any(mask): # If non-zero values exist # Calculate mean of non-zero values (global) nonzero_mean = mat[mask].mean() - z_all[mask] = mat[mask] / (nonzero_mean + 1e-8) # Normalize only non-zero values + # Normalize only non-zero values + z_all[mask] = mat[mask] / (nonzero_mean + 1e-8) # Detect abnormal points (non-zero values with z-score > threshold) abnormal_points = [ [i, j, mat[i, j], z_all[i, j]] for i in range(mat.shape[0]) for j in range(mat.shape[1]) - if mask[i, j] and z_all[i, j] > thres_point # Ensure non-zero and abnormal + # Ensure non-zero and abnormal + if mask[i, j] and z_all[i, j] > thres_point ] - # Optionally remove points in already detected abnormal rows/columns + # Optionally remove points in already detected abnormal + # rows/columns if suppress_points_in_strong_rowscols: - strong_rows = {row[0] for row in abnormal_rows} # Use set for faster lookup + # Use set for faster lookup + strong_rows = {row[0] for row in abnormal_rows} strong_cols = {col[0] for col in abnormal_cols} abnormal_points = [ [i, j, v, z] for [i, j, v, z] in abnormal_points @@ -462,7 +481,10 @@ def run_diagnose(): except Exception as e: self.logger.info( - f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size} Rank: {self.rank} dist error: {e}, diagnose thread exit.") + f"[Diagnose] InstanceID: { + self.instance_id} EPSize: { + self.group_size} Rank: { + self.rank} dist error: {e}, diagnose thread exit.") logging.shutdown() return @@ -499,11 +521,18 @@ def _gather_diagnose_stats_internal( else: stats_arr = torch.stack(self.gather_tensor, dim=0).numpy() for i, name in enumerate(["Dispatch", "Combine"]): - res = Diagnose.diagnose_matrix( - stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point, excluing_zeros=self.excluing_zeros) + res = Diagnose.diagnose_matrix(stats_arr[:, + i, + :], + thres_col=self.thres_col, + thres_row=self.thres_row, + thres_point=self.thres_point, + excluing_zeros=self.excluing_zeros) results.append(res) self.logger.info( - f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") + f"[Diagnose] InstanceID: { + self.instance_id} EPSize: { + self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") if self.enable_details: for idx, row in enumerate(stats_arr[:, i, :]): self.logger.info( From 1afd8a9a7b31addc45e58193c8855977a9d0195e Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 10:54:51 +0800 Subject: [PATCH 06/21] format Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 2f39821..c50d08e 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -480,11 +480,7 @@ def run_diagnose(): self._reset_normal_stats() except Exception as e: - self.logger.info( - f"[Diagnose] InstanceID: { - self.instance_id} EPSize: { - self.group_size} Rank: { - self.rank} dist error: {e}, diagnose thread exit.") + self.logger.info(f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size} Rank: {self.rank} dist error: {e}, diagnose thread exit.") logging.shutdown() return @@ -521,18 +517,11 @@ def _gather_diagnose_stats_internal( else: stats_arr = torch.stack(self.gather_tensor, dim=0).numpy() for i, name in enumerate(["Dispatch", "Combine"]): - res = Diagnose.diagnose_matrix(stats_arr[:, - i, - :], - thres_col=self.thres_col, - thres_row=self.thres_row, - thres_point=self.thres_point, - excluing_zeros=self.excluing_zeros) + res = Diagnose.diagnose_matrix( + stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point) results.append(res) self.logger.info( - f"[Diagnose] InstanceID: { - self.instance_id} EPSize: { - self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") + f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") if self.enable_details: for idx, row in enumerate(stats_arr[:, i, :]): self.logger.info( From 463843a536a7a2960ee0fff27bd670e024419bbf Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 10:59:36 +0800 Subject: [PATCH 07/21] format Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index c50d08e..29d3db8 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -480,7 +480,11 @@ def run_diagnose(): self._reset_normal_stats() except Exception as e: - self.logger.info(f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size} Rank: {self.rank} dist error: {e}, diagnose thread exit.") + self.logger.info( + f"[Diagnose] InstanceID: { \ + self.instance_id} EPSize: { \ + self.group_size} Rank: { \ + self.rank} dist error: {e}, diagnose thread exit.") logging.shutdown() return @@ -521,7 +525,9 @@ def _gather_diagnose_stats_internal( stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point) results.append(res) self.logger.info( - f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") + f"[Diagnose] InstanceID: { \ + self.instance_id} EPSize: { \ + self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") if self.enable_details: for idx, row in enumerate(stats_arr[:, i, :]): self.logger.info( From dc0f8470067df5978616ffb8d2f0dff2fa51f3db Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 14:42:06 +0800 Subject: [PATCH 08/21] essential Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 29d3db8..ff4ab35 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -368,15 +368,7 @@ def diagnose_matrix( ] else: # 1. Check for abnormal columns (excluding zeros in columns) - col_means = np.array([ - # Calculate mean of non-zero values in column - mat[:, j][mat[:, j] != 0].mean() - # If column contains non-zero values - if np.any(mat[:, j] != 0) - # Else set to 0 (avoid empty column errors) - else 0 - for j in range(mat.shape[1]) - ]) + col_means = np.ma.masked_equal(mat, 0).mean(axis=0).filled(0) # Calculate normalized values (exclude all-zero columns) # Indices of columns with non-zero mean valid_cols = np.where(col_means != 0)[0] @@ -393,15 +385,7 @@ def diagnose_matrix( ] # 2. Check for abnormal rows (excluding zeros in rows) - row_means = np.array([ - # Calculate mean of non-zero values in row - mat[i, :][mat[i, :] != 0].mean() - # If row contains non-zero values - if np.any(mat[i, :] != 0) - # Else set to 0 (avoid empty row errors) - else 0 - for i in range(mat.shape[0]) - ]) + row_means = np.ma.masked_equal(mat, 0).mean(axis=1).filled(0) # Calculate normalized values (exclude all-zero rows) # Indices of rows with non-zero mean valid_rows = np.where(row_means != 0)[0] @@ -481,9 +465,9 @@ def run_diagnose(): except Exception as e: self.logger.info( - f"[Diagnose] InstanceID: { \ - self.instance_id} EPSize: { \ - self.group_size} Rank: { \ + f"[Diagnose] InstanceID: { + self.instance_id} EPSize: { + self.group_size} Rank: { self.rank} dist error: {e}, diagnose thread exit.") logging.shutdown() return @@ -525,8 +509,8 @@ def _gather_diagnose_stats_internal( stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point) results.append(res) self.logger.info( - f"[Diagnose] InstanceID: { \ - self.instance_id} EPSize: { \ + f"[Diagnose] InstanceID: { + self.instance_id} EPSize: { self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") if self.enable_details: for idx, row in enumerate(stats_arr[:, i, :]): From 349b537aacc80297244410e565af6fa3999ed6e5 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 15:45:56 +0800 Subject: [PATCH 09/21] format Signed-off-by: yyoean <1114146082@qq.com> --- .DS_Store | Bin 0 -> 6148 bytes src/.DS_Store | Bin 0 -> 6148 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 .DS_Store create mode 100644 src/.DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f6dd1d85d502e3be5c6a1be1d0a84c7416e243ee GIT binary patch literal 6148 zcmeHK%SyvQ6rE|SO({Ya3SADkEm&>wftwKP4;ayfN=;1BV9b;xHH%WnT7Sqd@q4^? zW&#$o7P0rj%(>5*%z?~%uMXB z1AcpxWh`Y6LGk_j<0#7qgHPUQwl+4~R@>@YcixjMyxh;{nd_%FXkAJf2bJyzSJAYX z*xP3^$^9sqrm7$cr;u`c9VMYGTscp|Ox1ciV0Em{#O^H?qv3I1439?3zF3?b4Eka; zJX|h2*3RzU>BZzZeo5q;rjrBPO12DE@D7Su&8wFtu}mJpQ)O3Kgv0&<{U z7p(4j(?F{y28e+g25^56&=4JirAD=NK!?|7jJFU`K*zTPqA=(fEHy#|gzHj3UCPZ9 zgX?ne3zO# Q<$!b%P=ruN4EzEEU!Af^iU0rr literal 0 HcmV?d00001 diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..53470f38b94129ddc131581c3da181b98e0f2429 GIT binary patch literal 6148 zcmeHK%}N6?5T3MEw-%uXg&qT53szhF!OK$h1zgdCO5JrAU0gS%-CCg(_N*`DllVN& zB&k?h4_-v<3{1Xcem3OGl1TtSvj*$(itgO3!%{8c!msm;KMCM zGtq4Liww}-twM|ZeKcY{O)=zj~kN!-sGjdxKfmX?-RoE4|yTzmIw=w*I3 z=(qjeC5_IMN`gt>56LN}m&I~ge2If zw*;ZI=vvGTq6bBoR78_1>=Q$nbhJww=UU7Snsg9)X8ewwS=bkf(6ggm>TnRQL2j7= zW?-6uq8V1H{-1pR{+~|b5i`IH{3`}T>DW7N Date: Thu, 30 Oct 2025 15:49:34 +0800 Subject: [PATCH 10/21] format Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index ff4ab35..d5d0592 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -464,11 +464,7 @@ def run_diagnose(): self._reset_normal_stats() except Exception as e: - self.logger.info( - f"[Diagnose] InstanceID: { - self.instance_id} EPSize: { - self.group_size} Rank: { - self.rank} dist error: {e}, diagnose thread exit.") + self.logger.info(f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size} Rank: {self.rank} dist error: {e}, diagnose thread exit.") logging.shutdown() return From d5da6558cbf83e95b29297d47b2bd239d4fa7322 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 15:52:32 +0800 Subject: [PATCH 11/21] format Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index d5d0592..594ceef 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -464,7 +464,8 @@ def run_diagnose(): self._reset_normal_stats() except Exception as e: - self.logger.info(f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size} Rank: {self.rank} dist error: {e}, diagnose thread exit.") + self.logger.info( + f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size} Rank: {self.rank} dist error: {e}, diagnose thread exit.") logging.shutdown() return From 0e18f72ba87aaa14c6321f094de1b2c431f7b700 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 15:55:23 +0800 Subject: [PATCH 12/21] formart Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 594ceef..80f58d6 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -506,9 +506,7 @@ def _gather_diagnose_stats_internal( stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point) results.append(res) self.logger.info( - f"[Diagnose] InstanceID: { - self.instance_id} EPSize: { - self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") + f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") if self.enable_details: for idx, row in enumerate(stats_arr[:, i, :]): self.logger.info( From 0f261d086f440242158d4aefa99c0ee91c7e1c0c Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 15:59:07 +0800 Subject: [PATCH 13/21] format Signed-off-by: yyoean <1114146082@qq.com> --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ddc2bd1..c65674c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,8 @@ dist/ __pycache__/ *.so *~ +src/.DS_Store +.DS_Store venv/ # due to using nox and pytest From 51be7ffa805330ae723a4d4ee8ca18dff9f094d3 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Thu, 30 Oct 2025 16:01:36 +0800 Subject: [PATCH 14/21] format Signed-off-by: yyoean <1114146082@qq.com> --- .DS_Store | Bin 6148 -> 0 bytes .gitignore | 2 -- src/.DS_Store | Bin 6148 -> 0 bytes 3 files changed, 2 deletions(-) delete mode 100644 .DS_Store delete mode 100644 src/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index f6dd1d85d502e3be5c6a1be1d0a84c7416e243ee..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%SyvQ6rE|SO({Ya3SADkEm&>wftwKP4;ayfN=;1BV9b;xHH%WnT7Sqd@q4^? zW&#$o7P0rj%(>5*%z?~%uMXB z1AcpxWh`Y6LGk_j<0#7qgHPUQwl+4~R@>@YcixjMyxh;{nd_%FXkAJf2bJyzSJAYX z*xP3^$^9sqrm7$cr;u`c9VMYGTscp|Ox1ciV0Em{#O^H?qv3I1439?3zF3?b4Eka; zJX|h2*3RzU>BZzZeo5q;rjrBPO12DE@D7Su&8wFtu}mJpQ)O3Kgv0&<{U z7p(4j(?F{y28e+g25^56&=4JirAD=NK!?|7jJFU`K*zTPqA=(fEHy#|gzHj3UCPZ9 zgX?ne3zO# Q<$!b%P=ruN4EzEEU!Af^iU0rr diff --git a/.gitignore b/.gitignore index c65674c..ddc2bd1 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,6 @@ dist/ __pycache__/ *.so *~ -src/.DS_Store -.DS_Store venv/ # due to using nox and pytest diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index 53470f38b94129ddc131581c3da181b98e0f2429..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}N6?5T3MEw-%uXg&qT53szhF!OK$h1zgdCO5JrAU0gS%-CCg(_N*`DllVN& zB&k?h4_-v<3{1Xcem3OGl1TtSvj*$(itgO3!%{8c!msm;KMCM zGtq4Liww}-twM|ZeKcY{O)=zj~kN!-sGjdxKfmX?-RoE4|yTzmIw=w*I3 z=(qjeC5_IMN`gt>56LN}m&I~ge2If zw*;ZI=vvGTq6bBoR78_1>=Q$nbhJww=UU7Snsg9)X8ewwS=bkf(6ggm>TnRQL2j7= zW?-6uq8V1H{-1pR{+~|b5i`IH{3`}T>DW7N Date: Mon, 3 Nov 2025 19:13:55 +0800 Subject: [PATCH 15/21] update Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 76 ++++++++++++-------------------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 80f58d6..f753f27 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -309,7 +309,7 @@ def _setup_logger_internal( @staticmethod def diagnose_matrix( mat, thres_col=3.0, thres_row=3.0, thres_point=5.0, - suppress_points_in_strong_rowscols=True, excluing_zeros=0 + suppress_points_in_strong_rowscols=True, excluding_zeros=0 ): """ Detect abnormal columns, rows, and individual points in a 2D wait-time matrix. @@ -328,25 +328,31 @@ def diagnose_matrix( "abnormal_points": List[List[int, int, float, float]] # abnormal points, [row, col, value, normalized_value] } """ - if excluing_zeros == 0: + if excluding_zeros == 0: # 1. Check for abnormal columns (including zeros) col_means = mat.mean(axis=0) - # z_col = (col_means - col_means.mean()) / (col_means.std() + 1e-8) - z_col = col_means / (col_means.mean() + 1e-8) - abnormal_cols = [ - [j, col_means[j], z_col[j]] - for j in np.where(z_col > thres_col)[0] - ] - + elif excluding_zeros == 1: + col_means = np.ma.masked_equal(mat, 0).mean(axis=0).filled(0) + # z_col = (col_means - col_means.mean()) / (col_means.std() + 1e-8) + z_col = col_means / (col_means.mean() + 1e-8) + abnormal_cols = [ + [j, col_means[j], z_col[j]] + for j in np.where(z_col > thres_col)[0] + ] + + if excluding_zeros == 0: # 2. Check for abnormal rows (including zeros) row_means = mat.mean(axis=1) - # z_row = (row_means - row_means.mean()) / (row_means.std() + 1e-8) - z_row = row_means / (row_means.mean() + 1e-8) - abnormal_rows = [ - [i, row_means[i], z_row[i]] - for i in np.where(z_row > thres_row)[0] - ] - + elif excluding_zeros == 1: + row_means = np.ma.masked_equal(mat, 0).mean(axis=1).filled(0) + # z_row = (row_means - row_means.mean()) / (row_means.std() + 1e-8) + z_row = row_means / (row_means.mean() + 1e-8) + abnormal_rows = [ + [i, row_means[i], z_row[i]] + for i in np.where(z_row > thres_row)[0] + ] + + if excluding_zeros == 0: # 3. Check for abnormal single points (including zeros) # z_all = (mat - mat.mean()) / (mat.std() + 1e-8) z_all = mat / (mat.mean() + 1e-8) @@ -366,41 +372,7 @@ def diagnose_matrix( [i, j, v, z] for [i, j, v, z] in abnormal_points if i not in strong_rows and j not in strong_cols ] - else: - # 1. Check for abnormal columns (excluding zeros in columns) - col_means = np.ma.masked_equal(mat, 0).mean(axis=0).filled(0) - # Calculate normalized values (exclude all-zero columns) - # Indices of columns with non-zero mean - valid_cols = np.where(col_means != 0)[0] - # Initialize all-zero array - z_col = np.zeros_like(col_means) - if len(valid_cols) > 0: - z_col[valid_cols] = col_means[valid_cols] / \ - (col_means[valid_cols].mean() + 1e-8) - # Detect abnormal columns (only non-zero columns) - abnormal_cols = [ - [j, col_means[j], z_col[j]] - for j in valid_cols - if z_col[j] > thres_col - ] - - # 2. Check for abnormal rows (excluding zeros in rows) - row_means = np.ma.masked_equal(mat, 0).mean(axis=1).filled(0) - # Calculate normalized values (exclude all-zero rows) - # Indices of rows with non-zero mean - valid_rows = np.where(row_means != 0)[0] - # Initialize all-zero array - z_row = np.zeros_like(row_means) - if len(valid_rows) > 0: - z_row[valid_rows] = row_means[valid_rows] / \ - (row_means[valid_rows].mean() + 1e-8) - # Detect abnormal rows (only non-zero rows) - abnormal_rows = [ - [i, row_means[i], z_row[i]] - for i in valid_rows - if z_row[i] > thres_row - ] - + elif excluding_zeros == 1: # 3. Check for abnormal single points (excluding zeros) mask = mat != 0 # Create mask for non-zero values # Initialize all-zero array @@ -503,7 +475,7 @@ def _gather_diagnose_stats_internal( stats_arr = torch.stack(self.gather_tensor, dim=0).numpy() for i, name in enumerate(["Dispatch", "Combine"]): res = Diagnose.diagnose_matrix( - stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point) + stats_arr[:, i, :], thres_col=self.thres_col, thres_row=self.thres_row, thres_point=self.thres_point, excluding_zeros=self.excluding_zeros) results.append(res) self.logger.info( f"[Diagnose] InstanceID: {self.instance_id} EPSize: {self.group_size}, diagnose: {res}, {name} Wait Recv Cost Per Token Matrix[src_rank, dst_rank]") From cce3028feba5a1755e1550e9d9694639bf71b526 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Mon, 3 Nov 2025 19:16:44 +0800 Subject: [PATCH 16/21] format Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index f753f27..931917b 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -351,7 +351,7 @@ def diagnose_matrix( [i, row_means[i], z_row[i]] for i in np.where(z_row > thres_row)[0] ] - + if excluding_zeros == 0: # 3. Check for abnormal single points (including zeros) # z_all = (mat - mat.mean()) / (mat.std() + 1e-8) From e64c6eda33b8c760f342b0a2f645119ed5cc9285 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Tue, 4 Nov 2025 19:36:05 +0800 Subject: [PATCH 17/21] simplify Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 78 +++++++++++++------------------------- 1 file changed, 27 insertions(+), 51 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 931917b..58448d6 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -328,11 +328,8 @@ def diagnose_matrix( "abnormal_points": List[List[int, int, float, float]] # abnormal points, [row, col, value, normalized_value] } """ - if excluding_zeros == 0: - # 1. Check for abnormal columns (including zeros) - col_means = mat.mean(axis=0) - elif excluding_zeros == 1: - col_means = np.ma.masked_equal(mat, 0).mean(axis=0).filled(0) + # 1. Check for abnormal columns (including zeros) + col_means = mat.mean(axis=0) # z_col = (col_means - col_means.mean()) / (col_means.std() + 1e-8) z_col = col_means / (col_means.mean() + 1e-8) abnormal_cols = [ @@ -340,11 +337,8 @@ def diagnose_matrix( for j in np.where(z_col > thres_col)[0] ] - if excluding_zeros == 0: - # 2. Check for abnormal rows (including zeros) - row_means = mat.mean(axis=1) - elif excluding_zeros == 1: - row_means = np.ma.masked_equal(mat, 0).mean(axis=1).filled(0) + # 2. Check for abnormal rows (including zeros) + row_means = mat.mean(axis=1) # z_row = (row_means - row_means.mean()) / (row_means.std() + 1e-8) z_row = row_means / (row_means.mean() + 1e-8) abnormal_rows = [ @@ -356,50 +350,32 @@ def diagnose_matrix( # 3. Check for abnormal single points (including zeros) # z_all = (mat - mat.mean()) / (mat.std() + 1e-8) z_all = mat / (mat.mean() + 1e-8) - # Get all positions with z-score > threshold - abnormal_points = [ - [i, j, mat[i, j], z_all[i, j]] - for i in range(mat.shape[0]) - for j in range(mat.shape[1]) - if z_all[i, j] > thres_point - ] - # Optionally remove points that are in already detected abnormal - # rows or columns - if suppress_points_in_strong_rowscols: - strong_rows = [row[0] for row in abnormal_rows] - strong_cols = [col[0] for col in abnormal_cols] - abnormal_points = [ - [i, j, v, z] for [i, j, v, z] in abnormal_points - if i not in strong_rows and j not in strong_cols - ] elif excluding_zeros == 1: - # 3. Check for abnormal single points (excluding zeros) - mask = mat != 0 # Create mask for non-zero values - # Initialize all-zero array - z_all = np.zeros_like(mat, dtype=float) - if np.any(mask): # If non-zero values exist - # Calculate mean of non-zero values (global) - nonzero_mean = mat[mask].mean() - # Normalize only non-zero values - z_all[mask] = mat[mask] / (nonzero_mean + 1e-8) - # Detect abnormal points (non-zero values with z-score > threshold) + nonzero_values = mat[mat != 0] + if len(nonzero_values) > 0: + mean_val = nonzero_values.mean() + z_all = mat / (mean_val + 1e-8) + else: + mean_val = 0 + # avoid devide zero + z_all = np.zeros_like(mat) + + # Get all positions with z-score > threshold + abnormal_points = [ + [i, j, mat[i, j], z_all[i, j]] + for i in range(mat.shape[0]) + for j in range(mat.shape[1]) + if z_all[i, j] > thres_point + ] + # Optionally remove points that are in already detected abnormal + # rows or columns + if suppress_points_in_strong_rowscols: + strong_rows = [row[0] for row in abnormal_rows] + strong_cols = [col[0] for col in abnormal_cols] abnormal_points = [ - [i, j, mat[i, j], z_all[i, j]] - for i in range(mat.shape[0]) - for j in range(mat.shape[1]) - # Ensure non-zero and abnormal - if mask[i, j] and z_all[i, j] > thres_point + [i, j, v, z] for [i, j, v, z] in abnormal_points + if i not in strong_rows and j not in strong_cols ] - # Optionally remove points in already detected abnormal - # rows/columns - if suppress_points_in_strong_rowscols: - # Use set for faster lookup - strong_rows = {row[0] for row in abnormal_rows} - strong_cols = {col[0] for col in abnormal_cols} - abnormal_points = [ - [i, j, v, z] for [i, j, v, z] in abnormal_points - if i not in strong_rows and j not in strong_cols - ] # 4. Return for automatic processing return { From d33572e03f409fedeb5c3a9252194fdcd692f4df Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Tue, 4 Nov 2025 20:05:17 +0800 Subject: [PATCH 18/21] test.py Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 2 +- tests/test_diagnose.py | 60 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 58448d6..66c728d 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -359,7 +359,7 @@ def diagnose_matrix( mean_val = 0 # avoid devide zero z_all = np.zeros_like(mat) - + # Get all positions with z-score > threshold abnormal_points = [ [i, j, mat[i, j], z_all[i, j]] diff --git a/tests/test_diagnose.py b/tests/test_diagnose.py index c8c4ea9..4cf2ff7 100644 --- a/tests/test_diagnose.py +++ b/tests/test_diagnose.py @@ -70,6 +70,41 @@ def setUp(self): [15, 17, 12, 18, 13, 13, 15, 14], ]) + self.mc2_layered = np.array([ + [169, 537, 530, 294, 173, 128, 139, 140, + 40, 0, 0, 0, 0, 0, 0, 0], + [1617, 196, 207, 170, 187, 151, 887, 174, + 0, 34, 0, 0, 0, 0, 0, 0], + [1626, 210, 194, 186, 174, 162, 864, 160, + 0, 0, 31, 0, 0, 0, 0, 0], + [1635, 324, 341, 186, 178, 153, 866, 169, + 0, 0, 0, 34, 0, 0, 0, 0], + [1635, 543, 534, 302, 176, 125, 847, 140, + 0, 0, 0, 0, 33, 0, 0, 0], + [1712, 681, 671, 401, 232, 102, 877, 132, + 0, 0, 0, 0, 0, 37, 0, 0], + [997, 656, 643, 382, 235, 172, 107, 146, 0, + 0, 0, 0, 0, 0, 42, 0], + [1918, 941, 931, 652, 448, 314, 1064, 199, + 0, 0, 0, 0, 0, 0, 0, 42], + [1480, 0, 0, 0, 0, 0, 0, 0, 167, 239, 343, + 154, 148, 150, 155, 143], + [0, 46, 0, 0, 0, 0, 0, 0, 1599, 169, 237, + 156, 149, 146, 860, 140], + [0, 0, 48, 0, 0, 0, 0, 0, 1610, 161, 168, + 159, 150, 161, 846, 145], + [0, 0, 0, 41, 0, 0, 0, 0, 1687, 320, 452, + 82, 139, 166, 875, 136], + [0, 0, 0, 0, 42, 0, 0, 0, 1802, 481, 616, + 242, 168, 214, 918, 166], + [0, 0, 0, 0, 0, 35, 0, 0, 1746, 417, 559, + 226, 171, 185, 903, 151], + [0, 0, 0, 0, 0, 0, 738, 0, 1011, 393, 529, + 171, 150, 162, 176, 154], + [0, 0, 0, 0, 0, 0, 0, 36, 1866, 555, 693, + 325, 211, 222, 965, 180] + ]) + def test_diagnose_row(self): res = ds.Diagnose.diagnose_matrix(self.abnormal_row) self.assertEqual( @@ -105,6 +140,31 @@ def test_diagnose_point(self): 'abnormal_cols': [], 'abnormal_rows': [], 'abnormal_points': [ [3, 4, 125, 7.279344854723584]]}) + def test_mc2_layered(self): + res = ds.Diagnose.diagnose_matrix( + mat=self.mc2_layered, excluding_zeros=0) + self.assertEqual( + res, { + 'abnormal_cols': [ + [ + 0, 799.3125, 3.2102414457222475]], 'abnormal_rows': [], 'abnormal_points': [ + [ + 9, 8, 1599, 6.421988986422549], [ + 10, 8, 1610, 6.466167772445468], [ + 11, 8, 1687, 6.775419274605904], [ + 12, 8, 1802, 7.237288401209152], [ + 13, 8, 1746, 7.012378217819744], [ + 15, 8, 1866, 7.494328610797046]]}) + + res = ds.Diagnose.diagnose_matrix( + mat=self.mc2_layered, excluding_zeros=1) + self.assertEqual(res, + {'abnormal_cols': [[0, + 799.3125, + 3.2102414457222475]], + 'abnormal_rows': [], + 'abnormal_points': []}) + if __name__ == '__main__': unittest.main() From 7a456edafb4352d18208c74494bc4c06c2c00cab Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Tue, 4 Nov 2025 22:13:52 +0800 Subject: [PATCH 19/21] format Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 66c728d..16b6fa7 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -328,7 +328,7 @@ def diagnose_matrix( "abnormal_points": List[List[int, int, float, float]] # abnormal points, [row, col, value, normalized_value] } """ - # 1. Check for abnormal columns (including zeros) + # 1. Check for abnormal columns col_means = mat.mean(axis=0) # z_col = (col_means - col_means.mean()) / (col_means.std() + 1e-8) z_col = col_means / (col_means.mean() + 1e-8) @@ -337,7 +337,7 @@ def diagnose_matrix( for j in np.where(z_col > thres_col)[0] ] - # 2. Check for abnormal rows (including zeros) + # 2. Check for abnormal rows row_means = mat.mean(axis=1) # z_row = (row_means - row_means.mean()) / (row_means.std() + 1e-8) z_row = row_means / (row_means.mean() + 1e-8) @@ -346,8 +346,8 @@ def diagnose_matrix( for i in np.where(z_row > thres_row)[0] ] + # 3. Check for abnormal single points if excluding_zeros == 0: - # 3. Check for abnormal single points (including zeros) # z_all = (mat - mat.mean()) / (mat.std() + 1e-8) z_all = mat / (mat.mean() + 1e-8) elif excluding_zeros == 1: @@ -367,8 +367,8 @@ def diagnose_matrix( for j in range(mat.shape[1]) if z_all[i, j] > thres_point ] - # Optionally remove points that are in already detected abnormal - # rows or columns + # Optionally remove points that are in already detected abnormal rows + # or columns if suppress_points_in_strong_rowscols: strong_rows = [row[0] for row in abnormal_rows] strong_cols = [col[0] for col in abnormal_cols] From 860c405380b15fa04fb63b3633d3a606c54a2add Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Tue, 4 Nov 2025 22:16:17 +0800 Subject: [PATCH 20/21] format Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 16b6fa7..6506aca 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -367,7 +367,7 @@ def diagnose_matrix( for j in range(mat.shape[1]) if z_all[i, j] > thres_point ] - # Optionally remove points that are in already detected abnormal rows + # Optionally remove points that are in already detected abnormal rows # or columns if suppress_points_in_strong_rowscols: strong_rows = [row[0] for row in abnormal_rows] From ed6ee3287f750871cb41de97bfb9540562950300 Mon Sep 17 00:00:00 2001 From: yyoean <1114146082@qq.com> Date: Wed, 5 Nov 2025 16:33:00 +0800 Subject: [PATCH 21/21] format Signed-off-by: yyoean <1114146082@qq.com> --- src/deepxtrace/diagnose.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/deepxtrace/diagnose.py b/src/deepxtrace/diagnose.py index 6506aca..a9b8c55 100644 --- a/src/deepxtrace/diagnose.py +++ b/src/deepxtrace/diagnose.py @@ -356,9 +356,7 @@ def diagnose_matrix( mean_val = nonzero_values.mean() z_all = mat / (mean_val + 1e-8) else: - mean_val = 0 - # avoid devide zero - z_all = np.zeros_like(mat) + z_all = mat # Get all positions with z-score > threshold abnormal_points = [