Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions source/clients/qsh/sge_client_ijs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
*
* Portions of this code are Copyright 2011 Univa Inc.
*
* Portions of this software are Copyright (c) 2023-2025 HPC-Gridware GmbH
* Portions of this software are Copyright (c) 2024-2025 HPC-Gridware GmbH
*
************************************************************************/
/*___INFO__MARK_END__*/
Expand Down Expand Up @@ -291,7 +291,7 @@ static void client_check_window_change(COMM_HANDLE *handle)
*/
received_window_change_signal = 0;
if (ioctl(fileno(stdin), TIOCGWINSZ, &ws) >= 0) {
DPRINTF("sendig WINDOW_SIZE_CTRL_MSG with new window size: %d, %d, %d, %d to shepherd\n",
DPRINTF("sending WINDOW_SIZE_CTRL_MSG with new window size: %d, %d, %d, %d to shepherd\n",
ws.ws_row, ws.ws_col, ws.ws_xpixel, ws.ws_ypixel);

snprintf(buf, sizeof(buf), "WS %d %d %d %d", ws.ws_row, ws.ws_col, ws.ws_xpixel, ws.ws_ypixel);
Expand Down
63 changes: 36 additions & 27 deletions source/common/sge_ijs_comm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
*
* Portions of this software are Copyright (c) 2011 Univa Corporation.
*
* Portions of this software are Copyright (c) 2023-2025 HPC-Gridware GmbH
* Portions of this software are Copyright (c) 2024-2025 HPC-Gridware GmbH
*
************************************************************************/
/*___INFO__MARK_END__*/
Expand Down Expand Up @@ -65,7 +65,6 @@
#include "sge_ijs_comm.h"

extern sig_atomic_t received_signal;

/*
* TODO: Cleanup / Headers
* This is just slightly modified copy of the gdi commlib error handling,
Expand Down Expand Up @@ -309,6 +308,7 @@ int my_log_list_flush_list(cl_raw_list_t* list_p) {
*
* INPUTS
* dstring *err_msg - Gets the error reason in case of error.
* cl_log_func_t - a commlib logging function which will print CL_LOG messages
Comment on lines 310 to +311
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function documentation has been updated to mention the new cl_log_func_t parameter, but the description only says "a commlib logging function which will print CL_LOG messages" without explaining when/why a caller should provide this parameter, what happens if nullptr is passed, or what the expected signature and behavior of the logging function should be. This incomplete documentation makes it difficult for API consumers to understand how to use this parameter correctly.

Suggested change
* dstring *err_msg - Gets the error reason in case of error.
* cl_log_func_t - a commlib logging function which will print CL_LOG messages
* dstring *err_msg - Gets the error reason in case of error.
* cl_log_func_t log_func - Optional commlib logging callback. If non-null,
* this function is invoked for each CL_LOG message
* produced by the communication library. If nullptr
* is passed, the library uses its internal default
* logging behavior. The callback must match the
* cl_log_func_t typedef (see cl_commlib.h), be
* reentrant/thread-safe, and return quickly to avoid
* blocking communication threads.

Copilot uses AI. Check for mistakes.
*
* RESULT
* int - COMM_RETVAL_OK:
Expand All @@ -324,19 +324,20 @@ int my_log_list_flush_list(cl_raw_list_t* list_p) {
* SEE ALSO
* communication/comm_cleanup_lib()
*******************************************************************************/
int comm_init_lib(dstring *err_msg)
int comm_init_lib(dstring *err_msg, cl_log_func_t commlib_log_func)
{
int ret, ret_val = COMM_RETVAL_OK;

DENTER(TOP_LAYER);

/*
* To enable commlib logging to a file (see my_log_list_flush_list()
* for the file path), exchange this line with the one below.
* Caution: On some architectures, logging causes problems!
*/
/*ret = cl_com_setup_commlib(CL_RW_THREAD, CL_LOG_DEBUG, my_log_list_flush_list);*/
ret = cl_com_setup_commlib(CL_RW_THREAD, CL_LOG_OFF, nullptr);
// When we pass a logging function to see commlib logging
// (in sge_shepherd, when compiled with EXTENSIVE_TRACING)
// we want to see INFO logging.
cl_log_type debug_level = CL_LOG_OFF;
if (commlib_log_func != nullptr) {
debug_level = CL_LOG_INFO;
}
ret = cl_com_setup_commlib(CL_RW_THREAD, debug_level, commlib_log_func);
if (ret != CL_RETVAL_OK) {
sge_dstring_sprintf(err_msg, cl_get_error_text(ret));
DPRINTF("cl_com_setup_commlib() failed: %s (%d)\n", sge_dstring_get_string(err_msg), ret);
Expand Down Expand Up @@ -768,9 +769,9 @@ int comm_ignore_timeouts(bool b_ignore, dstring *err_msg)

cl_com_ignore_timeouts(b_ignore);
if (ret != CL_RETVAL_OK) {
sge_dstring_sprintf(err_msg, cl_get_error_text(ret));
DPRINTF("cl_com_ignore_timeouts() failed: %s (%d)\n", sge_dstring_get_string(err_msg), ret);
ret_val = COMM_CANT_SET_IGNORE_TIMEOUTS;
sge_dstring_sprintf(err_msg, cl_get_error_text(ret));
DPRINTF("cl_com_ignore_timeouts() failed: %s (%d)\n", sge_dstring_get_string(err_msg), ret);
ret_val = COMM_CANT_SET_IGNORE_TIMEOUTS;
}
DRETURN(ret_val);
}
Expand Down Expand Up @@ -1223,40 +1224,44 @@ unsigned long comm_write_message(COMM_HANDLE *handle,
*******************************************************************************/
int comm_flush_write_messages(COMM_HANDLE *handle, dstring *err_msg)
{
unsigned long elems = 0;
int ret = 0, retries = 0;
int retries = 0;

elems = cl_com_messages_in_send_queue(handle);
unsigned long elems = cl_com_messages_in_send_queue(handle);
while (elems > 0) {
/*
* Don't set the cl_commlib_trigger()-call to be blocking and
* get rid of the usleep() - it's much slower!
* The last cl_commlib_trigger()-call will take 1 s.
*/
ret = cl_commlib_trigger(handle, 0);
int trigger_ret = cl_commlib_trigger(handle, 0);
/*
* Bail out if trigger fails with an error that indicates that we
* won't be able to send the messages in the near future.
*/
if (ret != CL_RETVAL_OK &&
ret != CL_RETVAL_SELECT_TIMEOUT &&
ret != CL_RETVAL_SELECT_INTERRUPT) {
sge_dstring_sprintf(err_msg, cl_get_error_text(ret));
retries = ret;
break;
if (trigger_ret != CL_RETVAL_OK &&
trigger_ret != CL_RETVAL_SELECT_TIMEOUT &&
trigger_ret != CL_RETVAL_SELECT_INTERRUPT &&
trigger_ret != CL_RETVAL_THREADS_ENABLED) {
sge_dstring_sprintf(err_msg, cl_get_error_text(trigger_ret));
sge_dstring_sprintf_append(err_msg, " - after %d retries", retries);
return trigger_ret;
}

elems = cl_com_messages_in_send_queue(handle);
/*
* We just tried to send the messages and it wasn't possible to send
* all messages - give the network some time to recover.
* @todo CS-1739 cl_commlib_trigger() does *not* wait until all messages are sent!
* @todo Shall we have a maximum number of retries? A timeout?
* But if the qrsh client is suspended, we probably need to wait until it is unsuspended again.
*/
/* TODO (NEW): make this work correctly by calling check_client_alive */
if (elems > 0) {
usleep(10000);
retries--;
retries++;
Comment on lines +1227 to +1261
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The increment operation has been changed from 'retries--' to 'retries++', and the return value from 'return retries' to 'return -retries'. While this appears intentional to change the semantics (negative for success indicating retry count, positive for errors), the variable name 'retries' is now misleading since it's being used as a counter that gets incremented. Consider renaming this variable to 'retry_count' to better reflect its purpose.

Copilot uses AI. Check for mistakes.
}
}
return retries;
return -retries;
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return value semantics of comm_flush_write_messages() have been changed from returning an error code directly to returning a negative retry count on success and a positive error code on failure. This is inconsistent with typical return value conventions where 0 indicates success and non-zero indicates an error. The caller in line 1247 now needs to check if the return value is positive for errors, which is confusing and error-prone. Consider using a more conventional approach with an out parameter for retry count, or documenting this unusual return convention more explicitly.

Suggested change
return -retries;
return 0;

Copilot uses AI. Check for mistakes.
}

/****** sge_ijs_comm/comm_recv_message() **************************************
Expand Down Expand Up @@ -1323,7 +1328,7 @@ int comm_recv_message(COMM_HANDLE *handle, bool b_synchron,
nullptr, /* unresolved_hostname, */
nullptr, /* component_name, */
0, /* component_id, */
false,
b_synchron,
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parameter b_synchron is now being passed to cl_commlib_receive_message() where previously 'false' was hardcoded. While this change makes the function respect the synchronous/asynchronous mode as intended, this is a significant behavioral change that could affect existing callers. The change appears intentional and correct based on the parameter name, but verify that all call sites expect this new behavior, especially when b_synchron is true.

Copilot uses AI. Check for mistakes.
0,
&message,
&sender);
Expand Down Expand Up @@ -1356,7 +1361,7 @@ int comm_recv_message(COMM_HANDLE *handle, bool b_synchron,
}
}

if(sender != nullptr) {
if (sender != nullptr) {
cl_com_free_endpoint(&sender);
}

Expand Down Expand Up @@ -1407,6 +1412,10 @@ int comm_recv_message(COMM_HANDLE *handle, bool b_synchron,
}
}
} else {
// @todo CS-1739 do we need the cl_commlib_trigger, when we are using multi-threaded commlib?
// if b_synchron is 0, then it does essentially nothing
// otherwise it waits, until a message is available - the same which is done by cl_commlib_receive_message()
// itself
Comment on lines +1415 to +1418
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A TODO comment references CS-1739, but the related code is disabled with #if 0. The comment questions whether cl_commlib_trigger() is needed when using multi-threaded commlib. If this is a legitimate concern that needs investigation, consider creating a tracking issue or resolving it before merging. Leaving disabled code with unresolved questions reduces code maintainability and makes it unclear whether this code path should be removed or fixed.

Suggested change
// @todo CS-1739 do we need the cl_commlib_trigger, when we are using multi-threaded commlib?
// if b_synchron is 0, then it does essentially nothing
// otherwise it waits, until a message is available - the same which is done by cl_commlib_receive_message()
// itself
/* Intentionally call cl_commlib_trigger() here to drive commlib progress.
* For asynchronous operation (b_synchron == 0) this is effectively a no-op,
* while for synchronous operation it will block until a message is available,
* matching the behavior expected by cl_commlib_receive_message() users.
* This call is kept for consistency across single-threaded and multi-threaded
* commlib implementations.
*/

Copilot uses AI. Check for mistakes.
cl_commlib_trigger(handle, b_synchron);
}
DRETURN(ret_val);
Expand Down
4 changes: 2 additions & 2 deletions source/common/sge_ijs_comm.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
*
* Portions of this code are Copyright 2011 Univa Inc.
*
* Portions of this software are Copyright (c) 2023-2024 HPC-Gridware GmbH
* Portions of this software are Copyright (c) 2024-2025 HPC-Gridware GmbH
*
************************************************************************/
/*___INFO__MARK_END__*/
Expand Down Expand Up @@ -96,7 +96,7 @@ typedef struct recv_message_s {
} recv_message_t;


int comm_init_lib(dstring *err_msg);
int comm_init_lib(dstring *err_msg, cl_log_func_t commlib_log_func = nullptr);
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function signature for comm_init_lib() has been updated to include an optional cl_log_func_t parameter with a default value of nullptr. However, this changes the API without a clear deprecation path. Additionally, mixing default parameters with C-style function declarations may cause compatibility issues, particularly if this function is called from C code or across compilation boundaries.

Suggested change
int comm_init_lib(dstring *err_msg, cl_log_func_t commlib_log_func = nullptr);
int comm_init_lib(dstring *err_msg, cl_log_func_t commlib_log_func);

Copilot uses AI. Check for mistakes.
int comm_cleanup_lib(dstring *err_msg);

int comm_open_connection(bool b_server,
Expand Down
5 changes: 3 additions & 2 deletions source/daemons/qmaster/sge_thread_listener.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
*
* All Rights Reserved.
*
* Portions of this software are Copyright (c) 2023-2025 HPC-Gridware GmbH
* Portions of this software are Copyright (c) 2024-2025 HPC-Gridware GmbH
*
************************************************************************/
/*___INFO__MARK_END__*/
Expand Down Expand Up @@ -113,7 +113,8 @@ sge_listener_terminate() {
#if 0
// signal (broadcast) the commlib handle app_condition variable
// this will make it leave waiting for new messages in sge_qmaster_process_message->sge_gdi_get_any_request()
// @todo that's the theory, but it doesn't work
// @todo CS-982 that's the theory, but it doesn't work
// => should work now with the fix of CS-1735
// as it is now shutting down the listener threads takes some time (> 1 second)
cl_thread_trigger_thread_condition(handle->app_condition, 1);
#endif
Expand Down
87 changes: 74 additions & 13 deletions source/daemons/shepherd/sge_shepherd_ijs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,41 @@ extern int received_signal; /* defined in shepherd.c */
* SEE ALSO
*******************************************************************************/
#ifdef EXTENSIVE_TRACING
// This function will output commlib logging to the shepherd trace file
// when EXTENSIVE_TRACING is enabled.
static int
shepherd_log_list_flush_list(cl_raw_list_t* list_p) {

if (list_p == nullptr) {
return CL_RETVAL_LOG_NO_LOGLIST;
}

int ret_val;
if ((ret_val = cl_raw_list_lock(list_p)) != CL_RETVAL_OK) {
return ret_val;
}

// Log all entries of the log list via shepherd_trace().
cl_log_list_elem_t *elem = nullptr;
while ((elem = cl_log_list_get_first_elem(list_p)) != nullptr) {
if (elem->log_parameter == nullptr) {
shepherd_trace("COMMLIB|%s|%s|%s",
cl_thread_convert_state_id(elem->log_thread_state),
cl_log_list_convert_type_id(elem->log_type),
elem->log_message);
} else {
shepherd_trace("COMMLIB|%s|%s|%s %s",
cl_thread_convert_state_id(elem->log_thread_state),
cl_log_list_convert_type_id(elem->log_type),
elem->log_message, elem->log_parameter);
Comment on lines +146 to +155
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The shepherd_log_list_flush_list() function doesn't check if the log_message field is nullptr before using it in shepherd_trace(). If elem->log_message is nullptr, this will result in undefined behavior when passed to the format string. Add a nullptr check for elem->log_message before using it.

Suggested change
if (elem->log_parameter == nullptr) {
shepherd_trace("COMMLIB|%s|%s|%s",
cl_thread_convert_state_id(elem->log_thread_state),
cl_log_list_convert_type_id(elem->log_type),
elem->log_message);
} else {
shepherd_trace("COMMLIB|%s|%s|%s %s",
cl_thread_convert_state_id(elem->log_thread_state),
cl_log_list_convert_type_id(elem->log_type),
elem->log_message, elem->log_parameter);
const char *log_message = (elem->log_message != nullptr) ? elem->log_message : "";
if (elem->log_parameter == nullptr) {
shepherd_trace("COMMLIB|%s|%s|%s",
cl_thread_convert_state_id(elem->log_thread_state),
cl_log_list_convert_type_id(elem->log_type),
log_message);
} else {
shepherd_trace("COMMLIB|%s|%s|%s %s",
cl_thread_convert_state_id(elem->log_thread_state),
cl_log_list_convert_type_id(elem->log_type),
log_message, elem->log_parameter);

Copilot uses AI. Check for mistakes.
}

cl_log_list_del_log(list_p);
}

return cl_raw_list_unlock(list_p);
}

static int trace_buf(const char *buffer, int length, const char *format, ...)
{
int ret;
Expand Down Expand Up @@ -467,7 +502,17 @@ static void* pty_to_commlib(void *t_conf)
shepherd_trace("pty_to_commlib: send_buf() failed -> exiting");
do_exit = 1;
}
comm_flush_write_messages(g_comm_handle, &err_msg);
int flush_ret = comm_flush_write_messages(g_comm_handle, &err_msg);
if (flush_ret > 0) {
// comm_flush_write_messages reported an error - log to trace file
shepherd_trace("pty_to_commlib: comm_flush_write_messages() returned error %d: %s",
flush_ret, sge_dstring_get_string(&err_msg));

} else if (flush_ret < 0) {
#ifdef EXTENSIVE_TRACING
shepherd_trace("pty_to_commlib: comm_flush_write_messages() did %d retries", -flush_ret);
#endif
}
Comment on lines +510 to +515
Copy link

Copilot AI Jan 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The call to comm_flush_write_messages() now returns a value that is checked, but the handling logic appears incorrect. The function now returns negative values for the number of retries on success and positive values for error codes on failure. The condition 'if (flush_ret > 0)' correctly checks for errors, but the else-if condition checking 'flush_ret < 0' treats negative values as indicating retries. However, if flush_ret is 0 (zero retries), the code doesn't handle it, which means successful flushes with zero retries won't be logged even under EXTENSIVE_TRACING. Consider adding an explicit check for the success case (flush_ret == 0 or flush_ret <= 0) to ensure all cases are handled correctly.

Suggested change
} else if (flush_ret < 0) {
#ifdef EXTENSIVE_TRACING
shepherd_trace("pty_to_commlib: comm_flush_write_messages() did %d retries", -flush_ret);
#endif
}
}
#ifdef EXTENSIVE_TRACING
else {
if (flush_ret < 0) {
shepherd_trace("pty_to_commlib: comm_flush_write_messages() did %d retries", -flush_ret);
} else { /* flush_ret == 0: success without retries */
shepherd_trace("pty_to_commlib: comm_flush_write_messages() completed without retries");
}
}
#endif

Copilot uses AI. Check for mistakes.
}
}

Expand Down Expand Up @@ -537,32 +582,39 @@ static void* commlib_to_pty(void *t_conf)
shepherd_trace("commlib_to_pty: no valid handle for stdin available. Exiting!");
}

// Set timeout for synchronous receiving of messages.
cl_com_set_synchron_receive_timeout(g_comm_handle, 1);

while (do_exit == 0) {
/* wait blocking for a message from commlib */
// We wait synchronously (blocking) for a message from commlib, timeout is 1s.
recv_mess.cl_message = nullptr;
recv_mess.data = nullptr;
sge_dstring_free(&err_msg);
sge_dstring_sprintf(&err_msg, "");

#ifdef EXTENSIVE_TRACING
shepherd_trace("commlib_to_pty: calling comm_recv_message() synchronously, timeout %d",
g_comm_handle->synchron_receive_timeout);
#endif

ret = comm_recv_message(g_comm_handle, true, &recv_mess, &err_msg);

/*
* Check if the thread was cancelled. Exit thread if it was.
* It shouldn't be neccessary to do the check here, as the cancel state
* of the thread is 1, i.e. the thread may be cancelled at any time,
#ifdef EXTENSIVE_TRACING
shepherd_trace("commlib_to_pty: comm_recv_message() returned %d, err_msg: %s",
ret, sge_dstring_get_string(&err_msg));
#endif

/*
* Check if the thread was canceled. Exit the thread if it was.
* It shouldn't be necessary to do the check here, as the cancel state
* of the thread is 1, i.e., the thread may be canceled at any time,
* but this doesn't work on some architectures (Darwin, older Solaris).
*/
thread_testcancel(t_conf);
if (g_raised_event > 0) {
do_exit = 1;
continue;
}
#ifdef EXTENSIVE_TRACING
shepherd_trace("commlib_to_pty: comm_recv_message() returned %d, err_msg: %s",
ret, sge_dstring_get_string(&err_msg));
#endif

if (ret != COMM_RETVAL_OK) {
/* handle error cases */
Expand Down Expand Up @@ -788,7 +840,12 @@ parent_loop(int job_pid, const char *childname, int timeout, ckpt_info_t *p_ckpt
*/
sge_dstring_sprintf(err_msg, "");


#ifdef EXTENSIVE_TRACING
ret = comm_init_lib(err_msg, shepherd_log_list_flush_list);
#else
ret = comm_init_lib(err_msg);
#endif
if (ret != COMM_RETVAL_OK) {
shepherd_trace("parent: init comm lib failed: %d", ret);
return 1;
Expand Down Expand Up @@ -935,8 +992,13 @@ parent_loop(int job_pid, const char *childname, int timeout, ckpt_info_t *p_ckpt
/*
* This will wake up all threads waiting for a message
*/
#ifdef EXTENSIVE_TRACING
shepherd_trace("parent: calling cl_thread_trigger_thread_condition()");
#endif
cl_thread_trigger_thread_condition(g_comm_handle->app_condition, 1);

#ifdef EXTENSIVE_TRACING
shepherd_trace("parent: after cl_thread_trigger_thread_condition()");
#endif

close(g_p_ijs_fds->pty_master);

Expand All @@ -952,7 +1014,6 @@ parent_loop(int job_pid, const char *childname, int timeout, ckpt_info_t *p_ckpt
cl_thread_cleanup(thread_pty_to_commlib);
cl_thread_cleanup(thread_commlib_to_pty);


#if 0
{
struct timeb ts;
Expand Down Expand Up @@ -1046,7 +1107,7 @@ int close_parent_loop(int exit_status)

sge_free(&g_hostname);
sge_dstring_free(&err_msg);
shepherd_trace("parent: leaving closinge_parent_loop()");
shepherd_trace("parent: leaving close_parent_loop()");
return 0;
}

Loading