diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h index 5d4bcc20..537ed1c0 100644 --- a/include/CL/cl_ext.h +++ b/include/CL/cl_ext.h @@ -2384,6 +2384,66 @@ clCreateBufferWithPropertiesINTEL_fn)( void * host_ptr, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; +/********************************** +* cl_intel_global_variable_access * +***********************************/ + +#define CL_COMMAND_READ_GLOBAL_VARIABLE_INTEL 0x418E +#define CL_COMMAND_WRITE_GLOBAL_VARIABLE_INTEL 0x418F + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadGlobalVariableINTEL( + cl_command_queue command_queue, + cl_program program, + const char* name, + cl_bool blocking_read, + size_t size, + size_t offset, + void* ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + + +typedef CL_API_ENTRY cl_int (CL_API_CALL * +clEnqueueReadGlobalVariableINTEL_fn)( + cl_command_queue command_queue, + cl_program program, + const char* name, + cl_bool blocking_read, + size_t size, + size_t offset, + const void* ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteGlobalVariableINTEL( + cl_command_queue command_queue, + cl_program program, + const char* name, + cl_bool blocking_write, + size_t size, + size_t offset, + const void* ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL * +clEnqueueWriteGlobalVariableINTEL_fn)( + cl_command_queue command_queue, + cl_program program, + const char* name, + cl_bool blocking_read, + size_t size, + size_t offset, + void* ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + /****************************************** * cl_intel_mem_channel_property extension * *******************************************/ diff --git a/include/acl.h b/include/acl.h index a351c938..27e8d63b 100644 --- a/include/acl.h +++ b/include/acl.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -478,6 +479,12 @@ typedef class acl_device_program_info_t *acl_device_program_info; */ #define ACL_MEM_CAPABILITY_P2P (1 << 3) +typedef struct acl_device_global_mem_def_t { + std::string name; + unsigned int address; + unsigned int size; +} acl_device_global_mem_def_t; + // Part of acl_device_def_t where members are populated from the information // in the autodiscovery string. This will get updated every time the device // is programmed with a new device binary as the new binary would contain a @@ -496,6 +503,12 @@ typedef struct acl_device_def_autodiscovery_t { std::array global_mem_defs; std::vector acl_hostpipe_info; + + // device global definition + unsigned int num_device_global; + // std::vector device_global_mem_defs; + std::unordered_map + device_global_mem_defs; } acl_device_def_autodiscovery_t; typedef struct acl_device_def_t { diff --git a/include/acl_kernel.h b/include/acl_kernel.h index 1a6f88ef..d1a82e85 100644 --- a/include/acl_kernel.h +++ b/include/acl_kernel.h @@ -54,6 +54,10 @@ void acl_receive_kernel_update(int activation_id, cl_int status); // safe to submit a kernel with subbuffers to the device_op_queue int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration); +cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel, + cl_uint arg_index, + void *arg_value); + #if defined(__cplusplus) } /* extern "C" */ #endif diff --git a/include/acl_mem.h b/include/acl_mem.h index fe7674a8..fe46fca9 100644 --- a/include/acl_mem.h +++ b/include/acl_mem.h @@ -89,6 +89,14 @@ cl_bool acl_is_sub_or_parent_buffer(cl_mem mem); void CL_CALLBACK acl_free_allocation_after_event_completion( cl_event event, cl_int event_command_exec_status, void *callback_data); +void CL_CALLBACK acl_dev_global_cleanup(cl_event event, + cl_int event_command_exec_status, + void *callback_data); + +cl_int acl_extract_device_global_address(cl_kernel kernel, + const char *dev_global_name, + unsigned int *ret_addr); + #ifdef __GNUC__ #pragma GCC visibility pop #endif diff --git a/src/acl_auto_configure.cpp b/src/acl_auto_configure.cpp index 320264ce..5be8fb1d 100644 --- a/src/acl_auto_configure.cpp +++ b/src/acl_auto_configure.cpp @@ -96,6 +96,7 @@ static bool read_uint_counters(const std::string &str, UNREFERENCED_PARAMETER(e); return false; } + return true; } @@ -493,6 +494,61 @@ bool acl_load_device_def_from_str(const std::string &config_str, counters); } + // Read device global information + unsigned int num_device_global = 0; + if (result && counters.back() > 0) { + result = + read_uint_counters(config_str, curr_pos, num_device_global, counters); + devdef.num_device_global = num_device_global; + + // read total number of fields in device global + int total_fields_device_global = 0; + if (result) { + result = read_int_counters(config_str, curr_pos, + total_fields_device_global, counters); + } + + for (auto i = 0U; result && (i < num_device_global); + i++) { // device_global_memories + counters.emplace_back(total_fields_device_global); + + // read device global name + std::string device_global_name; + if (result && counters.back() > 0) { + result = read_string_counters(config_str, curr_pos, device_global_name, + counters); + } + + // read device global address + unsigned int dev_global_addr = 0; // Default + if (result && counters.back() > 0) { + result = + read_uint_counters(config_str, curr_pos, dev_global_addr, counters); + } + // read device global address size + unsigned int dev_global_size = 0; // Default + if (result && counters.back() > 0) { + result = + read_uint_counters(config_str, curr_pos, dev_global_size, counters); + } + + acl_device_global_mem_def_t dev_global_def = { + device_global_name, dev_global_addr, dev_global_size}; + devdef.device_global_mem_defs[device_global_name] = dev_global_def; + + // forward compatibility: bypassing remaining fields at the end of global + // memory + while (result && counters.size() > 0 && + counters.back() > 0) { // total_fields_device_global>0 + std::string tmp; + result = + result && read_string_counters(config_str, curr_pos, tmp, counters); + check_section_counters(counters); + } + counters.pop_back(); // removing total_fields_device_global + } // device_global_memories + } + // forward compatibility: bypassing remaining fields at the end of device // description section while (result && counters.size() > 0 && diff --git a/src/acl_icd_dispatch.cpp b/src/acl_icd_dispatch.cpp index b1700bc2..7bda7f86 100644 --- a/src/acl_icd_dispatch.cpp +++ b/src/acl_icd_dispatch.cpp @@ -50,6 +50,8 @@ clGetExtensionFunctionAddressIntelFPGA(const char *func_name) { ADDFUNCTIONLOOKUP(clResetKernelsIntelFPGA); ADDFUNCTIONLOOKUP(clSetBoardLibraryIntelFPGA); ADDFUNCTIONLOOKUP(clCreateBufferWithPropertiesINTEL); + ADDFUNCTIONLOOKUP(clEnqueueReadGlobalVariableINTEL); + ADDFUNCTIONLOOKUP(clEnqueueWriteGlobalVariableINTEL); // USM APIs are not currently supported on 32bit devices #ifndef __arm__ diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp index e93ffc42..8da91119 100644 --- a/src/acl_kernel.cpp +++ b/src/acl_kernel.cpp @@ -837,6 +837,57 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer( return clSetKernelArgSVMPointerIntelFPGA(kernel, arg_index, arg_value); } +/** + * Set any provided void pointer as kernel arguments + * + * It is assumed that the provided pointer is a valid device address, + * or device global address that kernel can use to point to right address space. + * + * It is the same as `clSetKernelArgMemPointerINTEL` except the validity checks + * are removed. This is because the user provided pointer may not always be usm + * pointer, therefore will not belong to the context (as they are checked in + * clSetKernelArgMemPointerINTEL) + * + * @param kernel the kernel that accept the pointer arg + * @param arg_index which kernel argument accept the value + * @param arg_value the pointer to desired address space + * @return status code, CL_SUCCESS if all operations are successful. + */ +cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel, + cl_uint arg_index, + void *arg_value) { + acl_lock(); + if (!acl_kernel_is_valid(kernel)) { + UNLOCK_RETURN(CL_INVALID_KERNEL); + } + + cl_context context = kernel->program->context; + + if (arg_index >= kernel->accel_def->iface.args.size()) { + UNLOCK_ERR_RET(CL_INVALID_ARG_INDEX, context, + "Argument index is too large"); + } + + // Determine where to write the value. + size_t start_idx = 0; + size_t iface_arg_size = 0; + l_get_arg_offset_and_size(kernel, arg_index, &start_idx, &iface_arg_size); + safe_memcpy(&(kernel->arg_value[start_idx]), &arg_value, iface_arg_size, + kernel->arg_value_size - start_idx, iface_arg_size); + kernel->arg_is_svm[arg_index] = CL_FALSE; + kernel->arg_is_ptr[arg_index] = CL_TRUE; + + kernel->arg_defined[arg_index] = 1; + + // double vector size if size < arg_index + while (kernel->ptr_arg_vector.size() <= arg_index) { + kernel->ptr_arg_vector.resize(kernel->ptr_arg_vector.size() * 2); + } + kernel->ptr_arg_vector[arg_index] = arg_value; + + UNLOCK_RETURN(CL_SUCCESS); +} + ACL_EXPORT CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgMemPointerINTEL( cl_kernel kernel, cl_uint arg_index, const void *arg_value) { diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index 59481e9b..f75a8648 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -405,6 +406,341 @@ int acl_bind_buffer_to_device(cl_device_id device, cl_mem mem) { return 1; } +/** + * Read bytes of data from device global + * + * This function extract the device global address with given name, create + * kernel from the given program, create temporary device usm pointer to hold + * data that will be copied from device global. Launch the copy kernel with + * correct pointers set as src and destination, then enqueue a memory copy + * operation that depend on copy kernel to copy from temporary device usm + * pointer into user provided host pointer. Then register a callback to release + * the necessary events, kernels, memories. + * + * @param command_queue the queue system this copy kernel will belong + * @param program contains copy kernel + * @param name name of device global, used to look up for device global address + * in autodiscovery string + * @param blocking_read whether the operation is blocking or not + * @param size number of bytes to read / write + * @param offset offset from the extracted address of device global + * @param ptr pointer that will hold the data copied from device global + * @param num_events_in_wait_list number of event that copy kernel depend on + * @param event_wait_list events that copy kernel depend on + * @param event the info about the execution of copy kernel will be stored in + * the event + * @return status code, CL_SUCCESS if all operations are successful. + */ +ACL_EXPORT +CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( + cl_command_queue command_queue, cl_program program, const char *name, + cl_bool blocking_read, size_t size, size_t offset, void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) { + cl_int status; + + // Get copy kernel from the program + cl_kernel kernel = + clCreateKernelIntelFPGA(program, "copy_device_global", &status); + if (status != CL_SUCCESS) { + return status; + } + assert(kernel); + + // Look up device global address with use provided name + unsigned int device_global_addr_num = 0; + status = + acl_extract_device_global_address(kernel, name, &device_global_addr_num); + if (status != CL_SUCCESS) + return status; + uintptr_t dev_global_address = device_global_addr_num; + + // Calculate the offset from the device global address, offset is in byte unit + void *dev_global_ptr = + (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits + // TODO: add checks for whether the copy will be out of bound for device + // global + + // Set the device global pointer as the kernel destination args + status = set_kernel_arg_mem_pointer_without_checks(kernel, 0, dev_global_ptr); + // status = clSetKernelArgMemPointerINTEL(kernel, 1, dev_global_ptr); + if (status != CL_SUCCESS) { + return status; + } + + // Copy device global memory to temporary device usm pointer first to minimize + // area cost + void *tmp_dev_ptr = clDeviceMemAllocINTEL( + command_queue->context, command_queue->device, NULL, size, 1, &status); + if (status != CL_SUCCESS) { + return status; + } + if (!tmp_dev_ptr) { + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + + status = clSetKernelArgMemPointerINTEL(kernel, 1, tmp_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + + // Propagate size information to kernel + status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size)); + if (status != CL_SUCCESS) { + return status; + } + + // Enqueue copy kernel execution + cl_event tmp_event = 0; + status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list, + event_wait_list, &tmp_event); + if (status != CL_SUCCESS) { + return status; + } + + // Copy from the temporary device memory into user provided host pointer + // give this event back to user, not the copy kernel one + status = clEnqueueMemcpyINTEL(command_queue, blocking_read, ptr, tmp_dev_ptr, + size, 1, &tmp_event, event); + if (status != CL_SUCCESS) { + return status; + } + + if (blocking_read) { + // If blocking, first wait for event to finish, then clean up the resources. + status = clReleaseEvent(tmp_event); + if (status != CL_SUCCESS) { + return status; + } + status = clMemFreeINTEL(command_queue->context, tmp_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + status = clReleaseKernel(kernel); + if (status != CL_SUCCESS) { + return status; + } + } else { + // Clean up resources after event finishes + void **callback_data = (void **)acl_malloc(sizeof(void *) * 3); + if (!callback_data) { + return CL_OUT_OF_HOST_MEMORY; + } + callback_data[0] = (void *)(tmp_dev_ptr); + callback_data[1] = (void *)(kernel); + callback_data[2] = (void *)(tmp_event); + clSetEventCallback(*event, CL_COMPLETE, acl_dev_global_cleanup, + (void *)callback_data); + } + + return CL_SUCCESS; +} + +/** + * Write bytes of data from user provided host pointer into device global + * + * This function extract the device global address with given name, create + * kernel from the given program, create temporary device usm pointer to hold + * user provided data through usm copy operation. Launch the copy kernel with + * correct pointers set as src and destination. Then register a callback to + * release the necessary events, kernels, memories. + * + * @param command_queue the queue system this copy kernel will belong + * @param program contains copy kernel + * @param name name of device global, used to look up for device global address + * in autodiscovery string + * @param blocking_write whether the operation is blocking or not + * @param size number of bytes to read / write + * @param offset offset from the extracted address of device global + * @param ptr pointer that will hold the data copied from device global + * @param num_events_in_wait_list number of event that copy kernel depend on + * @param event_wait_list events that copy kernel depend on + * @param event the info about the execution of copy kernel will be stored in + * the event + * @return status code, CL_SUCCESS if all operations are successful. + */ +ACL_EXPORT +CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( + cl_command_queue command_queue, cl_program program, const char *name, + cl_bool blocking_write, size_t size, size_t offset, const void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) { + cl_int status; + + // Get copy kernel from the program + cl_kernel kernel = + clCreateKernelIntelFPGA(program, "copy_device_global", &status); + if (status != CL_SUCCESS) { + return status; + } + assert(kernel); + + // Allocate a temporary device usm pointer to hold user data + // This is done to minimize kernel area, as device to device memory + // interconnect occupies least amount of memory + void *src_dev_ptr = clDeviceMemAllocINTEL( + command_queue->context, command_queue->device, NULL, size, 1, &status); + if (status != CL_SUCCESS) { + return status; + } + + // Copy from user provide host pointer to temporary device usm pointer + status = clEnqueueMemcpyINTEL(command_queue, CL_TRUE, src_dev_ptr, ptr, size, + 0, NULL, NULL); + if (status != CL_SUCCESS) { + return status; + } + + // Set the source of copy (in kernel arg) as the temporary device usm pointer + status = clSetKernelArgMemPointerINTEL(kernel, 0, src_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + + // Look up device global address with use provided name + unsigned int device_global_addr_num = 0; + status = + acl_extract_device_global_address(kernel, name, &device_global_addr_num); + if (status != CL_SUCCESS) + return status; + uintptr_t dev_global_address = device_global_addr_num; + + // TODO: remove the following checks, as it only works for unit test + assert(((unsigned int)dev_global_address) == 0x1024); + // Calculate the offset from the device global address, offset is in byte unit + void *dev_global_ptr = + (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits + // TODO: add checks for whether the copy will be out of bound for device + // global + + // Set the device global pointer as the kernel destination args + status = set_kernel_arg_mem_pointer_without_checks(kernel, 1, dev_global_ptr); + if (status != CL_SUCCESS) { + return status; + } + + // Propagate size information to kernel + status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size)); + if (status != CL_SUCCESS) { + return status; + } + + // Enqueue kernel execution + status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list, + event_wait_list, event); + if (status != CL_SUCCESS) { + return status; + } + + acl_lock(); + // If nothing's blocking, then complete right away + acl_idle_update(command_queue->context); + acl_unlock(); + + if (blocking_write) { + // If blocking, first wait for event to finish, then clean up the resources. + status = clWaitForEvents(1, event); + if (status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) { + return status; + } + status = clMemFreeINTEL(command_queue->context, src_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + status = clReleaseKernel(kernel); + if (status != CL_SUCCESS) { + return status; + } + } else { + // Otherwise, clean up resources after event finishes + void **callback_data = (void **)acl_malloc(sizeof(void *) * 3); + if (!callback_data) { + return CL_OUT_OF_HOST_MEMORY; + } + callback_data[0] = (void *)(src_dev_ptr); + callback_data[1] = (void *)(kernel); + callback_data[2] = NULL; + clSetEventCallback(*event, CL_COMPLETE, acl_dev_global_cleanup, + (void *)callback_data); + } + + return CL_SUCCESS; +} + +/** + * Look up device global address in autodiscovery def using user provided name + * + * @param kernel the copy kernel + * @param dev_global_name name of device global to query + * @param ret_addr hold the resulting address of device global + * @return status code, CL_SUCCESS if all operations are successful. + */ +cl_int acl_extract_device_global_address(cl_kernel kernel, + const char *dev_global_name, + unsigned int *ret_addr) { + acl_lock(); + // In full system flow, the autodiscovery string is available through kernel's + // dev_bin, however it is not in unit testing framework Thus, try to find + // device global definition first on kernel's device bin, if not found then + // try on acl_present_board_def(), which is set in unit test setup steps. + std::unordered_map dev_global_map = + kernel->dev_bin->get_devdef().autodiscovery_def.device_global_mem_defs; + std::unordered_map::const_iterator + dev_global = dev_global_map.find(dev_global_name); + if (dev_global != dev_global_map.end()) { + *ret_addr = dev_global->second.address; + UNLOCK_RETURN(CL_SUCCESS); + } + // Device global name not found in kernel dev_bin, try to find in the sysdef + // setup by unit tests + dev_global_map = acl_present_board_def() + ->device[0] + .autodiscovery_def.device_global_mem_defs; + dev_global = dev_global_map.find(dev_global_name); + if (dev_global != dev_global_map.end()) { + *ret_addr = dev_global->second.address; + UNLOCK_RETURN(CL_SUCCESS); + } + UNLOCK_RETURN(CL_INVALID_VALUE); +} + +/** + * Callback function that executes after copy event finishes + * + * Main responsible for free heap memories, device memories + * + * @param event the event that this callback is registered on + * @param event_command_exec_status the status of of event (queue, complete etc) + * @param callback_data hold the resources that need to be released + * @return nothing + */ +void CL_CALLBACK acl_dev_global_cleanup(cl_event event, + cl_int event_command_exec_status, + void *callback_data) { + void **callback_ptrs = + (void **)callback_data; // callback_ptrs[0] is usm device pointer + // callback_ptrs[1] kernel to be released + // callback_ptrs[2] temporary event to be released + event_command_exec_status = + event_command_exec_status; // Avoiding Windows warning. + event = event; + acl_lock(); + if (callback_ptrs[0]) { + // Free intermediate device usm pointers + clMemFreeINTEL(event->context, callback_ptrs[0]); + } + if (callback_ptrs[1]) { + // Release kernel from hep memory as its associated device memory + clReleaseKernel(((cl_kernel)callback_ptrs[1])); + } + if (callback_ptrs[2]) { + // Release event from hep memory + clReleaseEvent(((cl_event)callback_ptrs[2])); + } + acl_free(callback_data); + acl_unlock(); +} + ACL_EXPORT CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL( cl_context context, const cl_mem_properties_intel *properties, diff --git a/test/acl_auto_configure_test.cpp b/test/acl_auto_configure_test.cpp index 65f950bd..ca71e53a 100644 --- a/test/acl_auto_configure_test.cpp +++ b/test/acl_auto_configure_test.cpp @@ -36,6 +36,7 @@ TEST(auto_configure, simple) { #define VERSIONIDSTRINGIFY(x) #x #define VERSIONIDTOSTR(x) VERSIONIDSTRINGIFY(x) #define DEVICE_FIELDS " 23" +#define DEVICE_FIELDS_DEV_GLOBAL " 30" #define DEVICE_FIELDS_OLD " 18" #define BOARDNAME "de4_gen2x4_swdimm" #define BOARDNAME2 "pcie385_a7" @@ -96,24 +97,31 @@ TEST(auto_configure, simple) { #define IS_SYCL_COMPILE " 1" #define IS_NOT_SYCL_COMPILE " 0" +// device global +#define NUM_DEV_GLOBAL " 2" +#define NUM_DEV_GLOBAL_FIELD " 3" // containing dev_globa_name, address, size +#define DEV_GLOBAL_1 \ + " kernel15_dev_global 4096 2048" // in format of dev_globa_name, address, size +#define DEV_GLOBAL_2 " kernel15_dev_global2 2048 1024" + int parsed; std::string err_str; - ACL_LOCKED( - parsed = acl_load_device_def_from_str( - std::string( - VERSIONIDTOSTR(ACL_AUTO_CONFIGURE_VERSIONID) - DEVICE_FIELDS RANDOM_HASH - " " BOARDNAME IS_NOT_BIG_ENDIAN MEM HOSTPIPE KERNEL_ARG_INFO_NONE - " 1 82 foo" KERNEL_CRA KERNEL_FAST_LAUNCH_DEPTH KERNEL_PERF_MON - KERNEL_WORKGROUP_VARIANT KERNEL_WORKITEM_VARIANT - KERNEL_NUM_VECTOR_LANES1 KERNEL_PROFILE_SCANCHAIN_LENGTH - ARGS_LOCAL_GLOBAL_LONG_PROF KERNEL_PRINTF_FORMATSTRINGS - LD_1024 KERNEL_REQD_WORK_GROUP_SIZE_NONE - KERNEL_MAX_WORK_GROUP_SIZE_NONE - KERNEL_MAX_GLOBAL_WORK_DIM_NONE - KERNEL_USES_GLOBAL_WORK_OFFSET_ENABLED - IS_SYCL_COMPILE), - m_device_def.autodiscovery_def, err_str)); + std::string autodiscovery = std::string( + VERSIONIDTOSTR(ACL_AUTO_CONFIGURE_VERSIONID) + DEVICE_FIELDS_DEV_GLOBAL RANDOM_HASH + " " BOARDNAME IS_NOT_BIG_ENDIAN MEM HOSTPIPE KERNEL_ARG_INFO_NONE + NUM_DEV_GLOBAL NUM_DEV_GLOBAL_FIELD DEV_GLOBAL_1 DEV_GLOBAL_2 + " 1 82 foo" KERNEL_CRA KERNEL_FAST_LAUNCH_DEPTH KERNEL_PERF_MON + KERNEL_WORKGROUP_VARIANT KERNEL_WORKITEM_VARIANT + KERNEL_NUM_VECTOR_LANES1 KERNEL_PROFILE_SCANCHAIN_LENGTH + ARGS_LOCAL_GLOBAL_LONG_PROF KERNEL_PRINTF_FORMATSTRINGS + LD_1024 KERNEL_REQD_WORK_GROUP_SIZE_NONE + KERNEL_MAX_WORK_GROUP_SIZE_NONE + KERNEL_MAX_GLOBAL_WORK_DIM_NONE + KERNEL_USES_GLOBAL_WORK_OFFSET_ENABLED + IS_SYCL_COMPILE); + ACL_LOCKED(parsed = acl_load_device_def_from_str( + autodiscovery, m_device_def.autodiscovery_def, err_str)); CHECK_EQUAL(1, parsed); CHECK_EQUAL(1, m_device_def.autodiscovery_def.num_global_mem_systems); @@ -261,6 +269,26 @@ TEST(auto_configure, simple) { (int)m_device_def.autodiscovery_def.accel[0].max_work_group_size); CHECK_EQUAL(1, (int)m_device_def.autodiscovery_def.accel[0].is_sycl_compile); + CHECK_EQUAL(2, (int)m_device_def.autodiscovery_def.num_device_global); + CHECK(m_device_def.autodiscovery_def.device_global_mem_defs.find( + "kernel15_dev_global") != + m_device_def.autodiscovery_def.device_global_mem_defs.end()); + CHECK(m_device_def.autodiscovery_def.device_global_mem_defs.find( + "kernel15_dev_global2") != + m_device_def.autodiscovery_def.device_global_mem_defs.end()); + CHECK_EQUAL(4096, m_device_def.autodiscovery_def + .device_global_mem_defs["kernel15_dev_global"] + .address); + CHECK_EQUAL(2048, m_device_def.autodiscovery_def + .device_global_mem_defs["kernel15_dev_global"] + .size); + CHECK_EQUAL(2048, m_device_def.autodiscovery_def + .device_global_mem_defs["kernel15_dev_global2"] + .address); + CHECK_EQUAL(1024, m_device_def.autodiscovery_def + .device_global_mem_defs["kernel15_dev_global2"] + .size); + // Check a second parsing. // It should allocate a new string for the name. ACL_LOCKED( @@ -460,8 +488,8 @@ TEST(auto_configure, many_ok_forward_compatibility) { ACL_AUTO_CONFIGURE_VERSIONID) " 28 " "sample40byterandomhash000000000000000000 " "a10gx 0 1 15 DDR 2 1 6 0 2147483648 100 " - "100 100 100 200 200 200 200 0 0 0 0 400 " - "400 400 400 400 47 " + "100 100 100 200 200 200 200 0 0 0 0 2 " + "1 1 1 400 47 " "40 external_sort_stage_0 0 128 1 0 0 1 0 " "1 0 1 10 0 0 4 1 0 0 500 500 500 500 0 0 " "0 0 1 1 1 3 1 1 1 3 1 800 800 800 800 800 " @@ -1175,7 +1203,7 @@ TEST(auto_configure, hostpipe) { "200 " "2 9 host_to_dev 1 0 32 32768 300 300 300 " "300 dev_to_host 0 1 32 32768 300 300 300 " - "300 400 400 400 400 400 0 " + "300 400 1 3 400 400 0 " "1 29 foo 0 128 1 0 0 1 0 1 0 0 0 0 0 0 1 " "1 1 3 1 1 1 3 1 800 800 800 800 800 900 " "900" diff --git a/test/acl_globals_test.cpp b/test/acl_globals_test.cpp index 7c1a361e..fd03a14d 100644 --- a/test/acl_globals_test.cpp +++ b/test/acl_globals_test.cpp @@ -118,6 +118,13 @@ static acl_kernel_interface_t acltest_kernels[] = { {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 8}, {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 16}, {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1024}, + }}, + {// interface + "copy_device_global", + { + {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(void *), 0, 0, 1}, + {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(void *), 0, 0, 1}, + {ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0}, }}}; template @@ -191,6 +198,24 @@ static std::vector acltest_complex_system_device0_accel = { {}, {32768, 0, 0}, 1}, + { + 14, // id + ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]), // mem + acltest_kernels[14], // iface + acltest_laspace_info, // local_aspaces + {0, 0, 0}, // compile_work_group_size + 0, // is_workgroup_invariant + 0, // is_workitem_invariant + 1, // num_vector_lanes + 0, // profiling_words_to_readback + 32768, // max_work_group_size + 3, // max_global_work_dim + {}, // printf_format_info + {32768, 0, 0}, // max_work_group_size_arr + 1, // uses_global_work_offset + 0, // fast_launch_depth + 1, // is_sycl_compile + }, {1, ACL_RANGE_FROM_ARRAY(acltest_devicelocal[1]), acltest_kernels[1], @@ -565,22 +590,31 @@ static acl_system_def_t acltest_complex_system = { 1, 0, /* alloc capabilities */ 0, /* min_host_mem_alignment */ - {"fpga0", - "sample40byterandomhash000000000000000000", - 0, - acltest_complex_system_device0_accel, /* accel */ - {}, /* hal_info */ - 1, // number of global memory systems - { - /* global mem info array */ - { - /* global mem info for memory 0 */ - /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), - /* acl_system_global_mem_type_t */ ACL_GLOBAL_MEM_DEVICE_PRIVATE, - /* num_global_bank */ 2, - /* burst_interleaved */ 1, - }, - }}}, + { + "fpga0", + "sample40byterandomhash000000000000000000", + 0, + acltest_complex_system_device0_accel, /* accel */ + {}, /* hal_info */ + 1, // number of global memory systems + { + /* global mem info array */ + { + /* global mem info for memory 0 */ + /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), + /* acl_system_global_mem_type_t */ + ACL_GLOBAL_MEM_DEVICE_PRIVATE, + /* num_global_bank */ 2, + /* burst_interleaved */ 1, + }, + }, + {}, // acl_hostpipe_info + 1, // num_device_global + { + // device_global_mem_defs map + {"dev_global_name", {"dev_global_name", 0x1024, 2048}}, + }, + }}, {nullptr, 1, 1, @@ -590,22 +624,31 @@ static acl_system_def_t acltest_complex_system = { 0, 0, /* alloc capabilities */ 0, /* min_host_mem_alignment */ - {"fpga1", - "sample40byterandomhash000000000000000001", - 0, - acltest_complex_system_device1_accel, /* accel */ - {}, /* hal_info */ - 1, // number of global memory systems - { - /* global mem info array */ - { - /* global mem info for memory 0 */ - /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), - /* acl_system_global_mem_type_t */ ACL_GLOBAL_MEM_DEVICE_PRIVATE, - /* num_global_bank */ 2, - /* burst_interleaved */ 1, - }, - }}}, + { + "fpga1", + "sample40byterandomhash000000000000000001", + 0, + acltest_complex_system_device1_accel, /* accel */ + {}, /* hal_info */ + 1, // number of global memory systems + { + /* global mem info array */ + { + /* global mem info for memory 0 */ + /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), + /* acl_system_global_mem_type_t */ + ACL_GLOBAL_MEM_DEVICE_PRIVATE, + /* num_global_bank */ 2, + /* burst_interleaved */ 1, + }, + }, + {}, // acl_hostpipe_info + 1, // num_device_global + { + // device_global_mem_defs map + {"dev_global_name", {"dev_global_name", 0x1024, 2048}}, + }, + }}, {nullptr, 2, 1, diff --git a/test/acl_program_test.cpp b/test/acl_program_test.cpp index 97539672..1bdd8866 100644 --- a/test/acl_program_test.cpp +++ b/test/acl_program_test.cpp @@ -606,25 +606,25 @@ MT_TEST(acl_program, program_info) { // built stat. to success even before calling clbuildprogram CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, 0)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); // This won't happen if program is built with binary since we set the program // built stat. to success even before calling clbuildprogram CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &size_ret)); - CHECK_EQUAL(321, size_ret); + CHECK_EQUAL(340, size_ret); CHECK_EQUAL(CL_SUCCESS, clBuildProgram(program, 0, 0, "", 0, 0)); // after building the program CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, 0)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, &size_ret)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); CHECK_EQUAL(sizeof(size_t), size_ret); CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, 0, @@ -633,15 +633,18 @@ MT_TEST(acl_program, program_info) { CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &size_ret)); - CHECK_EQUAL(321, size_ret); + CHECK_EQUAL(340, size_ret); + // CHECK_EQUAL(321, size_ret); names[size_ret] = 100; // making sure extra bytes of memory are not affected. CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 2000 * sizeof(char), names, &size_ret)); - CHECK_EQUAL(321, size_ret); // only one kernel named: "foo" + CHECK_EQUAL(340, size_ret); // only one kernel named: "foo" CHECK_EQUAL(100, names[size_ret]); - CHECK_EQUAL(0, strcmp("kernel0_copy_vecin_vecout;" + CHECK_EQUAL(0, strcmp("copy_device_global;" // TODO: eventually we would want + // to hide it from users + "kernel0_copy_vecin_vecout;" "kernel11_task_double;" "kernel12_task_double;" "kernel13_multi_vec_lane;" diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp index e1f89ecc..6e06c5c1 100644 --- a/test/acl_usm_test.cpp +++ b/test/acl_usm_test.cpp @@ -41,25 +41,30 @@ MT_TEST_GROUP(acl_usm) { void setup() { if (threadNum() == 0) { acl_test_setup_generic_system(); - } + this->load(); + m_program = this->load_program(); + this->build(m_program); + } syncThreads(); - - this->load(); } void teardown() { - unload_context(); - if (m_context) { - clReleaseContext(m_context); - m_context = 0; - } syncThreads(); if (threadNum() == 0) { + this->unload_program(m_program); + unload_context(); + + if (m_context) { + clReleaseContext(m_context); + m_context = 0; + } + acl_test_teardown_generic_system(); } + syncThreads(); acl_test_run_standard_teardown_checks(); } @@ -108,6 +113,38 @@ MT_TEST_GROUP(acl_usm) { ACL_LOCKED(CHECK(acl_context_is_valid(m_context))); } + cl_program load_program() { + cl_int status; + cl_program program; + status = CL_INVALID_VALUE; + const unsigned char *bin = (const unsigned char *)"0"; + size_t bin_length = 1; + cl_int bin_status; + program = clCreateProgramWithBinary(m_context, 1, &m_device[0], &bin_length, + &bin, &bin_status, &status); + CHECK_EQUAL(CL_SUCCESS, status); + CHECK(program); + ACL_LOCKED(CHECK(acl_program_is_valid(program))); + + return program; + } + void unload_program(cl_program program) { + int program_is_valid; + ACL_LOCKED(program_is_valid = acl_program_is_valid(program)); + if (program_is_valid) { + cl_int status; + while (program->num_kernels) { + clReleaseKernel(program->kernel_list->kernel); + } + CHECK_EQUAL(1, acl_ref_count(program)); + status = clReleaseProgram(program); + CHECK_EQUAL(CL_SUCCESS, status); + } + } + void build(cl_program program) { + CHECK_EQUAL(CL_SUCCESS, clBuildProgram(program, 0, 0, "", 0, 0)); + } + void load_backing_store_context(void) { unload_context(); cl_context_properties props[] = { @@ -165,6 +202,7 @@ MT_TEST_GROUP(acl_usm) { cl_uint m_num_devices; cl_device_id m_device[ACL_MAX_DEVICE]; cl_command_queue m_cq; + cl_program m_program; public: bool yeah; @@ -1206,4 +1244,68 @@ MT_TEST(acl_usm, memfill_usm) { ACL_LOCKED(acl_print_debug_msg("end memfill_usm\n")); } +MT_TEST(acl_usm, read_device_global) { + ACL_LOCKED(acl_print_debug_msg("begin read_device_global\n")); + char str[100]; + const size_t strsize = sizeof(str) / sizeof(char); // includes NUL (!) + char resultbuf[strsize]; + cl_int status; + + cl_event write_event = 0; + cl_event read_event = 0; + + // Prepare host memory + syncThreads(); + // Host pointer example + void *src_ptr = malloc(strsize); + CHECK(src_ptr != NULL); + + syncThreads(); + // Write to device global + status = clEnqueueWriteGlobalVariableINTEL(m_cq, m_program, "dev_global_name", + CL_FALSE, strsize, 0, src_ptr, 0, + NULL, &write_event); + CHECK_EQUAL(CL_SUCCESS, status); + + // Read from device global, with dependence on write event + status = clEnqueueReadGlobalVariableINTEL(m_cq, m_program, "dev_global_name", + CL_FALSE, strsize, 0, src_ptr, 1, + &write_event, &read_event); + CHECK_EQUAL(CL_SUCCESS, status); + + // Manually set "write device global" event done + int write_activation_id = write_event->cmd.info.ndrange_kernel + .invocation_wrapper->image->activation_id; + acltest_call_kernel_update_callback(write_activation_id, CL_RUNNING); + acltest_call_kernel_update_callback(write_activation_id, CL_COMPLETE); + + // Nudge the scheduler to take above finish into account + acl_lock(); + // If nothing's blocking, then complete right away + acl_idle_update(m_cq->context); + acl_unlock(); + + // The event returned from read device global is not the copy kernel launch + // event Therefore need to first get the event that it depend on, then + // manually set it to complete + auto last_event = read_event->depend_on.end(); + last_event--; + cl_event read_copy_kernel_event = *last_event; + int read_activation_id = read_copy_kernel_event->cmd.info.ndrange_kernel + .invocation_wrapper->image->activation_id; + acltest_call_kernel_update_callback(read_activation_id, CL_RUNNING); + acltest_call_kernel_update_callback(read_activation_id, CL_COMPLETE); + // Now the usm copy operation will execute + + // Block on all event completion + CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq)); + CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(write_event)); + CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(read_event)); + + // Host pointer example + free(src_ptr); + + ACL_LOCKED(acl_print_debug_msg("end read_write_buf\n")); +} + #endif // __arm__