From 22dd47748e12e43eb9a2b62628c5cc6bc0eff84e Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 21 Jan 2022 13:59:12 -0500 Subject: [PATCH 01/12] Device global implementation --------------------------------- Currently the device global pointer is being mocked by passing an actual device allocation into the function. However this should be removed after the pointer is available through autodiscovery string. --- include/CL/cl_ext.h | 60 +++++++++++++++++++ include/acl_kernel.h | 4 ++ src/acl_icd_dispatch.cpp | 2 + src/acl_kernel.cpp | 35 +++++++++++ src/acl_mem.cpp | 114 ++++++++++++++++++++++++++++++++++++ test/acl_globals_test.cpp | 22 +++++++ test/acl_program_test.cpp | 14 +++-- test/acl_usm_test.cpp | 120 +++++++++++++++++++++++++++++++++++--- 8 files changed, 357 insertions(+), 14 deletions(-) diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h index 5d4bcc20..537ed1c0 100644 --- a/include/CL/cl_ext.h +++ b/include/CL/cl_ext.h @@ -2384,6 +2384,66 @@ clCreateBufferWithPropertiesINTEL_fn)( void * host_ptr, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; +/********************************** +* cl_intel_global_variable_access * +***********************************/ + +#define CL_COMMAND_READ_GLOBAL_VARIABLE_INTEL 0x418E +#define CL_COMMAND_WRITE_GLOBAL_VARIABLE_INTEL 0x418F + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadGlobalVariableINTEL( + cl_command_queue command_queue, + cl_program program, + const char* name, + cl_bool blocking_read, + size_t size, + size_t offset, + void* ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + + +typedef CL_API_ENTRY cl_int (CL_API_CALL * +clEnqueueReadGlobalVariableINTEL_fn)( + cl_command_queue command_queue, + cl_program program, + const char* name, + cl_bool blocking_read, + size_t size, + size_t offset, + const void* ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteGlobalVariableINTEL( + cl_command_queue command_queue, + cl_program program, + const char* name, + cl_bool blocking_write, + size_t size, + size_t offset, + const void* ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL * +clEnqueueWriteGlobalVariableINTEL_fn)( + cl_command_queue command_queue, + cl_program program, + const char* name, + cl_bool blocking_read, + size_t size, + size_t offset, + void* ptr, + cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, + cl_event* event) CL_API_SUFFIX__VERSION_1_0; + /****************************************** * cl_intel_mem_channel_property extension * *******************************************/ diff --git a/include/acl_kernel.h b/include/acl_kernel.h index 1a6f88ef..d1a82e85 100644 --- a/include/acl_kernel.h +++ b/include/acl_kernel.h @@ -54,6 +54,10 @@ void acl_receive_kernel_update(int activation_id, cl_int status); // safe to submit a kernel with subbuffers to the device_op_queue int acl_kernel_has_unmapped_subbuffers(acl_mem_migrate_t *mem_migration); +cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel, + cl_uint arg_index, + void *arg_value); + #if defined(__cplusplus) } /* extern "C" */ #endif diff --git a/src/acl_icd_dispatch.cpp b/src/acl_icd_dispatch.cpp index b1700bc2..7bda7f86 100644 --- a/src/acl_icd_dispatch.cpp +++ b/src/acl_icd_dispatch.cpp @@ -50,6 +50,8 @@ clGetExtensionFunctionAddressIntelFPGA(const char *func_name) { ADDFUNCTIONLOOKUP(clResetKernelsIntelFPGA); ADDFUNCTIONLOOKUP(clSetBoardLibraryIntelFPGA); ADDFUNCTIONLOOKUP(clCreateBufferWithPropertiesINTEL); + ADDFUNCTIONLOOKUP(clEnqueueReadGlobalVariableINTEL); + ADDFUNCTIONLOOKUP(clEnqueueWriteGlobalVariableINTEL); // USM APIs are not currently supported on 32bit devices #ifndef __arm__ diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp index e93ffc42..b816fa6c 100644 --- a/src/acl_kernel.cpp +++ b/src/acl_kernel.cpp @@ -837,6 +837,41 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer( return clSetKernelArgSVMPointerIntelFPGA(kernel, arg_index, arg_value); } +cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel, + cl_uint arg_index, + void *arg_value) { + acl_lock(); + if (!acl_kernel_is_valid(kernel)) { + UNLOCK_RETURN(CL_INVALID_KERNEL); + } + + cl_context context = kernel->program->context; + + if (arg_index >= kernel->accel_def->iface.args.size()) { + UNLOCK_ERR_RET(CL_INVALID_ARG_INDEX, context, + "Argument index is too large"); + } + + // Determine where to write the value. + size_t start_idx = 0; + size_t iface_arg_size = 0; + l_get_arg_offset_and_size(kernel, arg_index, &start_idx, &iface_arg_size); + safe_memcpy(&(kernel->arg_value[start_idx]), &arg_value, iface_arg_size, + kernel->arg_value_size - start_idx, iface_arg_size); + kernel->arg_is_svm[arg_index] = CL_FALSE; + kernel->arg_is_ptr[arg_index] = CL_TRUE; + + kernel->arg_defined[arg_index] = 1; + + // double vector size if size < arg_index + while (kernel->ptr_arg_vector.size() <= arg_index) { + kernel->ptr_arg_vector.resize(kernel->ptr_arg_vector.size() * 2); + } + kernel->ptr_arg_vector[arg_index] = arg_value; + + UNLOCK_RETURN(CL_SUCCESS); +} + ACL_EXPORT CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgMemPointerINTEL( cl_kernel kernel, cl_uint arg_index, const void *arg_value) { diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index 59481e9b..dd8f7070 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -405,6 +406,119 @@ int acl_bind_buffer_to_device(cl_device_id device, cl_mem mem) { return 1; } +ACL_EXPORT +// CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL() { +CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( + cl_command_queue command_queue, cl_program program, const char *name, + cl_bool blocking_write, size_t size, size_t offset, void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) { + + // TODO: get dev_global_ptr from autodiscovery instead later + // return 0; + return clEnqueueWriteGlobalVariableINTEL( + command_queue, program, name, blocking_write, size, offset, ptr, + num_events_in_wait_list, event_wait_list, event); +} + +ACL_EXPORT +CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( + cl_command_queue command_queue, cl_program program, const char *name, + cl_bool blocking_write, size_t size, size_t offset, const void *ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) { + cl_int status; + + cl_kernel kernel = clCreateKernelIntelFPGA(program, name, &status); + if (status != CL_SUCCESS) { + return status; + } + + // do we support ptr being a buffer instead of usm pointer? it seems to be the + // case on spec (only usm host pointer) Given kernel arg must be a deivce usm + // pointer: When ptr is a host/shared usm pointer, this function is expected + // to copy it to device first? yes (currently discussing whether kernel arg + // accept host usm instead) + void *src_dev_ptr = clDeviceMemAllocINTEL( + command_queue->context, command_queue->device, NULL, size, 1, &status); + if (status != CL_SUCCESS) { + return status; + } + + // This copy operation have to be blocking + // cl_event to_dev_event = 0; + status = clEnqueueMemcpyINTEL(command_queue, CL_TRUE, src_dev_ptr, ptr, size, + 0, NULL, NULL); + if (status != CL_SUCCESS) { + return status; + } + // if (to_dev_event->execution_status != CL_COMPLETE) { + // return CL_INVALID_OPERATION; + // } + + status = clSetKernelArgMemPointerINTEL(kernel, 0, src_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + // should kernel header contain offset or not? no + // offset is always byte offset? yes + // Assuming below returns same thing as `clDeviceMemAllocINTEL`, where exactly + // is dev global ptr being read from? What format is the read dev global, is + // it a pointer just like the return value of `clDeviceMemAllocINTEL` or is it + // something else? (its an unsigned value indicating address) + // TODO: When passing dev global pointer directly to + // clSetKernelArgMemPointerINTEL, make sure REMOVE_VALID_CHECKS is defined. + // Otherwise this ptr is not existing in context -> cause checks to fail + // TODO: get dev_global_address from autodiscovery instead later + // dev_addr_t dev_global_address = + // kernel->dev_bin->get_devdef().autodiscovery_def.? + uintptr_t dev_global_address = 0x4000000; + void *dev_global_ptr2 = + (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits + status = + set_kernel_arg_mem_pointer_without_checks(kernel, 1, dev_global_ptr2); + // status = clSetKernelArgMemPointerINTEL(kernel, 1, dev_global_ptr2); + if (status != CL_SUCCESS) { + return status; + } + status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size)); + if (status != CL_SUCCESS) { + return status; + } + + status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list, + event_wait_list, event); + if (status != CL_SUCCESS) { + return status; + } + + acl_lock(); + // If nothing's blocking, then complete right away + acl_idle_update(command_queue->context); + acl_unlock(); + + if (blocking_write) { + status = clWaitForEvents(1, event); + } + + if (blocking_write && + status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) { + return status; + } + + // Free allocated device memory + status = clMemFreeINTEL(command_queue->context, src_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + status = clReleaseKernel(kernel); + if (status != CL_SUCCESS) { + return status; + } + + return CL_SUCCESS; +} + ACL_EXPORT CL_API_ENTRY cl_mem clCreateBufferWithPropertiesINTEL( cl_context context, const cl_mem_properties_intel *properties, diff --git a/test/acl_globals_test.cpp b/test/acl_globals_test.cpp index 7c1a361e..d47b9e91 100644 --- a/test/acl_globals_test.cpp +++ b/test/acl_globals_test.cpp @@ -118,6 +118,14 @@ static acl_kernel_interface_t acltest_kernels[] = { {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 8}, {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 16}, {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1024}, + }}, + {// interface + "kernel15_dev_global", + { + {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1}, + {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1}, + {ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0}, + // {ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0}, }}}; template @@ -191,6 +199,20 @@ static std::vector acltest_complex_system_device0_accel = { {}, {32768, 0, 0}, 1}, + {14, + ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]), + acltest_kernels[14], + acltest_laspace_info, + {0, 0, 0}, + 0, + 0, + 1, + 0, + 32768, + 3, + {}, + {32768, 0, 0}, + 1}, {1, ACL_RANGE_FROM_ARRAY(acltest_devicelocal[1]), acltest_kernels[1], diff --git a/test/acl_program_test.cpp b/test/acl_program_test.cpp index 97539672..064ccadc 100644 --- a/test/acl_program_test.cpp +++ b/test/acl_program_test.cpp @@ -606,25 +606,25 @@ MT_TEST(acl_program, program_info) { // built stat. to success even before calling clbuildprogram CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, 0)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); // This won't happen if program is built with binary since we set the program // built stat. to success even before calling clbuildprogram CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &size_ret)); - CHECK_EQUAL(321, size_ret); + CHECK_EQUAL(341, size_ret); CHECK_EQUAL(CL_SUCCESS, clBuildProgram(program, 0, 0, "", 0, 0)); // after building the program CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, 0)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, sizeof(size_t), &num_kernels, &size_ret)); - CHECK_EQUAL(14, num_kernels); + CHECK_EQUAL(15, num_kernels); CHECK_EQUAL(sizeof(size_t), size_ret); CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_NUM_KERNELS, 0, @@ -633,19 +633,21 @@ MT_TEST(acl_program, program_info) { CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &size_ret)); - CHECK_EQUAL(321, size_ret); + CHECK_EQUAL(341, size_ret); + // CHECK_EQUAL(321, size_ret); names[size_ret] = 100; // making sure extra bytes of memory are not affected. CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 2000 * sizeof(char), names, &size_ret)); - CHECK_EQUAL(321, size_ret); // only one kernel named: "foo" + CHECK_EQUAL(341, size_ret); // only one kernel named: "foo" CHECK_EQUAL(100, names[size_ret]); CHECK_EQUAL(0, strcmp("kernel0_copy_vecin_vecout;" "kernel11_task_double;" "kernel12_task_double;" "kernel13_multi_vec_lane;" "kernel14_svm_arg_alignment;" + "kernel15_dev_global;" "kernel1_vecadd_vecin_vecin_vecout;" "kernel2_vecscale_vecin_scalar_vecout;" "kernel3_locals;" diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp index e1f89ecc..fb553426 100644 --- a/test/acl_usm_test.cpp +++ b/test/acl_usm_test.cpp @@ -30,6 +30,16 @@ #include #include +// A default empty program binary +#define EXAMPLE_BINARY \ + const unsigned char *l_bin[m_num_devices]; \ + size_t l_bin_lengths[m_num_devices]; \ + cl_int l_bin_status[m_num_devices]; \ + for (cl_uint i = 0; i < m_num_devices; i++) { \ + l_bin[i] = (const unsigned char *)"0"; \ + l_bin_lengths[i] = 1; \ + } + static void CL_CALLBACK notify_me_print(const char *errinfo, const void *private_info, size_t cb, void *user_data); @@ -41,25 +51,30 @@ MT_TEST_GROUP(acl_usm) { void setup() { if (threadNum() == 0) { acl_test_setup_generic_system(); - } + this->load(); + m_program = this->load_program(); + this->build(m_program); + } syncThreads(); - - this->load(); } void teardown() { - unload_context(); - if (m_context) { - clReleaseContext(m_context); - m_context = 0; - } syncThreads(); if (threadNum() == 0) { + this->unload_program(m_program); + unload_context(); + + if (m_context) { + clReleaseContext(m_context); + m_context = 0; + } + acl_test_teardown_generic_system(); } + syncThreads(); acl_test_run_standard_teardown_checks(); } @@ -108,6 +123,46 @@ MT_TEST_GROUP(acl_usm) { ACL_LOCKED(CHECK(acl_context_is_valid(m_context))); } + cl_program load_program() { + cl_int status; + cl_program program; + EXAMPLE_BINARY; + + status = CL_INVALID_VALUE; + size_t example_bin_len = 0; + const unsigned char *example_bin = + acl_test_get_example_binary(&example_bin_len); + program = clCreateProgramWithBinary(m_context, m_num_devices, &m_device[0], + l_bin_lengths, l_bin, &l_bin_status[0], + &status); + { + cl_uint i; + for (i = 0; i < m_num_devices; i++) { + CHECK_EQUAL(CL_SUCCESS, l_bin_status[i]); + } + } + CHECK_EQUAL(CL_SUCCESS, status); + CHECK(program); + ACL_LOCKED(CHECK(acl_program_is_valid(program))); + return program; + } + void unload_program(cl_program program) { + int program_is_valid; + ACL_LOCKED(program_is_valid = acl_program_is_valid(program)); + if (program_is_valid) { + cl_int status; + while (program->num_kernels) { + clReleaseKernel(program->kernel_list->kernel); + } + CHECK_EQUAL(1, acl_ref_count(program)); + status = clReleaseProgram(program); + CHECK_EQUAL(CL_SUCCESS, status); + } + } + void build(cl_program program) { + CHECK_EQUAL(CL_SUCCESS, clBuildProgram(program, 0, 0, "", 0, 0)); + } + void load_backing_store_context(void) { unload_context(); cl_context_properties props[] = { @@ -165,6 +220,7 @@ MT_TEST_GROUP(acl_usm) { cl_uint m_num_devices; cl_device_id m_device[ACL_MAX_DEVICE]; cl_command_queue m_cq; + cl_program m_program; public: bool yeah; @@ -1206,4 +1262,52 @@ MT_TEST(acl_usm, memfill_usm) { ACL_LOCKED(acl_print_debug_msg("end memfill_usm\n")); } +MT_TEST(acl_usm, read_device_global) { + ACL_LOCKED(acl_print_debug_msg("begin read_device_global\n")); + char str[100]; + const size_t strsize = sizeof(str) / sizeof(char); // includes NUL (!) + char resultbuf[strsize]; + cl_int status; + + // Don't translate device addresses in the test HAL because we already + // will be passing in "device" memory that are actually in the host + // address space of the test executable. Ugh. + acltest_hal_emulate_device_mem = false; + + cl_event write_event = 0; + cl_event copy_event = 0; + cl_event read_event = 0; + + // Prepare host memory + syncThreads(); + // Host pointer example + void *src_ptr = malloc(strsize); + CHECK(src_ptr != NULL); + // Host USM example + // void* src_ptr = clHostMemAllocINTEL(m_context, NULL, strsize, 1, &status); + // CHECK_EQUAL(CL_SUCCESS, status); + // CHECK(src_ptr != NULL); + + syncThreads(); + + // Function of interest + status = clEnqueueReadGlobalVariableINTEL( + m_cq, m_program, "kernel15_dev_global", CL_FALSE, strsize, 0, src_ptr, 0, + NULL, ©_event); + CHECK_EQUAL(CL_SUCCESS, status); + int activation_id = copy_event->cmd.info.ndrange_kernel.invocation_wrapper + ->image->activation_id; + acltest_call_kernel_update_callback(activation_id, CL_RUNNING); + acltest_call_kernel_update_callback(activation_id, CL_COMPLETE); + CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(copy_event)); + CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq)); + + // Host pointer example + free(src_ptr); + // Host USM example + // CHECK_EQUAL(CL_SUCCESS, clMemFreeINTEL(m_context, src_ptr)); + + ACL_LOCKED(acl_print_debug_msg("end read_write_buf\n")); +} + #endif // __arm__ From 28d4a4186a820e17fa247fca44efbf8d6cd72680 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 28 Jan 2022 11:59:43 -0500 Subject: [PATCH 02/12] change kernel arg size to sizeof(void*) --- test/acl_globals_test.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/acl_globals_test.cpp b/test/acl_globals_test.cpp index d47b9e91..c7e3938d 100644 --- a/test/acl_globals_test.cpp +++ b/test/acl_globals_test.cpp @@ -122,10 +122,9 @@ static acl_kernel_interface_t acltest_kernels[] = { {// interface "kernel15_dev_global", { - {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1}, - {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1}, + {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(void *), 0, 0, 1}, + {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(void *), 0, 0, 1}, {ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0}, - // {ACL_ARG_ADDR_NONE, ACL_ARG_BY_VALUE, sizeof(size_t), 0, 0}, }}}; template From a8e4e243279551e0a7ab4ef2c16647403f37aec1 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 28 Jan 2022 12:10:30 -0500 Subject: [PATCH 03/12] temporarily remove release kernel --- src/acl_mem.cpp | 8 ++++---- test/acl_usm_test.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index dd8f7070..b79c1b3d 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -511,10 +511,10 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( if (status != CL_SUCCESS) { return status; } - status = clReleaseKernel(kernel); - if (status != CL_SUCCESS) { - return status; - } + // status = clReleaseKernel(kernel); + // if (status != CL_SUCCESS) { + // return status; + // } return CL_SUCCESS; } diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp index fb553426..800d6368 100644 --- a/test/acl_usm_test.cpp +++ b/test/acl_usm_test.cpp @@ -1299,8 +1299,8 @@ MT_TEST(acl_usm, read_device_global) { ->image->activation_id; acltest_call_kernel_update_callback(activation_id, CL_RUNNING); acltest_call_kernel_update_callback(activation_id, CL_COMPLETE); - CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(copy_event)); CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq)); + CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(copy_event)); // Host pointer example free(src_ptr); From 7da91ae3a5f9d4fa2720f75a63e0fd6cfb8b31e8 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 28 Jan 2022 13:27:50 -0500 Subject: [PATCH 04/12] avoid declaraing array with variable length as windows MSVC does not allow it --- test/acl_usm_test.cpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp index 800d6368..79110b3d 100644 --- a/test/acl_usm_test.cpp +++ b/test/acl_usm_test.cpp @@ -30,16 +30,6 @@ #include #include -// A default empty program binary -#define EXAMPLE_BINARY \ - const unsigned char *l_bin[m_num_devices]; \ - size_t l_bin_lengths[m_num_devices]; \ - cl_int l_bin_status[m_num_devices]; \ - for (cl_uint i = 0; i < m_num_devices; i++) { \ - l_bin[i] = (const unsigned char *)"0"; \ - l_bin_lengths[i] = 1; \ - } - static void CL_CALLBACK notify_me_print(const char *errinfo, const void *private_info, size_t cb, void *user_data); @@ -126,7 +116,17 @@ MT_TEST_GROUP(acl_usm) { cl_program load_program() { cl_int status; cl_program program; - EXAMPLE_BINARY; + // const unsigned char *l_bin[m_num_devices]; + // size_t l_bin_lengths[m_num_devices]; + // cl_int l_bin_status[m_num_devices]; + const unsigned char **l_bin = new const unsigned char *[m_num_devices]; + size_t* l_bin_lengths = new size_t[m_num_devices]; + cl_int* l_bin_status = new cl_int[m_num_devices]; + + for (cl_uint i = 0; i < m_num_devices; i++) { + l_bin[i] = (const unsigned char *)"0"; + l_bin_lengths[i] = 1; + } status = CL_INVALID_VALUE; size_t example_bin_len = 0; @@ -144,6 +144,10 @@ MT_TEST_GROUP(acl_usm) { CHECK_EQUAL(CL_SUCCESS, status); CHECK(program); ACL_LOCKED(CHECK(acl_program_is_valid(program))); + + delete l_bin_status; + delete l_bin_lengths; + delete l_bin; return program; } void unload_program(cl_program program) { From b215103f7dc836a9db6a1801bc952fb3bcad83b6 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 28 Jan 2022 15:44:57 -0500 Subject: [PATCH 05/12] fix klocwork issues --- test/acl_usm_test.cpp | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp index 79110b3d..e0924ae4 100644 --- a/test/acl_usm_test.cpp +++ b/test/acl_usm_test.cpp @@ -116,38 +116,16 @@ MT_TEST_GROUP(acl_usm) { cl_program load_program() { cl_int status; cl_program program; - // const unsigned char *l_bin[m_num_devices]; - // size_t l_bin_lengths[m_num_devices]; - // cl_int l_bin_status[m_num_devices]; - const unsigned char **l_bin = new const unsigned char *[m_num_devices]; - size_t* l_bin_lengths = new size_t[m_num_devices]; - cl_int* l_bin_status = new cl_int[m_num_devices]; - - for (cl_uint i = 0; i < m_num_devices; i++) { - l_bin[i] = (const unsigned char *)"0"; - l_bin_lengths[i] = 1; - } - status = CL_INVALID_VALUE; - size_t example_bin_len = 0; - const unsigned char *example_bin = - acl_test_get_example_binary(&example_bin_len); - program = clCreateProgramWithBinary(m_context, m_num_devices, &m_device[0], - l_bin_lengths, l_bin, &l_bin_status[0], - &status); - { - cl_uint i; - for (i = 0; i < m_num_devices; i++) { - CHECK_EQUAL(CL_SUCCESS, l_bin_status[i]); - } - } + const unsigned char *bin = (const unsigned char *)"0"; + size_t bin_length = 1; + cl_int bin_status; + program = clCreateProgramWithBinary(m_context, 1, &m_device[0], &bin_length, + &bin, &bin_status, &status); CHECK_EQUAL(CL_SUCCESS, status); CHECK(program); ACL_LOCKED(CHECK(acl_program_is_valid(program))); - delete l_bin_status; - delete l_bin_lengths; - delete l_bin; return program; } void unload_program(cl_program program) { From 819c180a6507a133c49ab276a8e0fcc1c98fe1e8 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 4 Feb 2022 14:51:07 -0500 Subject: [PATCH 06/12] Test both read and write device global --- include/acl_mem.h | 4 ++ src/acl_mem.cpp | 156 +++++++++++++++++++++++++++++++++++------- test/acl_usm_test.cpp | 50 ++++++++++---- 3 files changed, 172 insertions(+), 38 deletions(-) diff --git a/include/acl_mem.h b/include/acl_mem.h index fe7674a8..e2f3078d 100644 --- a/include/acl_mem.h +++ b/include/acl_mem.h @@ -89,6 +89,10 @@ cl_bool acl_is_sub_or_parent_buffer(cl_mem mem); void CL_CALLBACK acl_free_allocation_after_event_completion( cl_event event, cl_int event_command_exec_status, void *callback_data); +void CL_CALLBACK acl_dev_global_cleanup(cl_event event, + cl_int event_command_exec_status, + void *callback_data); + #ifdef __GNUC__ #pragma GCC visibility pop #endif diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index b79c1b3d..5899f30a 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -410,15 +410,93 @@ ACL_EXPORT // CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL() { CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( cl_command_queue command_queue, cl_program program, const char *name, - cl_bool blocking_write, size_t size, size_t offset, void *ptr, + cl_bool blocking_read, size_t size, size_t offset, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { + cl_int status; - // TODO: get dev_global_ptr from autodiscovery instead later - // return 0; - return clEnqueueWriteGlobalVariableINTEL( - command_queue, program, name, blocking_write, size, offset, ptr, - num_events_in_wait_list, event_wait_list, event); + cl_kernel kernel = clCreateKernelIntelFPGA(program, name, &status); + if (status != CL_SUCCESS) { + return status; + } + + // dev_addr_t dev_global_address = + // kernel->dev_bin->get_devdef().autodiscovery_def.? + uintptr_t dev_global_address = 0x4000000; + void *dev_global_ptr = + (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits + status = set_kernel_arg_mem_pointer_without_checks(kernel, 0, dev_global_ptr); + // status = clSetKernelArgMemPointerINTEL(kernel, 1, dev_global_ptr); + if (status != CL_SUCCESS) { + return status; + } + + // Copy device global memory to temporary device usm pointer first + void *tmp_dev_ptr = clDeviceMemAllocINTEL( + command_queue->context, command_queue->device, NULL, size, 1, &status); + if (status != CL_SUCCESS) { + return status; + } + if (!tmp_dev_ptr) { + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + + status = clSetKernelArgMemPointerINTEL(kernel, 1, tmp_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + + // Set size kernel arg + status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size)); + if (status != CL_SUCCESS) { + return status; + } + + cl_event tmp_event = 0; + status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list, + event_wait_list, &tmp_event); + if (status != CL_SUCCESS) { + return status; + } + std::cerr << tmp_event->cmd.info.ndrange_kernel.invocation_wrapper->image + ->activation_id + << std::endl; + + // copy from the temporary device memory into user provided pointer + std::cerr << "read: copy from tmp dev pointer to source pointer" << std::endl; + status = clEnqueueMemcpyINTEL(command_queue, blocking_read, ptr, tmp_dev_ptr, + size, 1, &tmp_event, event); + if (status != CL_SUCCESS) { + return status; + } + + if (blocking_read) { + status = clReleaseEvent(tmp_event); + if (status != CL_SUCCESS) { + return status; + } + status = clMemFreeINTEL(command_queue->context, tmp_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + status = clReleaseKernel(kernel); + if (status != CL_SUCCESS) { + return status; + } + } else { + // Clean up resources after event finishes + void **callback_data = (void **)acl_malloc(sizeof(void *) * 3); + if (!callback_data) { + return CL_OUT_OF_HOST_MEMORY; + } + callback_data[0] = (void *)(tmp_dev_ptr); + callback_data[1] = (void *)(kernel); + callback_data[2] = (void *)(tmp_event); + clSetEventCallback(*event, CL_COMPLETE, acl_dev_global_cleanup, + (void *)callback_data); + } + + return CL_SUCCESS; } ACL_EXPORT @@ -452,6 +530,7 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( if (status != CL_SUCCESS) { return status; } + // if (to_dev_event->execution_status != CL_COMPLETE) { // return CL_INVALID_OPERATION; // } @@ -473,11 +552,10 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( // dev_addr_t dev_global_address = // kernel->dev_bin->get_devdef().autodiscovery_def.? uintptr_t dev_global_address = 0x4000000; - void *dev_global_ptr2 = + void *dev_global_ptr = (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits - status = - set_kernel_arg_mem_pointer_without_checks(kernel, 1, dev_global_ptr2); - // status = clSetKernelArgMemPointerINTEL(kernel, 1, dev_global_ptr2); + status = set_kernel_arg_mem_pointer_without_checks(kernel, 1, dev_global_ptr); + // status = clSetKernelArgMemPointerINTEL(kernel, 1, dev_global_ptr); if (status != CL_SUCCESS) { return status; } @@ -499,24 +577,54 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( if (blocking_write) { status = clWaitForEvents(1, event); + if (status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) { + return status; + } + status = clMemFreeINTEL(command_queue->context, src_dev_ptr); + if (status != CL_SUCCESS) { + return status; + } + status = clReleaseKernel(kernel); + if (status != CL_SUCCESS) { + return status; + } + } else { + // Clean up resources after event finishes + void **callback_data = (void **)acl_malloc(sizeof(void *) * 3); + if (!callback_data) { + return CL_OUT_OF_HOST_MEMORY; + } + callback_data[0] = (void *)(src_dev_ptr); + callback_data[1] = (void *)(kernel); + clSetEventCallback(*event, CL_COMPLETE, acl_dev_global_cleanup, + (void *)callback_data); } - if (blocking_write && - status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) { - return status; - } + return CL_SUCCESS; +} - // Free allocated device memory - status = clMemFreeINTEL(command_queue->context, src_dev_ptr); - if (status != CL_SUCCESS) { - return status; +void CL_CALLBACK acl_dev_global_cleanup(cl_event event, + cl_int event_command_exec_status, + void *callback_data) { + void **callback_ptrs = + (void **)callback_data; // callback_ptrs[0] is usm device pointer + // callback_ptrs[1] kernel to be released + // callback_ptrs[2] temporary event to be released + event_command_exec_status = + event_command_exec_status; // Avoiding Windows warning. + event = event; + acl_lock(); + if (callback_ptrs[0]) { + clMemFreeINTEL(event->context, callback_ptrs[0]); } - // status = clReleaseKernel(kernel); - // if (status != CL_SUCCESS) { - // return status; - // } - - return CL_SUCCESS; + if (callback_ptrs[1]) { + clReleaseKernel(((cl_kernel)callback_ptrs[1])); + } + if (callback_ptrs[2]) { + clReleaseEvent(((cl_event)callback_ptrs[2])); + } + acl_free(callback_data); + acl_unlock(); } ACL_EXPORT diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp index e0924ae4..5693ad44 100644 --- a/test/acl_usm_test.cpp +++ b/test/acl_usm_test.cpp @@ -1251,13 +1251,7 @@ MT_TEST(acl_usm, read_device_global) { char resultbuf[strsize]; cl_int status; - // Don't translate device addresses in the test HAL because we already - // will be passing in "device" memory that are actually in the host - // address space of the test executable. Ugh. - acltest_hal_emulate_device_mem = false; - cl_event write_event = 0; - cl_event copy_event = 0; cl_event read_event = 0; // Prepare host memory @@ -1271,18 +1265,46 @@ MT_TEST(acl_usm, read_device_global) { // CHECK(src_ptr != NULL); syncThreads(); + // Write to device global + status = clEnqueueWriteGlobalVariableINTEL( + m_cq, m_program, "kernel15_dev_global", CL_FALSE, strsize, 0, src_ptr, 0, + NULL, &write_event); + CHECK_EQUAL(CL_SUCCESS, status); - // Function of interest + // Read from device global, with dependence on write event status = clEnqueueReadGlobalVariableINTEL( - m_cq, m_program, "kernel15_dev_global", CL_FALSE, strsize, 0, src_ptr, 0, - NULL, ©_event); + m_cq, m_program, "kernel15_dev_global", CL_FALSE, strsize, 0, src_ptr, 1, + &write_event, &read_event); CHECK_EQUAL(CL_SUCCESS, status); - int activation_id = copy_event->cmd.info.ndrange_kernel.invocation_wrapper - ->image->activation_id; - acltest_call_kernel_update_callback(activation_id, CL_RUNNING); - acltest_call_kernel_update_callback(activation_id, CL_COMPLETE); + + // Manually set "write device global" event done + int write_activation_id = write_event->cmd.info.ndrange_kernel + .invocation_wrapper->image->activation_id; + acltest_call_kernel_update_callback(write_activation_id, CL_RUNNING); + acltest_call_kernel_update_callback(write_activation_id, CL_COMPLETE); + + // Nudge the scheduler to take above finish into account + acl_lock(); + // If nothing's blocking, then complete right away + acl_idle_update(m_cq->context); + acl_unlock(); + + // The event returned from read device global is not the copy kernel launch + // event Therefore need to first get the event that it depend on, then + // manually set it to complete + auto last_event = read_event->depend_on.end(); + last_event--; + cl_event read_copy_kernel_event = *last_event; + int read_activation_id = read_copy_kernel_event->cmd.info.ndrange_kernel + .invocation_wrapper->image->activation_id; + acltest_call_kernel_update_callback(read_activation_id, CL_RUNNING); + acltest_call_kernel_update_callback(read_activation_id, CL_COMPLETE); + // Now the usm copy operation will execute + + // Block on all event completion CHECK_EQUAL(CL_SUCCESS, clFinish(m_cq)); - CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(copy_event)); + CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(write_event)); + CHECK_EQUAL(CL_SUCCESS, clReleaseEvent(read_event)); // Host pointer example free(src_ptr); From 51a15d1c630d91d7f814f77fc964c3bde58f46da Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Mon, 7 Feb 2022 12:50:32 -0500 Subject: [PATCH 07/12] use null to indicate optional clean up callback data --- src/acl_mem.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index 5899f30a..32081956 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -596,6 +596,7 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( } callback_data[0] = (void *)(src_dev_ptr); callback_data[1] = (void *)(kernel); + callback_data[2] = NULL; clSetEventCallback(*event, CL_COMPLETE, acl_dev_global_cleanup, (void *)callback_data); } From ded93034813faf2601e2c110a34d7edf4e65b9e0 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 11 Feb 2022 15:05:29 -0500 Subject: [PATCH 08/12] Per kernel autodiscovery change ----------------------------------------- Currently each kernel will receive the info about device global address and size This is potentially not desired, a better design would be to have device global at autodiscovery device level instead, and kernel query for such information during runtime. --- include/acl.h | 2 ++ src/acl_auto_configure.cpp | 24 +++++++++++++++++++++++ src/acl_mem.cpp | 6 ++++-- test/acl_auto_configure_test.cpp | 20 ++++++++++++++----- test/acl_globals_test.cpp | 33 ++++++++++++++++++-------------- 5 files changed, 64 insertions(+), 21 deletions(-) diff --git a/include/acl.h b/include/acl.h index a351c938..990d13a1 100644 --- a/include/acl.h +++ b/include/acl.h @@ -230,6 +230,8 @@ typedef struct { fast_launch_depth; /* How many kernels can be buffered on the device, 0 means no buffering just one can execute*/ unsigned int is_sycl_compile; /* [1] SYCL compile; [0] OpenCL compile*/ + unsigned int device_global_address; /* Address of kernel's device global*/ + unsigned int device_global_size; /* Size of address space of device global used by this kernel*/ } acl_accel_def_t; /* An ACL system definition. diff --git a/src/acl_auto_configure.cpp b/src/acl_auto_configure.cpp index 320264ce..ab24197b 100644 --- a/src/acl_auto_configure.cpp +++ b/src/acl_auto_configure.cpp @@ -873,6 +873,30 @@ bool acl_load_device_def_from_str(const std::string &config_str, devdef.accel[i].is_sycl_compile, counters); } + devdef.accel[i].device_global_address = + 0; // Initializing for backward compatability + std::cerr << result << std::endl; + std::cerr << (counters.back() > 0) << std::endl; + if (result && counters.back() > 0) { + std::cerr << "read dev global address" << std::endl; + result = read_uint_counters(config_str, curr_pos, + devdef.accel[i].device_global_address, counters); + }else { + std::cerr << "read dev global address fail" << std::endl; + } + + + devdef.accel[i].device_global_size = + 0; // Initializing for backward compatability + if (result && counters.back() > 0) { + std::cerr << "read dev global size" << std::endl; + result = read_uint_counters(config_str, curr_pos, + devdef.accel[i].device_global_size, counters); + }else { + std::cerr << "read dev global size fail" << std::endl; + + } + // forward compatibility: bypassing remaining fields at the end of kernel // description section while (result && counters.size() > 0 && diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index 32081956..f6fce490 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -421,8 +421,10 @@ CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( } // dev_addr_t dev_global_address = - // kernel->dev_bin->get_devdef().autodiscovery_def.? - uintptr_t dev_global_address = 0x4000000; + uintptr_t dev_global_address = kernel->accel_def->device_global_address; + assert(kernel->accel_def->device_global_address == 4096); // TODO: remove when merging + // uintptr_t dev_global_address = 0x4000000; + // TODO: add checks for whether the copy will be out of bound for device global void *dev_global_ptr = (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits status = set_kernel_arg_mem_pointer_without_checks(kernel, 0, dev_global_ptr); diff --git a/test/acl_auto_configure_test.cpp b/test/acl_auto_configure_test.cpp index 65f950bd..988e04a9 100644 --- a/test/acl_auto_configure_test.cpp +++ b/test/acl_auto_configure_test.cpp @@ -96,15 +96,17 @@ TEST(auto_configure, simple) { #define IS_SYCL_COMPILE " 1" #define IS_NOT_SYCL_COMPILE " 0" +// device global information +#define KERNEL_DEVICE_GLOBAL_ADDRESS " 4096" +#define KERNEL_DEVICE_GLOBAL_SIZE " 2048" + int parsed; std::string err_str; - ACL_LOCKED( - parsed = acl_load_device_def_from_str( - std::string( + std::string autodiscovery = std::string( VERSIONIDTOSTR(ACL_AUTO_CONFIGURE_VERSIONID) DEVICE_FIELDS RANDOM_HASH " " BOARDNAME IS_NOT_BIG_ENDIAN MEM HOSTPIPE KERNEL_ARG_INFO_NONE - " 1 82 foo" KERNEL_CRA KERNEL_FAST_LAUNCH_DEPTH KERNEL_PERF_MON + " 1 84 foo" KERNEL_CRA KERNEL_FAST_LAUNCH_DEPTH KERNEL_PERF_MON // 84 = number of kernel field KERNEL_WORKGROUP_VARIANT KERNEL_WORKITEM_VARIANT KERNEL_NUM_VECTOR_LANES1 KERNEL_PROFILE_SCANCHAIN_LENGTH ARGS_LOCAL_GLOBAL_LONG_PROF KERNEL_PRINTF_FORMATSTRINGS @@ -112,8 +114,13 @@ TEST(auto_configure, simple) { KERNEL_MAX_WORK_GROUP_SIZE_NONE KERNEL_MAX_GLOBAL_WORK_DIM_NONE KERNEL_USES_GLOBAL_WORK_OFFSET_ENABLED - IS_SYCL_COMPILE), + IS_SYCL_COMPILE KERNEL_DEVICE_GLOBAL_ADDRESS KERNEL_DEVICE_GLOBAL_SIZE); + std::cerr << autodiscovery << std::endl; + ACL_LOCKED( + parsed = acl_load_device_def_from_str( + autodiscovery, m_device_def.autodiscovery_def, err_str)); + std::cerr << err_str << std::endl; CHECK_EQUAL(1, parsed); CHECK_EQUAL(1, m_device_def.autodiscovery_def.num_global_mem_systems); @@ -260,6 +267,9 @@ TEST(auto_configure, simple) { CHECK_EQUAL(0, (int)m_device_def.autodiscovery_def.accel[0].max_work_group_size); CHECK_EQUAL(1, (int)m_device_def.autodiscovery_def.accel[0].is_sycl_compile); + CHECK_EQUAL(4096, (int)m_device_def.autodiscovery_def.accel[0].device_global_address); + CHECK_EQUAL(2048, (int)m_device_def.autodiscovery_def.accel[0].device_global_size); + // Check a second parsing. // It should allocate a new string for the name. diff --git a/test/acl_globals_test.cpp b/test/acl_globals_test.cpp index c7e3938d..c2d1f1b1 100644 --- a/test/acl_globals_test.cpp +++ b/test/acl_globals_test.cpp @@ -198,20 +198,25 @@ static std::vector acltest_complex_system_device0_accel = { {}, {32768, 0, 0}, 1}, - {14, - ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]), - acltest_kernels[14], - acltest_laspace_info, - {0, 0, 0}, - 0, - 0, - 1, - 0, - 32768, - 3, - {}, - {32768, 0, 0}, - 1}, + {14, // id + ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]), // mem + acltest_kernels[14], // iface + acltest_laspace_info, // local_aspaces + {0, 0, 0}, // compile_work_group_size + 0, // is_workgroup_invariant + 0, // is_workitem_invariant + 1, // num_vector_lanes + 0, // profiling_words_to_readback + 32768, // max_work_group_size + 3, // max_global_work_dim + {}, // printf_format_info + {32768, 0, 0}, // max_work_group_size_arr + 1, // uses_global_work_offset + 0, // fast_launch_depth + 1, // is_sycl_compile + 4096, // device_global_address + 2048, // device_global_size + }, {1, ACL_RANGE_FROM_ARRAY(acltest_devicelocal[1]), acltest_kernels[1], From c767c31d0880e9b9002f42354af7d48f419d8f2b Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 11 Feb 2022 21:49:54 -0500 Subject: [PATCH 09/12] Add device global in device autodiscovery definition ---------------------------------------------------------- The autodiscovery specifies how many device global there are in the device Specify for each device global how many attribute it has (currently 3) The 3 attribute are: device global name, address, size The device global name are used in runtime to get address of device global when given the name in clEnqueueReadGlobalVariableINTEL or clEnqueueWriteGlobalVariableINTEL --- include/acl.h | 14 +++++- src/acl_auto_configure.cpp | 80 ++++++++++++++++++++++---------- src/acl_mem.cpp | 10 ++-- test/acl_auto_configure_test.cpp | 71 +++++++++++++++++----------- test/acl_globals_test.cpp | 37 +++++++-------- 5 files changed, 137 insertions(+), 75 deletions(-) diff --git a/include/acl.h b/include/acl.h index 990d13a1..a6e03dca 100644 --- a/include/acl.h +++ b/include/acl.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -230,8 +231,6 @@ typedef struct { fast_launch_depth; /* How many kernels can be buffered on the device, 0 means no buffering just one can execute*/ unsigned int is_sycl_compile; /* [1] SYCL compile; [0] OpenCL compile*/ - unsigned int device_global_address; /* Address of kernel's device global*/ - unsigned int device_global_size; /* Size of address space of device global used by this kernel*/ } acl_accel_def_t; /* An ACL system definition. @@ -480,6 +479,12 @@ typedef class acl_device_program_info_t *acl_device_program_info; */ #define ACL_MEM_CAPABILITY_P2P (1 << 3) +typedef struct acl_device_global_mem_def_t { + std::string name; + unsigned int address; + unsigned int size; +} acl_device_global_mem_def_t; + // Part of acl_device_def_t where members are populated from the information // in the autodiscovery string. This will get updated every time the device // is programmed with a new device binary as the new binary would contain a @@ -498,6 +503,11 @@ typedef struct acl_device_def_autodiscovery_t { std::array global_mem_defs; std::vector acl_hostpipe_info; + + // device global definition + unsigned int num_device_global; + std::unordered_map + device_global_mem_defs; } acl_device_def_autodiscovery_t; typedef struct acl_device_def_t { diff --git a/src/acl_auto_configure.cpp b/src/acl_auto_configure.cpp index ab24197b..61b02e9c 100644 --- a/src/acl_auto_configure.cpp +++ b/src/acl_auto_configure.cpp @@ -96,6 +96,7 @@ static bool read_uint_counters(const std::string &str, UNREFERENCED_PARAMETER(e); return false; } + return true; } @@ -493,6 +494,61 @@ bool acl_load_device_def_from_str(const std::string &config_str, counters); } + // Read device global information + unsigned int num_device_global = 0; + if (result && counters.back() > 0) { + result = + read_uint_counters(config_str, curr_pos, num_device_global, counters); + devdef.num_device_global = num_device_global; + + for (auto i = 0U; result && (i < num_device_global); + i++) { // device_global_memories + // read total number of fields in global_memories + int total_fields_device_global = 0; + if (counters.back() > 0) { + result = read_int_counters(config_str, curr_pos, + total_fields_device_global, counters); + } + + counters.emplace_back(total_fields_device_global); + + // read device global name + std::string device_global_name; + if (result && counters.back() > 0) { + result = read_string_counters(config_str, curr_pos, device_global_name, + counters); + } + + // read device global address + unsigned int dev_global_addr = 0; // Default + if (result && counters.back() > 0) { + result = + read_uint_counters(config_str, curr_pos, dev_global_addr, counters); + } + // read device global address size + unsigned int dev_global_size = 0; // Default + if (result && counters.back() > 0) { + result = + read_uint_counters(config_str, curr_pos, dev_global_size, counters); + } + + acl_device_global_mem_def_t dev_global_def = { + device_global_name, dev_global_addr, dev_global_size}; + devdef.device_global_mem_defs[device_global_name] = dev_global_def; + + // forward compatibility: bypassing remaining fields at the end of global + // memory + while (result && counters.size() > 0 && + counters.back() > 0) { // total_fields_device_global>0 + std::string tmp; + result = + result && read_string_counters(config_str, curr_pos, tmp, counters); + check_section_counters(counters); + } + counters.pop_back(); // removing total_fields_device_global + } // device_global_memories + } + // forward compatibility: bypassing remaining fields at the end of device // description section while (result && counters.size() > 0 && @@ -873,30 +929,6 @@ bool acl_load_device_def_from_str(const std::string &config_str, devdef.accel[i].is_sycl_compile, counters); } - devdef.accel[i].device_global_address = - 0; // Initializing for backward compatability - std::cerr << result << std::endl; - std::cerr << (counters.back() > 0) << std::endl; - if (result && counters.back() > 0) { - std::cerr << "read dev global address" << std::endl; - result = read_uint_counters(config_str, curr_pos, - devdef.accel[i].device_global_address, counters); - }else { - std::cerr << "read dev global address fail" << std::endl; - } - - - devdef.accel[i].device_global_size = - 0; // Initializing for backward compatability - if (result && counters.back() > 0) { - std::cerr << "read dev global size" << std::endl; - result = read_uint_counters(config_str, curr_pos, - devdef.accel[i].device_global_size, counters); - }else { - std::cerr << "read dev global size fail" << std::endl; - - } - // forward compatibility: bypassing remaining fields at the end of kernel // description section while (result && counters.size() > 0 && diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index f6fce490..81349d37 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -421,10 +421,12 @@ CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( } // dev_addr_t dev_global_address = - uintptr_t dev_global_address = kernel->accel_def->device_global_address; - assert(kernel->accel_def->device_global_address == 4096); // TODO: remove when merging - // uintptr_t dev_global_address = 0x4000000; - // TODO: add checks for whether the copy will be out of bound for device global + // uintptr_t dev_global_address = + // kernel->dev_bin->get_devdef().autodiscovery_def.device_global_mem_defs[name]; + // uintptr_t dev_global_address = kernel->accel_def->device_global_address; + uintptr_t dev_global_address = 0x4000000; + // TODO: add checks for whether the copy will be out of bound for device + // global void *dev_global_ptr = (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits status = set_kernel_arg_mem_pointer_without_checks(kernel, 0, dev_global_ptr); diff --git a/test/acl_auto_configure_test.cpp b/test/acl_auto_configure_test.cpp index 988e04a9..3cefa709 100644 --- a/test/acl_auto_configure_test.cpp +++ b/test/acl_auto_configure_test.cpp @@ -36,6 +36,7 @@ TEST(auto_configure, simple) { #define VERSIONIDSTRINGIFY(x) #x #define VERSIONIDTOSTR(x) VERSIONIDSTRINGIFY(x) #define DEVICE_FIELDS " 23" +#define DEVICE_FIELDS_DEV_GLOBAL " 32" #define DEVICE_FIELDS_OLD " 18" #define BOARDNAME "de4_gen2x4_swdimm" #define BOARDNAME2 "pcie385_a7" @@ -96,31 +97,32 @@ TEST(auto_configure, simple) { #define IS_SYCL_COMPILE " 1" #define IS_NOT_SYCL_COMPILE " 0" -// device global information -#define KERNEL_DEVICE_GLOBAL_ADDRESS " 4096" -#define KERNEL_DEVICE_GLOBAL_SIZE " 2048" +// device global +#define NUM_DEV_GLOBAL " 2" +#define NUM_DEV_GLOBAL_FIELD " 3" // containing dev_globa_name, address, size +#define DEV_GLOBAL_1 \ + " kernel15_dev_global 4096 2048" // in format of dev_globa_name, address, size +#define DEV_GLOBAL_2 " kernel15_dev_global2 2048 1024" int parsed; std::string err_str; std::string autodiscovery = std::string( - VERSIONIDTOSTR(ACL_AUTO_CONFIGURE_VERSIONID) - DEVICE_FIELDS RANDOM_HASH - " " BOARDNAME IS_NOT_BIG_ENDIAN MEM HOSTPIPE KERNEL_ARG_INFO_NONE - " 1 84 foo" KERNEL_CRA KERNEL_FAST_LAUNCH_DEPTH KERNEL_PERF_MON // 84 = number of kernel field - KERNEL_WORKGROUP_VARIANT KERNEL_WORKITEM_VARIANT - KERNEL_NUM_VECTOR_LANES1 KERNEL_PROFILE_SCANCHAIN_LENGTH - ARGS_LOCAL_GLOBAL_LONG_PROF KERNEL_PRINTF_FORMATSTRINGS - LD_1024 KERNEL_REQD_WORK_GROUP_SIZE_NONE - KERNEL_MAX_WORK_GROUP_SIZE_NONE - KERNEL_MAX_GLOBAL_WORK_DIM_NONE - KERNEL_USES_GLOBAL_WORK_OFFSET_ENABLED - IS_SYCL_COMPILE KERNEL_DEVICE_GLOBAL_ADDRESS KERNEL_DEVICE_GLOBAL_SIZE); - std::cerr << autodiscovery << std::endl; - ACL_LOCKED( - parsed = acl_load_device_def_from_str( - autodiscovery, - m_device_def.autodiscovery_def, err_str)); - std::cerr << err_str << std::endl; + VERSIONIDTOSTR(ACL_AUTO_CONFIGURE_VERSIONID) + DEVICE_FIELDS_DEV_GLOBAL RANDOM_HASH + " " BOARDNAME IS_NOT_BIG_ENDIAN MEM HOSTPIPE KERNEL_ARG_INFO_NONE + NUM_DEV_GLOBAL NUM_DEV_GLOBAL_FIELD DEV_GLOBAL_1 NUM_DEV_GLOBAL_FIELD + DEV_GLOBAL_2 + " 1 82 foo" KERNEL_CRA KERNEL_FAST_LAUNCH_DEPTH KERNEL_PERF_MON + KERNEL_WORKGROUP_VARIANT KERNEL_WORKITEM_VARIANT + KERNEL_NUM_VECTOR_LANES1 KERNEL_PROFILE_SCANCHAIN_LENGTH + ARGS_LOCAL_GLOBAL_LONG_PROF KERNEL_PRINTF_FORMATSTRINGS + LD_1024 KERNEL_REQD_WORK_GROUP_SIZE_NONE + KERNEL_MAX_WORK_GROUP_SIZE_NONE + KERNEL_MAX_GLOBAL_WORK_DIM_NONE + KERNEL_USES_GLOBAL_WORK_OFFSET_ENABLED + IS_SYCL_COMPILE); + ACL_LOCKED(parsed = acl_load_device_def_from_str( + autodiscovery, m_device_def.autodiscovery_def, err_str)); CHECK_EQUAL(1, parsed); CHECK_EQUAL(1, m_device_def.autodiscovery_def.num_global_mem_systems); @@ -267,9 +269,26 @@ TEST(auto_configure, simple) { CHECK_EQUAL(0, (int)m_device_def.autodiscovery_def.accel[0].max_work_group_size); CHECK_EQUAL(1, (int)m_device_def.autodiscovery_def.accel[0].is_sycl_compile); - CHECK_EQUAL(4096, (int)m_device_def.autodiscovery_def.accel[0].device_global_address); - CHECK_EQUAL(2048, (int)m_device_def.autodiscovery_def.accel[0].device_global_size); + CHECK_EQUAL(2, (int)m_device_def.autodiscovery_def.num_device_global); + CHECK(m_device_def.autodiscovery_def.device_global_mem_defs.find( + "kernel15_dev_global") != + m_device_def.autodiscovery_def.device_global_mem_defs.end()); + CHECK(m_device_def.autodiscovery_def.device_global_mem_defs.find( + "kernel15_dev_global2") != + m_device_def.autodiscovery_def.device_global_mem_defs.end()); + CHECK_EQUAL(4096, m_device_def.autodiscovery_def + .device_global_mem_defs["kernel15_dev_global"] + .address); + CHECK_EQUAL(2048, m_device_def.autodiscovery_def + .device_global_mem_defs["kernel15_dev_global"] + .size); + CHECK_EQUAL(2048, m_device_def.autodiscovery_def + .device_global_mem_defs["kernel15_dev_global2"] + .address); + CHECK_EQUAL(1024, m_device_def.autodiscovery_def + .device_global_mem_defs["kernel15_dev_global2"] + .size); // Check a second parsing. // It should allocate a new string for the name. @@ -470,8 +489,8 @@ TEST(auto_configure, many_ok_forward_compatibility) { ACL_AUTO_CONFIGURE_VERSIONID) " 28 " "sample40byterandomhash000000000000000000 " "a10gx 0 1 15 DDR 2 1 6 0 2147483648 100 " - "100 100 100 200 200 200 200 0 0 0 0 400 " - "400 400 400 400 47 " + "100 100 100 200 200 200 200 0 0 0 0 2 " + "1 1 1 400 47 " "40 external_sort_stage_0 0 128 1 0 0 1 0 " "1 0 1 10 0 0 4 1 0 0 500 500 500 500 0 0 " "0 0 1 1 1 3 1 1 1 3 1 800 800 800 800 800 " @@ -1185,7 +1204,7 @@ TEST(auto_configure, hostpipe) { "200 " "2 9 host_to_dev 1 0 32 32768 300 300 300 " "300 dev_to_host 0 1 32 32768 300 300 300 " - "300 400 400 400 400 400 0 " + "300 400 1 3 400 400 0 " "1 29 foo 0 128 1 0 0 1 0 1 0 0 0 0 0 0 1 " "1 1 3 1 1 1 3 1 800 800 800 800 800 900 " "900" diff --git a/test/acl_globals_test.cpp b/test/acl_globals_test.cpp index c2d1f1b1..2e1214ac 100644 --- a/test/acl_globals_test.cpp +++ b/test/acl_globals_test.cpp @@ -198,25 +198,24 @@ static std::vector acltest_complex_system_device0_accel = { {}, {32768, 0, 0}, 1}, - {14, // id - ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]), // mem - acltest_kernels[14], // iface - acltest_laspace_info, // local_aspaces - {0, 0, 0}, // compile_work_group_size - 0, // is_workgroup_invariant - 0, // is_workitem_invariant - 1, // num_vector_lanes - 0, // profiling_words_to_readback - 32768, // max_work_group_size - 3, // max_global_work_dim - {}, // printf_format_info - {32768, 0, 0}, // max_work_group_size_arr - 1, // uses_global_work_offset - 0, // fast_launch_depth - 1, // is_sycl_compile - 4096, // device_global_address - 2048, // device_global_size - }, + { + 14, // id + ACL_RANGE_FROM_ARRAY(acltest_devicelocal[11]), // mem + acltest_kernels[14], // iface + acltest_laspace_info, // local_aspaces + {0, 0, 0}, // compile_work_group_size + 0, // is_workgroup_invariant + 0, // is_workitem_invariant + 1, // num_vector_lanes + 0, // profiling_words_to_readback + 32768, // max_work_group_size + 3, // max_global_work_dim + {}, // printf_format_info + {32768, 0, 0}, // max_work_group_size_arr + 1, // uses_global_work_offset + 0, // fast_launch_depth + 1, // is_sycl_compile + }, {1, ACL_RANGE_FROM_ARRAY(acltest_devicelocal[1]), acltest_kernels[1], From 5eb553a3f872661a0380e3c2648d02d47367646b Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 4 Mar 2022 15:42:19 -0500 Subject: [PATCH 10/12] Change autodiscovery format ------------------------------------ Now the autodiscovery format is changed to num_device_global num_field_in_device_global [(dev_global_name, dev_global_addr, devl_global_size), (dev_global_name, dev_global_addr, devl_global_size), ...] This is because each device global will have same number of field anyways, so itis not necessary ot keep a number of field for each device global --- src/acl_auto_configure.cpp | 14 +++++++------- test/acl_auto_configure_test.cpp | 5 ++--- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/acl_auto_configure.cpp b/src/acl_auto_configure.cpp index 61b02e9c..5be8fb1d 100644 --- a/src/acl_auto_configure.cpp +++ b/src/acl_auto_configure.cpp @@ -501,15 +501,15 @@ bool acl_load_device_def_from_str(const std::string &config_str, read_uint_counters(config_str, curr_pos, num_device_global, counters); devdef.num_device_global = num_device_global; + // read total number of fields in device global + int total_fields_device_global = 0; + if (result) { + result = read_int_counters(config_str, curr_pos, + total_fields_device_global, counters); + } + for (auto i = 0U; result && (i < num_device_global); i++) { // device_global_memories - // read total number of fields in global_memories - int total_fields_device_global = 0; - if (counters.back() > 0) { - result = read_int_counters(config_str, curr_pos, - total_fields_device_global, counters); - } - counters.emplace_back(total_fields_device_global); // read device global name diff --git a/test/acl_auto_configure_test.cpp b/test/acl_auto_configure_test.cpp index 3cefa709..ca71e53a 100644 --- a/test/acl_auto_configure_test.cpp +++ b/test/acl_auto_configure_test.cpp @@ -36,7 +36,7 @@ TEST(auto_configure, simple) { #define VERSIONIDSTRINGIFY(x) #x #define VERSIONIDTOSTR(x) VERSIONIDSTRINGIFY(x) #define DEVICE_FIELDS " 23" -#define DEVICE_FIELDS_DEV_GLOBAL " 32" +#define DEVICE_FIELDS_DEV_GLOBAL " 30" #define DEVICE_FIELDS_OLD " 18" #define BOARDNAME "de4_gen2x4_swdimm" #define BOARDNAME2 "pcie385_a7" @@ -110,8 +110,7 @@ TEST(auto_configure, simple) { VERSIONIDTOSTR(ACL_AUTO_CONFIGURE_VERSIONID) DEVICE_FIELDS_DEV_GLOBAL RANDOM_HASH " " BOARDNAME IS_NOT_BIG_ENDIAN MEM HOSTPIPE KERNEL_ARG_INFO_NONE - NUM_DEV_GLOBAL NUM_DEV_GLOBAL_FIELD DEV_GLOBAL_1 NUM_DEV_GLOBAL_FIELD - DEV_GLOBAL_2 + NUM_DEV_GLOBAL NUM_DEV_GLOBAL_FIELD DEV_GLOBAL_1 DEV_GLOBAL_2 " 1 82 foo" KERNEL_CRA KERNEL_FAST_LAUNCH_DEPTH KERNEL_PERF_MON KERNEL_WORKGROUP_VARIANT KERNEL_WORKITEM_VARIANT KERNEL_NUM_VECTOR_LANES1 KERNEL_PROFILE_SCANCHAIN_LENGTH From 9716fab70606c5e67593f7677b713e79110ae811 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 11 Mar 2022 17:56:04 -0500 Subject: [PATCH 11/12] Add clear functional documentation and line comments --- include/acl.h | 1 + include/acl_mem.h | 4 + src/acl_kernel.cpp | 16 ++++ src/acl_mem.cpp | 193 +++++++++++++++++++++++++++++--------- test/acl_globals_test.cpp | 84 ++++++++++------- test/acl_program_test.cpp | 11 ++- test/acl_usm_test.cpp | 12 +-- 7 files changed, 234 insertions(+), 87 deletions(-) diff --git a/include/acl.h b/include/acl.h index a6e03dca..27e8d63b 100644 --- a/include/acl.h +++ b/include/acl.h @@ -506,6 +506,7 @@ typedef struct acl_device_def_autodiscovery_t { // device global definition unsigned int num_device_global; + // std::vector device_global_mem_defs; std::unordered_map device_global_mem_defs; } acl_device_def_autodiscovery_t; diff --git a/include/acl_mem.h b/include/acl_mem.h index e2f3078d..fe46fca9 100644 --- a/include/acl_mem.h +++ b/include/acl_mem.h @@ -93,6 +93,10 @@ void CL_CALLBACK acl_dev_global_cleanup(cl_event event, cl_int event_command_exec_status, void *callback_data); +cl_int acl_extract_device_global_address(cl_kernel kernel, + const char *dev_global_name, + unsigned int *ret_addr); + #ifdef __GNUC__ #pragma GCC visibility pop #endif diff --git a/src/acl_kernel.cpp b/src/acl_kernel.cpp index b816fa6c..8da91119 100644 --- a/src/acl_kernel.cpp +++ b/src/acl_kernel.cpp @@ -837,6 +837,22 @@ CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer( return clSetKernelArgSVMPointerIntelFPGA(kernel, arg_index, arg_value); } +/** + * Set any provided void pointer as kernel arguments + * + * It is assumed that the provided pointer is a valid device address, + * or device global address that kernel can use to point to right address space. + * + * It is the same as `clSetKernelArgMemPointerINTEL` except the validity checks + * are removed. This is because the user provided pointer may not always be usm + * pointer, therefore will not belong to the context (as they are checked in + * clSetKernelArgMemPointerINTEL) + * + * @param kernel the kernel that accept the pointer arg + * @param arg_index which kernel argument accept the value + * @param arg_value the pointer to desired address space + * @return status code, CL_SUCCESS if all operations are successful. + */ cl_int set_kernel_arg_mem_pointer_without_checks(cl_kernel kernel, cl_uint arg_index, void *arg_value) { diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index 81349d37..1404fef6 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -406,8 +406,32 @@ int acl_bind_buffer_to_device(cl_device_id device, cl_mem mem) { return 1; } +/** + * Read bytes of data from device global + * + * This function extract the device global address with given name, create + * kernel from the given program, create temporary device usm pointer to hold + * data that will be copied from device global. Launch the copy kernel with + * correct pointers set as src and destination, then enqueue a memory copy + * operation that depend on copy kernel to copy from temporary device usm + * pointer into user provided host pointer. Then register a callback to release + * the necessary events, kernels, memories. + * + * @param command_queue the queue system this copy kernel will belong + * @param program contains copy kernel + * @param name name of device global, used to look up for device global address + * in autodiscovery string + * @param blocking_read whether the operation is blocking or not + * @param size number of bytes to read / write + * @param offset offset from the extracted address of device global + * @param ptr pointer that will hold the data copied from device global + * @param num_events_in_wait_list number of event that copy kernel depend on + * @param event_wait_list events that copy kernel depend on + * @param event the info about the execution of copy kernel will be stored in + * the event + * @return status code, CL_SUCCESS if all operations are successful. + */ ACL_EXPORT -// CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL() { CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( cl_command_queue command_queue, cl_program program, const char *name, cl_bool blocking_read, size_t size, size_t offset, void *ptr, @@ -415,27 +439,36 @@ CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( cl_event *event) { cl_int status; - cl_kernel kernel = clCreateKernelIntelFPGA(program, name, &status); + // Get copy kernel from the program + cl_kernel kernel = + clCreateKernelIntelFPGA(program, "copy_device_global", &status); if (status != CL_SUCCESS) { return status; } - // dev_addr_t dev_global_address = - // uintptr_t dev_global_address = - // kernel->dev_bin->get_devdef().autodiscovery_def.device_global_mem_defs[name]; - // uintptr_t dev_global_address = kernel->accel_def->device_global_address; - uintptr_t dev_global_address = 0x4000000; - // TODO: add checks for whether the copy will be out of bound for device - // global + // Look up device global address with use provided name + unsigned int device_global_addr_num = 0; + status = + acl_extract_device_global_address(kernel, name, &device_global_addr_num); + if (status != CL_SUCCESS) + return status; + uintptr_t dev_global_address = device_global_addr_num; + + // Calculate the offset from the device global address, offset is in byte unit void *dev_global_ptr = (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits + // TODO: add checks for whether the copy will be out of bound for device + // global + + // Set the device global pointer as the kernel destination args status = set_kernel_arg_mem_pointer_without_checks(kernel, 0, dev_global_ptr); // status = clSetKernelArgMemPointerINTEL(kernel, 1, dev_global_ptr); if (status != CL_SUCCESS) { return status; } - // Copy device global memory to temporary device usm pointer first + // Copy device global memory to temporary device usm pointer first to minimize + // area cost void *tmp_dev_ptr = clDeviceMemAllocINTEL( command_queue->context, command_queue->device, NULL, size, 1, &status); if (status != CL_SUCCESS) { @@ -450,24 +483,22 @@ CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( return status; } - // Set size kernel arg + // Propagate size information to kernel status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size)); if (status != CL_SUCCESS) { return status; } + // Enqueue copy kernel execution cl_event tmp_event = 0; status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list, event_wait_list, &tmp_event); if (status != CL_SUCCESS) { return status; } - std::cerr << tmp_event->cmd.info.ndrange_kernel.invocation_wrapper->image - ->activation_id - << std::endl; - // copy from the temporary device memory into user provided pointer - std::cerr << "read: copy from tmp dev pointer to source pointer" << std::endl; + // Copy from the temporary device memory into user provided host pointer + // give this event back to user, not the copy kernel one status = clEnqueueMemcpyINTEL(command_queue, blocking_read, ptr, tmp_dev_ptr, size, 1, &tmp_event, event); if (status != CL_SUCCESS) { @@ -475,6 +506,7 @@ CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( } if (blocking_read) { + // If blocking, first wait for event to finish, then clean up the resources. status = clReleaseEvent(tmp_event); if (status != CL_SUCCESS) { return status; @@ -503,6 +535,29 @@ CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( return CL_SUCCESS; } +/** + * Write bytes of data from user provided host pointer into device global + * + * This function extract the device global address with given name, create + * kernel from the given program, create temporary device usm pointer to hold + * user provided data through usm copy operation. Launch the copy kernel with + * correct pointers set as src and destination. Then register a callback to + * release the necessary events, kernels, memories. + * + * @param command_queue the queue system this copy kernel will belong + * @param program contains copy kernel + * @param name name of device global, used to look up for device global address + * in autodiscovery string + * @param blocking_write whether the operation is blocking or not + * @param size number of bytes to read / write + * @param offset offset from the extracted address of device global + * @param ptr pointer that will hold the data copied from device global + * @param num_events_in_wait_list number of event that copy kernel depend on + * @param event_wait_list events that copy kernel depend on + * @param event the info about the execution of copy kernel will be stored in + * the event + * @return status code, CL_SUCCESS if all operations are successful. + */ ACL_EXPORT CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( cl_command_queue command_queue, cl_program program, const char *name, @@ -511,63 +566,64 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( cl_event *event) { cl_int status; - cl_kernel kernel = clCreateKernelIntelFPGA(program, name, &status); + // Get copy kernel from the program + cl_kernel kernel = + clCreateKernelIntelFPGA(program, "copy_device_global", &status); if (status != CL_SUCCESS) { return status; } - // do we support ptr being a buffer instead of usm pointer? it seems to be the - // case on spec (only usm host pointer) Given kernel arg must be a deivce usm - // pointer: When ptr is a host/shared usm pointer, this function is expected - // to copy it to device first? yes (currently discussing whether kernel arg - // accept host usm instead) + // Allocate a temporary device usm pointer to hold user data + // This is done to minimize kernel area, as device to device memory + // interconnect occupies least amount of memory void *src_dev_ptr = clDeviceMemAllocINTEL( command_queue->context, command_queue->device, NULL, size, 1, &status); if (status != CL_SUCCESS) { return status; } - // This copy operation have to be blocking - // cl_event to_dev_event = 0; + // Copy from user provide host pointer to temporary device usm pointer status = clEnqueueMemcpyINTEL(command_queue, CL_TRUE, src_dev_ptr, ptr, size, 0, NULL, NULL); if (status != CL_SUCCESS) { return status; } - // if (to_dev_event->execution_status != CL_COMPLETE) { - // return CL_INVALID_OPERATION; - // } - + // Set the source of copy (in kernel arg) as the temporary device usm pointer status = clSetKernelArgMemPointerINTEL(kernel, 0, src_dev_ptr); if (status != CL_SUCCESS) { return status; } - // should kernel header contain offset or not? no - // offset is always byte offset? yes - // Assuming below returns same thing as `clDeviceMemAllocINTEL`, where exactly - // is dev global ptr being read from? What format is the read dev global, is - // it a pointer just like the return value of `clDeviceMemAllocINTEL` or is it - // something else? (its an unsigned value indicating address) - // TODO: When passing dev global pointer directly to - // clSetKernelArgMemPointerINTEL, make sure REMOVE_VALID_CHECKS is defined. - // Otherwise this ptr is not existing in context -> cause checks to fail - // TODO: get dev_global_address from autodiscovery instead later - // dev_addr_t dev_global_address = - // kernel->dev_bin->get_devdef().autodiscovery_def.? - uintptr_t dev_global_address = 0x4000000; + + // Look up device global address with use provided name + unsigned int device_global_addr_num = 0; + status = + acl_extract_device_global_address(kernel, name, &device_global_addr_num); + if (status != CL_SUCCESS) + return status; + uintptr_t dev_global_address = device_global_addr_num; + + // TODO: remove the following checks, as it only works for unit test + assert(((unsigned int)dev_global_address) == 0x1024); + // Calculate the offset from the device global address, offset is in byte unit void *dev_global_ptr = (void *)(dev_global_address + offset * 8); // 1 unit of offset is 8 bits + // TODO: add checks for whether the copy will be out of bound for device + // global + + // Set the device global pointer as the kernel destination args status = set_kernel_arg_mem_pointer_without_checks(kernel, 1, dev_global_ptr); - // status = clSetKernelArgMemPointerINTEL(kernel, 1, dev_global_ptr); if (status != CL_SUCCESS) { return status; } + + // Propagate size information to kernel status = clSetKernelArg(kernel, 2, sizeof(size_t), (const void *)(&size)); if (status != CL_SUCCESS) { return status; } + // Enqueue kernel execution status = clEnqueueTask(command_queue, kernel, num_events_in_wait_list, event_wait_list, event); if (status != CL_SUCCESS) { @@ -580,6 +636,7 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( acl_unlock(); if (blocking_write) { + // If blocking, first wait for event to finish, then clean up the resources. status = clWaitForEvents(1, event); if (status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) { return status; @@ -593,7 +650,7 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( return status; } } else { - // Clean up resources after event finishes + // Otherwise, clean up resources after event finishes void **callback_data = (void **)acl_malloc(sizeof(void *) * 3); if (!callback_data) { return CL_OUT_OF_HOST_MEMORY; @@ -608,6 +665,53 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( return CL_SUCCESS; } +/** + * Look up device global address in autodiscovery def using user provided name + * + * @param kernel the copy kernel + * @param dev_global_name name of device global to query + * @param ret_addr hold the resulting address of device global + * @return status code, CL_SUCCESS if all operations are successful. + */ +cl_int acl_extract_device_global_address(cl_kernel kernel, + const char *dev_global_name, + unsigned int *ret_addr) { + acl_lock(); + // In full system flow, the autodiscovery string is available through kernel's + // dev_bin, however it is not in unit testing framework Thus, try to find + // device global definition first on kernel's device bin, if not found then + // try on acl_present_board_def(), which is set in unit test setup steps. + std::unordered_map dev_global_map = + kernel->dev_bin->get_devdef().autodiscovery_def.device_global_mem_defs; + std::unordered_map::const_iterator + dev_global = dev_global_map.find(dev_global_name); + if (dev_global != dev_global_map.end()) { + *ret_addr = dev_global->second.address; + UNLOCK_RETURN(CL_SUCCESS); + } + // Device global name not found in kernel dev_bin, try to find in the sysdef + // setup by unit tests + dev_global_map = acl_present_board_def() + ->device[0] + .autodiscovery_def.device_global_mem_defs; + dev_global = dev_global_map.find(dev_global_name); + if (dev_global != dev_global_map.end()) { + *ret_addr = dev_global->second.address; + UNLOCK_RETURN(CL_SUCCESS); + } + UNLOCK_RETURN(CL_INVALID_VALUE); +} + +/** + * Callback function that executes after copy event finishes + * + * Main responsible for free heap memories, device memories + * + * @param event the event that this callback is registered on + * @param event_command_exec_status the status of of event (queue, complete etc) + * @param callback_data hold the resources that need to be released + * @return nothing + */ void CL_CALLBACK acl_dev_global_cleanup(cl_event event, cl_int event_command_exec_status, void *callback_data) { @@ -620,12 +724,15 @@ void CL_CALLBACK acl_dev_global_cleanup(cl_event event, event = event; acl_lock(); if (callback_ptrs[0]) { + // Free intermediate device usm pointers clMemFreeINTEL(event->context, callback_ptrs[0]); } if (callback_ptrs[1]) { + // Release kernel from hep memory as its associated device memory clReleaseKernel(((cl_kernel)callback_ptrs[1])); } if (callback_ptrs[2]) { + // Release event from hep memory clReleaseEvent(((cl_event)callback_ptrs[2])); } acl_free(callback_data); diff --git a/test/acl_globals_test.cpp b/test/acl_globals_test.cpp index 2e1214ac..fd03a14d 100644 --- a/test/acl_globals_test.cpp +++ b/test/acl_globals_test.cpp @@ -120,7 +120,7 @@ static acl_kernel_interface_t acltest_kernels[] = { {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(int *), 0, 0, 1024}, }}, {// interface - "kernel15_dev_global", + "copy_device_global", { {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(void *), 0, 0, 1}, {ACL_ARG_ADDR_GLOBAL, ACL_ARG_MEM_OBJ, sizeof(void *), 0, 0, 1}, @@ -590,22 +590,31 @@ static acl_system_def_t acltest_complex_system = { 1, 0, /* alloc capabilities */ 0, /* min_host_mem_alignment */ - {"fpga0", - "sample40byterandomhash000000000000000000", - 0, - acltest_complex_system_device0_accel, /* accel */ - {}, /* hal_info */ - 1, // number of global memory systems - { - /* global mem info array */ - { - /* global mem info for memory 0 */ - /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), - /* acl_system_global_mem_type_t */ ACL_GLOBAL_MEM_DEVICE_PRIVATE, - /* num_global_bank */ 2, - /* burst_interleaved */ 1, - }, - }}}, + { + "fpga0", + "sample40byterandomhash000000000000000000", + 0, + acltest_complex_system_device0_accel, /* accel */ + {}, /* hal_info */ + 1, // number of global memory systems + { + /* global mem info array */ + { + /* global mem info for memory 0 */ + /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), + /* acl_system_global_mem_type_t */ + ACL_GLOBAL_MEM_DEVICE_PRIVATE, + /* num_global_bank */ 2, + /* burst_interleaved */ 1, + }, + }, + {}, // acl_hostpipe_info + 1, // num_device_global + { + // device_global_mem_defs map + {"dev_global_name", {"dev_global_name", 0x1024, 2048}}, + }, + }}, {nullptr, 1, 1, @@ -615,22 +624,31 @@ static acl_system_def_t acltest_complex_system = { 0, 0, /* alloc capabilities */ 0, /* min_host_mem_alignment */ - {"fpga1", - "sample40byterandomhash000000000000000001", - 0, - acltest_complex_system_device1_accel, /* accel */ - {}, /* hal_info */ - 1, // number of global memory systems - { - /* global mem info array */ - { - /* global mem info for memory 0 */ - /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), - /* acl_system_global_mem_type_t */ ACL_GLOBAL_MEM_DEVICE_PRIVATE, - /* num_global_bank */ 2, - /* burst_interleaved */ 1, - }, - }}}, + { + "fpga1", + "sample40byterandomhash000000000000000001", + 0, + acltest_complex_system_device1_accel, /* accel */ + {}, /* hal_info */ + 1, // number of global memory systems + { + /* global mem info array */ + { + /* global mem info for memory 0 */ + /* global mem */ ACL_RANGE_FROM_ARRAY(acltest_global), + /* acl_system_global_mem_type_t */ + ACL_GLOBAL_MEM_DEVICE_PRIVATE, + /* num_global_bank */ 2, + /* burst_interleaved */ 1, + }, + }, + {}, // acl_hostpipe_info + 1, // num_device_global + { + // device_global_mem_defs map + {"dev_global_name", {"dev_global_name", 0x1024, 2048}}, + }, + }}, {nullptr, 2, 1, diff --git a/test/acl_program_test.cpp b/test/acl_program_test.cpp index 064ccadc..1bdd8866 100644 --- a/test/acl_program_test.cpp +++ b/test/acl_program_test.cpp @@ -612,7 +612,7 @@ MT_TEST(acl_program, program_info) { // built stat. to success even before calling clbuildprogram CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &size_ret)); - CHECK_EQUAL(341, size_ret); + CHECK_EQUAL(340, size_ret); CHECK_EQUAL(CL_SUCCESS, clBuildProgram(program, 0, 0, "", 0, 0)); @@ -633,21 +633,22 @@ MT_TEST(acl_program, program_info) { CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 0, NULL, &size_ret)); - CHECK_EQUAL(341, size_ret); + CHECK_EQUAL(340, size_ret); // CHECK_EQUAL(321, size_ret); names[size_ret] = 100; // making sure extra bytes of memory are not affected. CHECK_EQUAL(CL_SUCCESS, clGetProgramInfo(program, CL_PROGRAM_KERNEL_NAMES, 2000 * sizeof(char), names, &size_ret)); - CHECK_EQUAL(341, size_ret); // only one kernel named: "foo" + CHECK_EQUAL(340, size_ret); // only one kernel named: "foo" CHECK_EQUAL(100, names[size_ret]); - CHECK_EQUAL(0, strcmp("kernel0_copy_vecin_vecout;" + CHECK_EQUAL(0, strcmp("copy_device_global;" // TODO: eventually we would want + // to hide it from users + "kernel0_copy_vecin_vecout;" "kernel11_task_double;" "kernel12_task_double;" "kernel13_multi_vec_lane;" "kernel14_svm_arg_alignment;" - "kernel15_dev_global;" "kernel1_vecadd_vecin_vecin_vecout;" "kernel2_vecscale_vecin_scalar_vecout;" "kernel3_locals;" diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp index 5693ad44..04fcdcf0 100644 --- a/test/acl_usm_test.cpp +++ b/test/acl_usm_test.cpp @@ -1266,15 +1266,15 @@ MT_TEST(acl_usm, read_device_global) { syncThreads(); // Write to device global - status = clEnqueueWriteGlobalVariableINTEL( - m_cq, m_program, "kernel15_dev_global", CL_FALSE, strsize, 0, src_ptr, 0, - NULL, &write_event); + status = clEnqueueWriteGlobalVariableINTEL(m_cq, m_program, "dev_global_name", + CL_FALSE, strsize, 0, src_ptr, 0, + NULL, &write_event); CHECK_EQUAL(CL_SUCCESS, status); // Read from device global, with dependence on write event - status = clEnqueueReadGlobalVariableINTEL( - m_cq, m_program, "kernel15_dev_global", CL_FALSE, strsize, 0, src_ptr, 1, - &write_event, &read_event); + status = clEnqueueReadGlobalVariableINTEL(m_cq, m_program, "dev_global_name", + CL_FALSE, strsize, 0, src_ptr, 1, + &write_event, &read_event); CHECK_EQUAL(CL_SUCCESS, status); // Manually set "write device global" event done From 144020cb7700a7af5912b7e1e5edc321481a3329 Mon Sep 17 00:00:00 2001 From: Sherry Yuan Date: Fri, 1 Apr 2022 15:10:08 -0400 Subject: [PATCH 12/12] Fix klocwork and remove commented code. --- src/acl_mem.cpp | 2 ++ test/acl_usm_test.cpp | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/acl_mem.cpp b/src/acl_mem.cpp index 1404fef6..f75a8648 100644 --- a/src/acl_mem.cpp +++ b/src/acl_mem.cpp @@ -445,6 +445,7 @@ CL_API_ENTRY cl_int clEnqueueReadGlobalVariableINTEL( if (status != CL_SUCCESS) { return status; } + assert(kernel); // Look up device global address with use provided name unsigned int device_global_addr_num = 0; @@ -572,6 +573,7 @@ CL_API_ENTRY cl_int clEnqueueWriteGlobalVariableINTEL( if (status != CL_SUCCESS) { return status; } + assert(kernel); // Allocate a temporary device usm pointer to hold user data // This is done to minimize kernel area, as device to device memory diff --git a/test/acl_usm_test.cpp b/test/acl_usm_test.cpp index 04fcdcf0..6e06c5c1 100644 --- a/test/acl_usm_test.cpp +++ b/test/acl_usm_test.cpp @@ -1259,10 +1259,6 @@ MT_TEST(acl_usm, read_device_global) { // Host pointer example void *src_ptr = malloc(strsize); CHECK(src_ptr != NULL); - // Host USM example - // void* src_ptr = clHostMemAllocINTEL(m_context, NULL, strsize, 1, &status); - // CHECK_EQUAL(CL_SUCCESS, status); - // CHECK(src_ptr != NULL); syncThreads(); // Write to device global @@ -1308,8 +1304,6 @@ MT_TEST(acl_usm, read_device_global) { // Host pointer example free(src_ptr); - // Host USM example - // CHECK_EQUAL(CL_SUCCESS, clMemFreeINTEL(m_context, src_ptr)); ACL_LOCKED(acl_print_debug_msg("end read_write_buf\n")); }