diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..336c011 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,112 @@ +#################################################################################### +# START 1. Basic setup for cmake +#################################################################################### + +# basic setup for cmake +cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + +if(POLICY CMP0074) + cmake_policy(SET CMP0074 NEW) +endif() + +set(CMAKE_INCLUDE_CURRENT_DIR ON) +set(CMAKE_INCLUDE_DIRECTORIES_PROJECT_BEFORE ON) +set(CMAKE_COLOR_MAKEFILE ON) +set(CMAKE_CXX_STANDARD_REQUIRED True) +# Disable gnu exentions +set(CMAKE_CXX_EXTENSIONS ON) + +# Define the project +project("dedisp" VERSION 1.0.0 LANGUAGES CXX CUDA C) + +# DEDISP may be built to run using CUDA. Future version may be +# written for HIP or SYCL, which we call the +# Target type. By default, the target is CUDA. +if(DEFINED ENV{DEDISP_TARGET}) + set(DEFTARGET $ENV{DEDISP_TARGET}) +else() + set(DEFTARGET "CUDA") +endif() + +set(VALID_TARGET_TYPES CUDA) #HIP SYCL +set(DEDISP_TARGET_TYPE + "${DEFTARGET}" + CACHE STRING "Choose the type of target, options are: ${VALID_TARGET_TYPES}") +set_property(CACHE DEDISP_TARGET_TYPE PROPERTY STRINGS CUDA) + +# CUDA specific part of CMakeLists +#set(CMAKE_CUDA_EXTENSIONS OFF) +find_package(CUDAToolkit REQUIRED) + +string(TOUPPER ${DEDISP_TARGET_TYPE} CHECK_TARGET_TYPE) +list(FIND VALID_TARGET_TYPES ${CHECK_TARGET_TYPE} TARGET_TYPE_VALID) + +if(TARGET_TYPE_VALID LESS 0) + message(SEND_ERROR "Please specify a valid DEDISP_TARGET_TYPE type! Valid target types are:" "${VALID_TARGET_TYPES}") +endif() + +# Git +find_package(Git) +if(GIT_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} show + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + RESULT_VARIABLE IS_GIT_REPOSIITORY + OUTPUT_QUIET ERROR_QUIET) + if(${IS_GIT_REPOSIITORY} EQUAL 0) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --abbrev=0 + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GITTAG + OUTPUT_STRIP_TRAILING_WHITESPACE) + # we use git rev-list and pipe that through wc here. Newer git versions support --count as option to rev-list but + # that might not always be available + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list ${GITTAG}..HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + COMMAND wc -l + OUTPUT_VARIABLE GITCOUNT + OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --match 1 --always --long --dirty + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GITVERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() +endif(GIT_FOUND) + +option(DEDISP_TEXTURE "Use texture support (reference/obj)" ON) +add_compile_definitions(DEDISP_USE_TEXTURE=${DEDISP_USE_TEXTURE} DEDISP_TEXTURE) + +option(DEDISP_BUILD_TESTS "Build test suite" ON) +option(BUILD_SHARED_LIBS "Build using shared libraries" OFF) + +# Print the configuration details to stdout +message(STATUS "") +message(STATUS "${PROJECT_NAME} ${PROJECT_VERSION} (${GITVERSION}) **") +message(STATUS "cmake version: ${CMAKE_VERSION}") +message(STATUS "Source location: ${CMAKE_SOURCE_DIR}") +message(STATUS "Build location: ${CMAKE_BINARY_DIR}") +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") +message(STATUS "DEDISP target: ${DEDISP_TARGET_TYPE}") +message(STATUS "DEDISP texture: ${DEDISP_TEXTURE}") +message(STATUS "DEDISP build tests: ${DEDISP_BUILD_TESTS}") +message(STATUS "DEDISP build shared libs: ${BUILD_SHARED_LIBS}") + +# Add src, tests +add_subdirectory(src) + +if(DEDISP_BUILD_TESTS) + add_subdirectory(example) +endif() + +# Install project cmake targets +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + ${PROJECT_NAME}-config-version.cmake + VERSION ${DEDISP_VERSION} + COMPATIBILITY AnyNewerVersion +) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake + DESTINATION ${CMAKE_INSTALL_PREFIX}/cmake/${PROJECT_NAME} +) diff --git a/README.md b/README.md index 440becf..d606133 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,38 @@ # dedisp This repositry is derived from Ben Barsdell's original GPU De-dedispersion library (code.google.com/p/dedisp) -Installation Instructions: - - 1. git clone https://github.com/ajameson/dedisp.git - 2. Update Makefile.inc with your CUDA path, Install Dir and GPU architecture. e.g. - * CUDA_PATH ?= /usr/local/cuda-8.0.61 - * INSTALL_DIR = $(HOME)/opt/dedisp - * GPU_ARCH = sm_60 - 3. make && make install +Installation Instructions (CMake): + +## Using command line CMake + + 1. `git clone https://github.com/ajameson/dedisp.git` + 2. `mkdir build; cd build` + +Using command line CMake, choose relevant ON/OFF values, ## two digit CUDA architecture, +and optional install path + + 3. `cmake -DDEDISP_USE_TEXTURE=ON \ + -DDEDISP_BUILD_TESTS=ON \ + -DBUILD_SHARED_LIBS=ON \ + -DCMAKE_CUDA_ARCHITECTURES=70 \ + -DCMAKE_INSTALL_PREFIX="/path/to/install" \ + -DCMAKE_CUDA_COMPILER="/path/to/nvcc" ../dedisp` + +Alternatively, use the CMake GUI via + + 3. `ccmake ../dedisp` + +Last, make (and install) the componenets + + 4. `make -j install` + +## Using Makefile + + 1. `git clone https://github.com/ajameson/dedisp.git` + 2. Update `Makefile.inc` with your CUDA path, Install Dir and GPU architecture. e.g. + * `CUDA_PATH ?= /usr/local/cuda-12.4` + * `INSTALL_DIR = $(HOME)/opt/dedisp` + * `GPU_ARCH = sm_60` + 3. `make && make install` - This will build a shared object library named libdedisp.so which is a prerequisite for Heimdall. The dedisp header files will be installed into INSTALL_DIR/include and the library into INSTALL_DIR/lib. + Either of these will build a shared object library named `libdedisp.so` which is a prerequisite for Heimdall. The dedisp header files will be installed into `INSTALL_DIR/include`, the test and runtime library into `INSTALL_DIR/lib`. The CMake method will build static libraries `libdedisp.a, libdedisp_test.a` or a shared libraries `libdedisp.so, libdedisp_test.so` via the CMake variable `BUILD_SHARED_LIBS`. Install locations are unchanged, with the additions that the test executable will be placed into `INSTALL_DIR/bin` and `libdedisp_test.a/so` will be placed in `INSTALL_DIR/lib`. CMake's usual boiler plate configuration files are in the CMake default paths. diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt new file mode 100644 index 0000000..cfd8c01 --- /dev/null +++ b/example/CMakeLists.txt @@ -0,0 +1,19 @@ +include_directories(${PROJECT_SOURCE_DIR}/src) + +set (DEDISP_TEST_OBJS + gasdev.c + ran1.c + ) + +# generate libs +add_library(dedisp_test ${DEDISP_TEST_OBJS}) +set_target_properties(dedisp_test PROPERTIES POSITION_INDEPENDENT_CODE ${BUILD_SHARED_LIBS}) + +add_executable(testdedisp testdedisp.c) +target_link_libraries(testdedisp PUBLIC dedisp dedisp_test -lm) + +# Install library +install(TARGETS dedisp_test LIBRARY DESTINATION lib) + +# Install executables +install(TARGETS testdedisp RUNTIME DESTINATION bin) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..e298220 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,19 @@ +# Object files +set (DEDISP_OBJS + dedisp.cu + ) + +# Headers +set(DEDISP_HEADERS + dedisp.h + ) + +# generate libs +add_library(dedisp ${DEDISP_OBJS}) +set_target_properties(dedisp PROPERTIES POSITION_INDEPENDENT_CODE ${BUILD_SHARED_LIBS}) + +# Install headers +install(FILES ${DEDISP_HEADERS} DESTINATION include) + +# Install lib +install(TARGETS dedisp LIBRARY DESTINATION lib) diff --git a/src/kernels.cuh b/src/kernels.cuh index 6014aad..d6758b9 100644 --- a/src/kernels.cuh +++ b/src/kernels.cuh @@ -28,8 +28,8 @@ // CUDA deprecations #include -#if CUDART_VERSION < 12000 -#define DEDISP_HAVE_TEXTURE_SUPPORT +#if (CUDART_VERSION < 12000) && defined (DEDISP_USE_TEXTURE) +#define DEDISP_HAVE_LEGACY_TEXTURE_SUPPORT #endif // Kernel tuning parameters @@ -40,7 +40,7 @@ __constant__ dedisp_float c_delay_table[DEDISP_MAX_NCHANS]; __constant__ dedisp_bool c_killmask[DEDISP_MAX_NCHANS]; -#ifdef DEDISP_HAVE_TEXTURE_SUPPORT +#if defined (DEDISP_HAVE_LEGACY_TEXTURE_SUPPORT) // Texture reference for input data texture t_in; #endif @@ -164,8 +164,7 @@ void set_out_val(dedisp_byte* d_out, dedisp_size idx, // E.g., Words bracketed: (t0c0,t0c1,t0c2,t0c3), (t1c0,t1c1,t1c2,t1c3),... // Note: out_stride should be in units of samples template + int BLOCK_DIM_X, int BLOCK_DIM_Y> __global__ void dedisperse_kernel(const dedisp_word* d_in, dedisp_size nsamps, @@ -184,9 +183,15 @@ void dedisperse_kernel(const dedisp_word* d_in, dedisp_size batch_in_stride, dedisp_size batch_dm_stride, dedisp_size batch_chan_stride, - dedisp_size batch_out_stride) + dedisp_size batch_out_stride +#if !defined (DEDISP_HAVE_LEGACY_TEXTURE_SUPPORT) && defined (DEDISP_USE_TEXTURE) + // Add explicit texture arg + , cudaTextureObject_t t_in +#endif + ) + { - // Compute compile-time constants + // Compute compile-time constants enum { BITS_PER_BYTE = 8, CHANS_PER_WORD = sizeof(dedisp_word) * BITS_PER_BYTE / IN_NBITS @@ -241,36 +246,25 @@ void dedisperse_kernel(const dedisp_word* d_in, // Compute the integer delay dedisp_size delay = __float2uint_rn(dm * frac_delay); -#ifdef DEDISP_HAVE_TEXTURE_SUPPORT - if( USE_TEXTURE_MEM ) { // Pre-Fermi path - // Loop over samples per thread - // Note: Unrolled to ensure the sum[] array is stored in regs - #pragma unroll - for( dedisp_size s=0; s(sample,chan_sub)); - } - } - else + // Loop over samples per thread + // Note: Unrolled to ensure the sum[] array is stored in regs +#pragma unroll + for( dedisp_size s=0; s(t_in, offset+s + delay); +#elif !defined (DEDISP_USE_TEXTURE) + dedisp_word sample = d_in[offset + s + delay]; +#else +#error "Suspect texture preprcessor definitions" #endif - { // Fermi path - // Note: Unrolled to ensure the sum[] array is stored in regs - #pragma unroll - for( dedisp_size s=0; s(sample, chan_sub); - } + // Extract the desired subword and accumulate + sum[s] += + // TODO: Pre-Fermi cards are faster with 24-bit mul + /*__umul24*/(c_killmask[chan_idx] *//, + extract_subword(sample,chan_sub)); } } } @@ -315,20 +309,6 @@ void dedisperse_kernel(const dedisp_word* d_in, } // End of DM loop } -bool check_use_texture_mem() { -#ifdef DEDISP_HAVE_TEXTURE_SUPPORT - // Decides based on GPU architecture - int device_idx; - cudaGetDevice(&device_idx); - cudaDeviceProp device_props; - cudaGetDeviceProperties(&device_props, device_idx); - // Fermi runs worse with texture mem - bool use_texture_mem = (device_props.major < 2); - return use_texture_mem; -#else - return false; -#endif -} bool dedisperse(const dedisp_word* d_in, dedisp_size in_stride, @@ -356,32 +336,44 @@ bool dedisperse(const dedisp_word* d_in, MAX_CUDA_GRID_SIZE_Y = 65535, MAX_CUDA_1D_TEXTURE_SIZE = (1<<27) }; - -#ifdef DEDISP_HAVE_TEXTURE_SUPPORT - // Initialise texture memory if necessary +#if defined (DEDISP_USE_TEXTURE) + // Initialise texture memory // -------------------------------------- - // Determine whether we should use texture memory - bool use_texture_mem = check_use_texture_mem(); - if( use_texture_mem ) { - dedisp_size chans_per_word = sizeof(dedisp_word)*BITS_PER_BYTE / in_nbits; - dedisp_size nchan_words = nchans / chans_per_word; - dedisp_size input_words = in_stride * nchan_words; - - // Check the texture size limit - if( input_words > MAX_CUDA_1D_TEXTURE_SIZE ) { - return false; - } - // Bind the texture memory - cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); - cudaBindTexture(0, t_in, d_in, channel_desc, - input_words * sizeof(dedisp_word)); + dedisp_size chans_per_word = sizeof(dedisp_word)*BITS_PER_BYTE / in_nbits; + dedisp_size nchan_words = nchans / chans_per_word; + dedisp_size input_words = in_stride * nchan_words; + + // Check the texture size limit + if( input_words > MAX_CUDA_1D_TEXTURE_SIZE ) { + return false; + } + // Bind the texture memory + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); +#ifdef DEDISP_HAVE_LEGACY_TEXTURE_SUPPORT + // Bind texture reference (legacy) + cudaBindTexture(0, t_in, d_in, channel_desc, input_words * sizeof(dedisp_word)); +#else + // Create texture object + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeLinear; + resDesc.res.linear.devPtr = (dedisp_word*)d_in; + resDesc.res.linear.desc = channel_desc; + resDesc.res.linear.sizeInBytes = input_words * sizeof(dedisp_word); + + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + texDesc.readMode = cudaReadModeElementType; + + cudaTextureObject_t t_in=0; + cudaCreateTextureObject(&t_in, &resDesc, &texDesc, NULL); +#endif #ifdef DEDISP_DEBUG - cudaError_t cuda_error = cudaGetLastError(); - if( cuda_error != cudaSuccess ) { - return false; - } -#endif // DEDISP_DEBUG + cudaError_t cuda_error = cudaGetLastError(); + if( cuda_error != cudaSuccess ) { + return false; } +#endif // DEDISP_DEBUG #endif // -------------------------------------- @@ -413,58 +405,71 @@ bool dedisperse(const dedisp_word* d_in, cudaStream_t stream = 0; // Execute the kernel -#define DEDISP_CALL_KERNEL(NBITS, USE_TEXTURE_MEM) \ - dedisperse_kernel \ - <<>>(d_in, \ - nsamps, \ - nsamps_reduced, \ - nsamp_blocks, \ - in_stride, \ - dm_count, \ - dm_stride, \ - ndm_blocks, \ - nchans, \ - chan_stride, \ - d_out, \ - out_nbits, \ - out_stride, \ - d_dm_list, \ - batch_in_stride, \ - batch_dm_stride, \ - batch_chan_stride, \ - batch_out_stride) - // Note: Here we dispatch dynamically on nbits for supported values -#ifdef DEDISP_HAVE_TEXTURE_SUPPORT - if( use_texture_mem ) { - switch( in_nbits ) { - case 1: DEDISP_CALL_KERNEL(1,true); break; - case 2: DEDISP_CALL_KERNEL(2,true); break; - case 4: DEDISP_CALL_KERNEL(4,true); break; - case 8: DEDISP_CALL_KERNEL(8,true); break; - case 16: DEDISP_CALL_KERNEL(16,true); break; - case 32: DEDISP_CALL_KERNEL(32,true); break; - default: /* should never be reached */ break; - } - } - else +#if defined (DEDISP_HAVE_LEGACY_TEXTURE_SUPPORT) || !defined (DEDISP_USE_TEXTURE) +#define DEDISP_CALL_KERNEL(NBITS) \ + dedisperse_kernel \ + <<>>(d_in, \ + nsamps, \ + nsamps_reduced, \ + nsamp_blocks, \ + in_stride, \ + dm_count, \ + dm_stride, \ + ndm_blocks, \ + nchans, \ + chan_stride, \ + d_out, \ + out_nbits, \ + out_stride, \ + d_dm_list, \ + batch_in_stride, \ + batch_dm_stride, \ + batch_chan_stride, + batch_out_stride) +#else +#define DEDISP_CALL_KERNEL(NBITS) \ + dedisperse_kernel \ + <<>>(d_in, \ + nsamps, \ + nsamps_reduced, \ + nsamp_blocks, \ + in_stride, \ + dm_count, \ + dm_stride, \ + ndm_blocks, \ + nchans, \ + chan_stride, \ + d_out, \ + out_nbits, \ + out_stride, \ + d_dm_list, \ + batch_in_stride, \ + batch_dm_stride, \ + batch_chan_stride, \ + batch_out_stride, \ + t_in) #endif - { - switch( in_nbits ) { - case 1: DEDISP_CALL_KERNEL(1,false); break; - case 2: DEDISP_CALL_KERNEL(2,false); break; - case 4: DEDISP_CALL_KERNEL(4,false); break; - case 8: DEDISP_CALL_KERNEL(8,false); break; - case 16: DEDISP_CALL_KERNEL(16,false); break; - case 32: DEDISP_CALL_KERNEL(32,false); break; - default: /* should never be reached */ break; - } + + switch( in_nbits ) { + case 1: DEDISP_CALL_KERNEL(1); break; + case 2: DEDISP_CALL_KERNEL(2); break; + case 4: DEDISP_CALL_KERNEL(4); break; + case 8: DEDISP_CALL_KERNEL(8); break; + case 16: DEDISP_CALL_KERNEL(16); break; + case 32: DEDISP_CALL_KERNEL(32); break; + default: /* should never be reached */ break; } #undef DEDISP_CALL_KERNEL - - // Check for kernel errors + +#if defined (DEDISP_USE_TEXTURE) && !defined (DEDISP_HAVE_LEGACY_TEXTURE_SUPPORT) +cudaDestroyTextureObject(t_in); +#endif + +// Check for kernel errors #ifdef DEDISP_DEBUG - //cudaStreamSynchronize(stream); +//cudaStreamSynchronize(stream); cudaDeviceSynchronize(); cudaError_t cuda_error = cudaGetLastError(); if( cuda_error != cudaSuccess ) {