Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
64 commits
Select commit Hold shift + click to select a range
eb86cbd
Renamed BlockSizes → ThreadBlockSizes
fthaler Jun 12, 2024
f10f9ad
Add loop blocking to fn GPU backend
fthaler Jun 17, 2024
c616749
Cleanup/refactor
fthaler Jun 18, 2024
44f13fd
Treat compile-time block sizes as compile-time
fthaler Jun 18, 2024
5da7ed2
Merge remote-tracking branch 'origin/master' into loop-blocking
fthaler Jun 18, 2024
b1577c1
Merge remote-tracking branch 'origin/master' into loop-blocking
fthaler Jun 18, 2024
9a5edf1
Minor cleanup
fthaler Jun 18, 2024
90e3a50
Use uint3 instead of dim3
fthaler Jun 18, 2024
f70b885
Silent warning
fthaler Jun 18, 2024
6e77d53
Add __launch_bounds__
fthaler Jun 18, 2024
508852d
Fixed comment
fthaler Jun 18, 2024
bade455
Slightly simpler meta:: calculation
fthaler Jun 18, 2024
1641040
Maybe fix for Clang-CUDA compilation
fthaler Jun 18, 2024
85dbec8
Merge remote-tracking branch 'origin/master' into loop-blocking
fthaler Jun 18, 2024
990555e
Added another ::template
fthaler Jun 18, 2024
ea9e3c9
Make clang-14-cuda-11 happy
fthaler Jun 18, 2024
d3d5228
Fix HIP compilation
fthaler Jun 19, 2024
bba6c78
Some cleanup
fthaler Jun 19, 2024
1b547d0
Fix formatting
fthaler Jun 19, 2024
6bc7bcf
Check types of ThreadBlockSizes and LoopBlockSizes
fthaler Jun 19, 2024
cf22abe
Merge branch 'master' into loop-blocking
fthaler Jun 24, 2024
afb3201
Merge branch 'master' into loop-blocking
fthaler Jun 26, 2024
21e27ea
Faster loop blocking WIP state
fthaler Jul 22, 2024
6be319a
Add const_host_view to const data store
fthaler Jul 22, 2024
ade9e8f
Added sid::make_unrolled_loop
fthaler Jul 22, 2024
0cfff11
Use explicitly unrolled loops in GPU fn backend
fthaler Jul 22, 2024
980933c
Re-enable verification
fthaler Jul 22, 2024
64098e2
Revert some irrelevant changes
fthaler Jul 22, 2024
8a021dc
Revert irrelevant changes to neighbor tables
fthaler Jul 22, 2024
8ab05d1
Revert more irrelevant changes
fthaler Jul 22, 2024
5db3605
Revert compile-time dimensions
fthaler Jul 22, 2024
05361de
Revert irrelevant changes in unstructured backend
fthaler Jul 22, 2024
d8b7b43
Formatting
fthaler Jul 22, 2024
23d2e65
Cleanup nabla stencils
fthaler Jul 22, 2024
2977c39
Fix NVCC warning
fthaler Jul 22, 2024
e2706ad
Re-enable nabla tests
fthaler Jul 22, 2024
2628178
Use vertex_field_id where applicable
fthaler Jul 22, 2024
e43a8b6
Cleanup storage SID adaptor
fthaler Jul 22, 2024
981b30a
Revert irrelevant changes
fthaler Jul 22, 2024
b9d4d34
Increased vertical block size
fthaler Jul 22, 2024
7ea497b
Revert temporary build changes
fthaler Jul 22, 2024
6c4c4f1
Fix missing ;
fthaler Jul 22, 2024
c5b3ad9
Use reasonable unroll factors in loop unrolling tests
fthaler Jul 22, 2024
34fd7e7
Fix missing include
fthaler Jul 23, 2024
e103e15
Merge remote-tracking branch 'upstream/master' into fast-loop-blockin…
fthaler Jul 23, 2024
702fa7c
Only include cuda_type_traits.hpp when required
fthaler Jul 23, 2024
68a2bd9
Enable fast-math on GPUs
fthaler Jul 23, 2024
19af796
Fix capturing of variable
fthaler Jul 23, 2024
8474c66
Disable k-blocking for now
fthaler Jul 23, 2024
69e13e6
Fix accidental use of single precision floats in fn nabla test
fthaler Jul 23, 2024
eb9096c
Updated references
fthaler Jul 23, 2024
e0fcc95
Merge remote-tracking branch 'origin/fix-nabla-float_t' into fast-loo…
fthaler Jul 23, 2024
0996c15
Possible workaround against compiler crash
fthaler Jul 24, 2024
adce6f2
Merge remote-tracking branch 'upstream/master' into fast-loop-blockin…
fthaler Jul 24, 2024
0a29491
Revert "Possible workaround against compiler crash"
fthaler Jul 24, 2024
2e6e3e0
Re-add accidentally lost index check
fthaler Jul 24, 2024
d2a6769
Selectively enable k-blocking
fthaler Jul 24, 2024
bf0ba6c
Merge remote-tracking branch 'upstream/master' into fast-loop-blockin…
fthaler Jul 24, 2024
0b0a306
Fix use of k_blocked_backend_t
fthaler Jul 24, 2024
c591f79
Explicitly use __ldg on pointer derefs
fthaler Jul 24, 2024
7699cb5
Merge remote-tracking branch 'upstream/master' into fast-loop-blockin…
fthaler Sep 2, 2024
07cae69
Fix compilation after merge
fthaler Sep 2, 2024
2a0914f
Revert unrelated changes to master
fthaler Sep 2, 2024
c3edf6f
Merge remote-tracking branch 'upstream/master' into fast-loop-blockin…
fthaler Sep 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions include/gridtools/fn/backend/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,30 @@ namespace gridtools::fn::backend {
meta::rename<tuple, Dims>());
}

// Builds a nest of unrolled loops over the given compile-time list of dimensions.
// The result is a composition: applying it to a functor wraps the functor in one
// unrolled loop per dimension in Dims (folded left-to-right, so the first
// dimension becomes the outermost loop). Loop extents are read at runtime from
// the `sizes` hymap; the per-dimension compile-time unroll factors come from the
// UnrollFactors hymap type (only its type is inspected, hence the unnamed
// parameter).
template <class Dims, class Sizes, class UnrollFactors>
constexpr GT_FUNCTION auto make_unrolled_loops(Sizes const &sizes, UnrollFactors) {
return tuple_util::host_device::fold(
[&](auto outer, auto dim) {
// compile-time unroll factor for this dimension, looked up by key in UnrollFactors
using unroll_factor = std::remove_reference_t<decltype(host_device::at_key<decltype(dim)>(
std::declval<UnrollFactors const &>()))>;
// compose: the unrolled loop for `dim` nests inside all loops accumulated so far
return [outer = std::move(outer),
inner = sid::make_unrolled_loop<decltype(dim), unroll_factor::value>(
host_device::at_key<decltype(dim)>(sizes))](
auto &&...args) { return outer(inner(std::forward<decltype(args)>(args)...)); };
},
// fold seed is the identity: with no dimensions the functor is returned unchanged
host_device::identity(),
meta::rename<tuple, Dims>());
}

// Convenience overload of make_loops: builds one loop per dimension present in
// the Sizes hymap, using its keys as the dimension list.
template <class Sizes>
constexpr GT_FUNCTION auto make_loops(Sizes const &sizes) {
return make_loops<get_keys<Sizes>>(sizes);
}

// Convenience overload of make_unrolled_loops: builds one unrolled loop per
// dimension present in the Sizes hymap, using its keys as the dimension list.
template <class Sizes, class UnrollFactors>
constexpr GT_FUNCTION auto make_unrolled_loops(Sizes const &sizes, UnrollFactors unroll_factors) {
return make_unrolled_loops<get_keys<Sizes>>(sizes, unroll_factors);
}
} // namespace common

template <class T>
Expand Down
208 changes: 132 additions & 76 deletions include/gridtools/fn/backend/gpu.hpp

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions include/gridtools/sid/loop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <utility>

#include "../common/defs.hpp"
#include "../common/for_each.hpp"
#include "../common/functional.hpp"
#include "../common/host_device.hpp"
#include "../common/integral_constant.hpp"
Expand Down Expand Up @@ -637,6 +638,30 @@ namespace gridtools {
return {};
}

/**
 * Creates a loop along dimension `Key` that is explicitly unrolled by the
 * compile-time factor `UnrollFactor`.
 *
 * Calling semantics match make_loop<Key>(num_steps, step): the returned object
 * takes a functor and yields a callable (ptr, strides) that invokes the functor
 * `num_steps` times, shifting `ptr` by `step` along `Key` between invocations,
 * and restores `ptr` to its original position afterwards. The iteration space
 * is split into an unrolled main part of num_steps / UnrollFactor chunks (each
 * chunk performs UnrollFactor functor calls in a row) followed by an epilogue
 * of num_steps % UnrollFactor plain iterations.
 */
template <class Key, int UnrollFactor, class NumSteps, class Step = integral_constant<int, 1>>
constexpr GT_FUNCTION auto make_unrolled_loop(NumSteps num_steps, Step step = {}) {
    using u = integral_constant<int, UnrollFactor>;
    return [step,
               unrolled = make_loop<Key>(num_steps / u(), step * u()),
               epilogue = make_loop<Key>(num_steps % u(), step),
               // offset of the first epilogue iteration relative to the loop start
               epilogue_start = step * ((num_steps / u()) * u())](auto &&fun) {
        // NOTE: `fun` must not be forwarded into the first init-capture below:
        // init-captures are initialized in declaration order, so forwarding (and
        // thus potentially moving from) `fun` there would hand a moved-from
        // functor to the epilogue. Capture a copy first; forward only on the
        // last use.
        return [unrolled = unrolled([step, fun](auto &&ptr, auto const &strides) {
                    // UnrollFactor explicit repetitions of the loop body
                    ::gridtools::host_device::for_each<meta::make_indices_c<UnrollFactor>>([&](auto) {
                        fun(std::forward<decltype(ptr)>(ptr), strides);
                        shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), step);
                    });
                    // undo the shifts of this chunk; the enclosing make_loop
                    // advances ptr by step * UnrollFactor itself
                    shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), -step * u());
                }),
                   epilogue = epilogue(std::forward<decltype(fun)>(fun)),
                   epilogue_start](auto &&ptr, auto const &strides) {
            unrolled(std::forward<decltype(ptr)>(ptr), strides);
            shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), epilogue_start);
            epilogue(std::forward<decltype(ptr)>(ptr), strides);
            // restore ptr so the whole loop has no net effect on it
            shift(std::forward<decltype(ptr)>(ptr), get_stride<Key>(strides), -epilogue_start);
        };
    };
}

/**
* A helper that allows to use `SID`s with C++11 range based loop
*
Expand Down
1 change: 1 addition & 0 deletions jenkins/envs/daint_nvcc_cray.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ module switch cudatoolkit/11.2.0_3.39-2.1__gf93aa1c
module switch cce/10.0.2

export GTCMAKE_GT_CLANG_CUDA_MODE=NVCC-CUDA
export GTCMAKE_CMAKE_CUDA_FLAGS_RELEASE='-O3 -DNDEBUG --use_fast_math'

export CTEST_PARALLEL_LEVEL=1

1 change: 1 addition & 0 deletions jenkins/envs/daint_nvcc_gcc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ export CUDAHOSTCXX="$CXX"

export GTCMAKE_CMAKE_CXX_FLAGS='-march=haswell'
export GTCMAKE_CMAKE_CXX_FLAGS_RELEASE='-Ofast -DNDEBUG'
export GTCMAKE_CMAKE_CUDA_FLAGS_RELEASE='-O3 -DNDEBUG --use_fast_math'

export CTEST_PARALLEL_LEVEL=1
18 changes: 10 additions & 8 deletions tests/include/fn_select.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ namespace {
gridtools::integral_constant>,
gridtools::meta::list<gridtools::integral_constant<int, sizes>...>>;

using fn_backend_t = gridtools::fn::backend::gpu<block_sizes_t<32, 8, 1>>;
using fn_backend_t = gridtools::fn::backend::gpu<block_sizes_t<32, 8, 1>, block_sizes_t<1, 1, 1>>;
} // namespace
#endif

Expand All @@ -68,17 +68,19 @@ namespace gridtools::fn::backend {
return "naive";
}
} // namespace naive_impl_
using naive_impl_::naive_with_threadpool;

namespace gpu_impl_ {
template <class>
template <class, class>
struct gpu;
template <class BlockSizes>
storage::gpu backend_storage_traits(gpu<BlockSizes>);
template <class BlockSizes>
timer_cuda backend_timer_impl(gpu<BlockSizes>);
template <class BlockSizes>
inline char const *backend_name(gpu<BlockSizes> const &) {
template <class ThreadBlockSizes, class LoopBlockSizes>
storage::gpu backend_storage_traits(gpu<ThreadBlockSizes, LoopBlockSizes>);
template <class ThreadBlockSizes, class LoopBlockSizes>
timer_cuda backend_timer_impl(gpu<ThreadBlockSizes, LoopBlockSizes>);
template <class ThreadBlockSizes, class LoopBlockSizes>
inline char const *backend_name(gpu<ThreadBlockSizes, LoopBlockSizes> const &) {
return "gpu";
}
} // namespace gpu_impl_
using gpu_impl_::gpu;
} // namespace gridtools::fn::backend
18 changes: 14 additions & 4 deletions tests/regression/fn/fn_unstructured_nabla.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,12 @@ namespace {
static constexpr int vertex_field_id = 0;
static constexpr int edge_field_id = 1;

using k_blocked_backend_t = meta::if_<meta::is_instantiation_of<backend::gpu, fn_backend_t>,
backend::gpu<meta::list<meta::list<integral_constant<int, 0>, integral_constant<int, 32>>,
meta::list<integral_constant<int, 1>, integral_constant<int, 8>>>,
meta::list<meta::list<integral_constant<int, 1>, integral_constant<int, 5>>>>,
fn_backend_t>;

constexpr inline auto make_comp = [](auto backend, auto const &mesh, auto &nabla) {
using mesh_t = std::remove_reference_t<decltype(mesh)>;
using float_t = typename mesh_t::float_t;
Expand Down Expand Up @@ -253,19 +259,21 @@ namespace {
TypeParam::benchmark("fn_unstructured_nabla_field_of_tuples", comp);
}

GT_REGRESSION_TEST(fn_unstructured_nabla_fused_field_of_tuples, test_environment<>, fn_backend_t) {
GT_REGRESSION_TEST(fn_unstructured_nabla_fused_field_of_tuples, test_environment<>, k_blocked_backend_t) {
using float_t = typename TypeParam::float_t;

auto mesh = TypeParam::fn_unstructured_mesh();
auto nabla = mesh.template make_storage<tuple<float_t, float_t>>(mesh.nvertices(), mesh.nlevels());
auto comp = make_comp_fused(fn_backend_t(), mesh, nabla);
auto comp = make_comp_fused(k_blocked_backend_t(), mesh, nabla);
comp();
auto expected = make_expected(mesh);
TypeParam::verify(expected, nabla);
TypeParam::benchmark("fn_unstructured_nabla_fused_field_of_tuples", comp);
}

GT_REGRESSION_TEST(fn_unstructured_nabla_tuple_of_fields, test_environment<>, fn_backend_t) {
using float_t = typename TypeParam::float_t;

auto mesh = TypeParam::fn_unstructured_mesh();
auto nabla0 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
auto nabla1 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
Expand All @@ -280,14 +288,16 @@ namespace {
TypeParam::benchmark("fn_unstructured_nabla_tuple_of_fields", comp);
}

GT_REGRESSION_TEST(fn_unstructured_nabla_fused_tuple_of_fields, test_environment<>, fn_backend_t) {
GT_REGRESSION_TEST(fn_unstructured_nabla_fused_tuple_of_fields, test_environment<>, k_blocked_backend_t) {
using float_t = typename TypeParam::float_t;

auto mesh = TypeParam::fn_unstructured_mesh();
auto nabla0 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
auto nabla1 = mesh.template make_storage<float_t, vertex_field_id>(mesh.nvertices(), mesh.nlevels());
auto nabla =
sid::composite::keys<integral_constant<int, 0>, integral_constant<int, 1>>::make_values(nabla0, nabla1);

auto comp = make_comp_fused(fn_backend_t(), mesh, nabla);
auto comp = make_comp_fused(k_blocked_backend_t(), mesh, nabla);
comp();
auto expected = make_expected(mesh);
TypeParam::verify([&](int vertex, int k) { return get<0>(expected(vertex, k)); }, nabla0);
Expand Down
43 changes: 32 additions & 11 deletions tests/unit_tests/fn/test_fn_backend_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,16 @@ namespace gridtools::fn::backend {

column_stage<int_t<1>, sum_scan, 0, 1> cs;

using block_sizes_t = meta::list<meta::list<int_t<0>, int_t<4>>, meta::list<int_t<2>, int_t<2>>>;
using thread_block_sizes_t = meta::list<meta::list<int_t<0>, int_t<4>>, meta::list<int_t<2>, int_t<2>>>;
using loop_block_sizes_t = meta::list<meta::list<int_t<0>, int_t<1>>, meta::list<int_t<2>, int_t<2>>>;

apply_column_stage(
gpu<block_sizes_t>(), sizes, cs, make_iterator_mock(), composite, int_t<1>(), tuple(42, 1));
apply_column_stage(gpu<thread_block_sizes_t, loop_block_sizes_t>(),
sizes,
cs,
make_iterator_mock(),
composite,
int_t<1>(),
tuple(42, 1));

cudaMemcpy(outh, out.get(), 5 * 7 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < 5; ++i)
Expand Down Expand Up @@ -100,10 +106,16 @@ namespace gridtools::fn::backend {

column_stage<int_t<0>, sum_scan, 0, 1> cs;

using block_sizes_t = meta::list<meta::list<int_t<0>, int_t<4>>, meta::list<int_t<2>, int_t<2>>>;
using thread_block_sizes_t = meta::list<meta::list<int_t<0>, int_t<4>>, meta::list<int_t<2>, int_t<2>>>;
using loop_block_sizes_t = meta::list<meta::list<int_t<0>, int_t<2>>, meta::list<int_t<2>, int_t<2>>>;

apply_column_stage(
gpu<block_sizes_t>(), sizes, cs, make_iterator_mock(), composite, int_t<0>(), tuple(42, 1));
apply_column_stage(gpu<thread_block_sizes_t, loop_block_sizes_t>(),
sizes,
cs,
make_iterator_mock(),
composite,
int_t<0>(),
tuple(42, 1));

cudaMemcpy(outh, out.get(), 5 * sizeof(int), cudaMemcpyDeviceToHost);
int res = 42;
Expand Down Expand Up @@ -139,13 +151,22 @@ namespace gridtools::fn::backend {

column_stage<int_t<1>, sum_scan, 0, 1> cs;

using block_sizes_t = meta::list<meta::list<int_t<0>, int_t<4>>,
using thread_block_sizes_t = meta::list<meta::list<int_t<0>, int_t<4>>,
meta::list<int_t<2>, int_t<2>>,
meta::list<int_t<3>, int_t<2>>,
meta::list<int_t<4>, int_t<1>>>;
using loop_block_sizes_t = meta::list<meta::list<int_t<0>, int_t<2>>,
meta::list<int_t<2>, int_t<2>>,
meta::list<int_t<3>, int_t<2>>,
meta::list<int_t<4>, int_t<2>>>;

apply_column_stage(
gpu<block_sizes_t>(), sizes, cs, make_iterator_mock(), composite, int_t<1>(), tuple(42, 1));
apply_column_stage(gpu<thread_block_sizes_t, loop_block_sizes_t>(),
sizes,
cs,
make_iterator_mock(),
composite,
int_t<1>(),
tuple(42, 1));

cudaMemcpy(outh, out.get(), 5 * 7 * 3 * 2 * 3 * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < 5; ++i)
Expand Down Expand Up @@ -197,8 +218,8 @@ namespace gridtools::fn::backend {
};

TEST(backend_gpu, global_tmp) {
using block_sizes_t = meta::list<meta::list<int_t<0>, int_t<4>>, meta::list<int_t<2>, int_t<2>>>;
auto alloc = tmp_allocator(gpu<block_sizes_t>());
using thread_block_sizes_t = meta::list<meta::list<int_t<0>, int_t<4>>, meta::list<int_t<2>, int_t<2>>>;
auto alloc = tmp_allocator(gpu<thread_block_sizes_t>());
auto sizes = hymap::keys<int_t<0>, int_t<1>, int_t<2>>::values<int_t<5>, int_t<7>, int_t<3>>();
auto tmp = allocate_global_tmp(alloc, sizes, data_type<int>());
static_assert(sid::is_sid<decltype(tmp)>());
Expand Down
29 changes: 29 additions & 0 deletions tests/unit_tests/sid/test_sid_loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,35 @@ namespace gridtools {
EXPECT_EQ(88, data[i][j]) << " i:" << i << ", j:" << j;
}

// Smoke test for sid::make_unrolled_loop: an unrolled loop must visit exactly
// the same elements as a plain loop with the same extent and step. Covers
// runtime and compile-time extents, negative steps, unroll factors that do and
// do not divide the extent evenly, nesting, and r-value pointers.
// (assignment_f{v} presumably writes v through the iteration pointer — the
// expectations below rely on that.)
TEST(make_unrolled_loop, smoke) {
double data[10][10] = {};
auto strides = tuple(10_c, 1_c);

// runtime extent 5 with unroll factor 2: odd extent exercises the epilogue
double *ptr = &data[0][0];
sid::make_unrolled_loop<i_t, 2>(5, 1)(assignment_f{42})(ptr, strides);
for (int i = 0; i < 5; ++i)
EXPECT_EQ(42, data[i][0]) << " i:" << i;

// negative compile-time step: walks j backwards from data[2][3] down to data[2][0]
ptr = &data[2][3];
sid::make_unrolled_loop<j_t, 3>(4_c, -1_c)(assignment_f{5})(ptr, strides);
for (int i = 0; i < 4; ++i)
EXPECT_EQ(5, data[2][i]) << " i:" << i;

// nested unrolled loops covering the full 10x10 array
// (inner factor 7 does not divide 10, so the inner epilogue runs as well)
ptr = &data[0][0];
sid::make_unrolled_loop<i_t, 2>(10_c)(sid::make_unrolled_loop<j_t, 7>(10_c)(assignment_f{88}))(
ptr, strides);
for (int i = 0; i < 10; ++i)
for (int j = 0; j < 10; ++j)
EXPECT_EQ(88, data[i][j]) << " i:" << i << ", j:" << j;

// pass ptr as r-value
sid::make_unrolled_loop<i_t, 3>(10_c)(sid::make_unrolled_loop<j_t, 1>(10_c)(assignment_f{88}))(
&data[0][0], strides);
for (int i = 0; i < 10; ++i)
for (int j = 0; j < 10; ++j)
EXPECT_EQ(88, data[i][j]) << " i:" << i << ", j:" << j;
}

TEST(nest_loops, smoke) {
double data[10][10] = {};
double *ptr = &data[0][0];
Expand Down