From a924e747ad853296eaaee22d0c7cf15974eee302 Mon Sep 17 00:00:00 2001
From: Leon <asimay_y@126.com>
Date: Fri, 11 Aug 2023 17:32:24 +0800
Subject: [PATCH 1/3] Update main.cu

Some optimizations for the main.cu code.
1. Add __device__ and __restrict decorations for uniform style and performance.
2. Change the call cudaMemcpyToSymbol(cFoo, &bar, sizeof(int)); to cudaMemcpyToSymbol(&cFoo, &bar, sizeof(int)); because I believe the first parameter should be a pointer type.
3. Hope you can send me your book. :)
---
 06_MemoryBasics/src/main.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/06_MemoryBasics/src/main.cu b/06_MemoryBasics/src/main.cu
index 216cd2b..1d99ed9 100644
--- a/06_MemoryBasics/src/main.cu
+++ b/06_MemoryBasics/src/main.cu
@@ -3,7 +3,7 @@
 
 
 // Declaration of a device variable in constant memory
-__constant__ int cFoo;
+__device__ __constant__ int cFoo;
 
 __global__ void ReadConstantMemory()
 {
@@ -23,14 +23,14 @@ __global__ void WriteGlobalMemory(int* __restrict dOutPtr)
     *dOutPtr = dFoo * dFoo;
 }
 
-__device__ void WriteAndPrintSharedMemory(int* sFoo)
+__device__ void WriteAndPrintSharedMemory(int* __restrict sFoo)
 {
     // Write a computed result to shared memory for other threads to see
     sFoo[threadIdx.x] = 42 * (threadIdx.x + 1);
     // We make sure that no thread prints while the other still writes (parallelism!)
     __syncwarp();
     // Print own computed result and result by neighbor
-    printf("ThreadID: %d, sFoo[0]: %d, sFoo[1]: %d\n", threadIdx.x, sFoo[0], sFoo[1]);
+    printf("ThreadID: %d, sFoo[%d]: %d \n", threadIdx.x, threadIdx.x, sFoo[threadIdx.x]);
 }
 
 __global__ void WriteAndPrintSharedMemoryFixed()
@@ -73,7 +73,7 @@ int main()
      GPU memory. Can be updated with cudaMemcpyToSymbol.
      This syntax is unusual, but this is how it should be
     */
-    cudaMemcpyToSymbol(cFoo, &bar, sizeof(int));
+    cudaMemcpyToSymbol(&cFoo, &bar, sizeof(int));
     ReadConstantMemory<<<1, 1>>>();
     cudaDeviceSynchronize();
 
@@ -140,4 +140,4 @@ a syncwarp, so that other threads may fail to see it. You might need a block
 size larger than 32 threads for this to happen and you may have to let the writing 
 thread do some "fake" work to delay its write to shared memory. Or it may work
 immediately :) A solution should be provided by the following code sample.
-*/
\ No newline at end of file
+*/

From 9fc25076a3e5b3b0207f5d585373785e276b8e55 Mon Sep 17 00:00:00 2001
From: Leon <asimay_y@126.com>
Date: Fri, 11 Aug 2023 18:04:50 +0800
Subject: [PATCH 2/3] Update main.cu

Remove the __device__ qualifier from the __constant__ declaration of cFoo.
---
 06_MemoryBasics/src/main.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/06_MemoryBasics/src/main.cu b/06_MemoryBasics/src/main.cu
index 1d99ed9..ff59bc4 100644
--- a/06_MemoryBasics/src/main.cu
+++ b/06_MemoryBasics/src/main.cu
@@ -3,7 +3,7 @@
 
 
 // Declaration of a device variable in constant memory
-__device__ __constant__ int cFoo;
+__constant__ int cFoo;
 
 __global__ void ReadConstantMemory()
 {

From 05628c449b181041a640020d75b4403998a3db9b Mon Sep 17 00:00:00 2001
From: Leon <asimay_y@126.com>
Date: Mon, 14 Aug 2023 11:32:57 +0800
Subject: [PATCH 3/3] Update main.cu

We already pass N/2 at the kernel launch site, so dividing by 2 again here makes the result always wrong; it seems to be half of the original result.
---
 08_Reductions/src/main.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/08_Reductions/src/main.cu b/08_Reductions/src/main.cu
index 52f1728..9308fd1 100644
--- a/08_Reductions/src/main.cu
+++ b/08_Reductions/src/main.cu
@@ -201,8 +201,8 @@ __global__ void reduceFinal(const float* __restrict input, int N)
 
     __shared__ float data[BLOCK_SIZE];
     // Already combine two values upon load from global memory.
-    data[threadIdx.x] = id < N / 2 ? input[id] : 0;
-    data[threadIdx.x] += id + N/2 < N ? input[id + N / 2] : 0;
+    data[threadIdx.x] = id < N ? input[id] : 0;
+    data[threadIdx.x] += (id + N < 2*N) ? input[id + N] : 0;
 
     for (int s = blockDim.x / 2; s > 16; s /= 2)
     {
@@ -312,4 +312,4 @@ Can you observe any difference in terms of speed / computed results?
 2) Do you have any other ideas how the reduction could be improved?
 Making it even faster should be quite challenging, but if you have 
 some suggestions, try them out and see how they affect performance! 
-*/
\ No newline at end of file
+*/