From ed8d6e75503fcf75a3b62c249982e697385a4678 Mon Sep 17 00:00:00 2001 From: Sepcnt <30561671+sepcnt@users.noreply.github.com> Date: Sat, 18 May 2024 08:53:59 +0000 Subject: [PATCH 1/2] - Make run-c available on Windows - CUDA allocation fix for Windows - Monotonic High-Resolution time64() for Windows - Use thrd_start_t --- build.rs | 64 +++++++++++++++++++++++++++++++----------------------- src/hvm.c | 63 +++++++++++++++++++++++++++++++++++++++++++---------- src/hvm.cu | 41 ++++++++++++++++++++++++---------- 3 files changed, 119 insertions(+), 49 deletions(-) diff --git a/build.rs b/build.rs index 884bf589..8fa7eaab 100644 --- a/build.rs +++ b/build.rs @@ -1,41 +1,51 @@ fn main() { - let cores = num_cpus::get(); let tpcl2 = (cores as f64).log2().floor() as u32; - match cc::Build::new() + let mut build = cc::Build::new(); + + // if with msvc + if cfg!(target_env = "msvc") { + build.flag("/experimental:c11atomics"); + } + + match build .file("src/hvm.c") .opt_level(3) + .std("c11") .warnings(false) .define("TPC_L2", &*tpcl2.to_string()) - .try_compile("hvm-c") { - Ok(_) => println!("cargo:rerun-if-changed=src/hvm.c"), - Err(e) => { - println!("WARNING: Failed to compile hvm.c: {}", e); - println!("Ignoring hvm.c and proceeding with build."); - } + .try_compile("hvm-c") + { + Ok(_) => println!("cargo:rerun-if-changed=src/hvm.c"), + Err(e) => { + println!("WARNING: Failed to compile hvm.c: {}", e); + println!("Ignoring hvm.c and proceeding with build."); + } } - // Builds hvm.cu - if std::process::Command::new("nvcc").arg("--version").stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().is_ok() { - - if let Ok(cuda_path) = std::env::var("CUDA_HOME") { - println!("cargo:rustc-link-search=native={}/lib64", cuda_path); - } else { - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - } + if std::process::Command::new("nvcc") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok() + { + if let Ok(cuda_path) = std::env::var("CUDA_HOME") { + println!("cargo:rustc-link-search=native={}/lib64", cuda_path); + } else { + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + } - cc::Build::new() - .cuda(true) - .file("src/hvm.cu") - .compile("hvm-cu"); - - println!("cargo:rerun-if-changed=src/hvm.cu"); - println!("cargo:rustc-cfg=feature=\"cuda\""); - } - else { - println!("WARNING: CUDA compiler not found. HVM will not be able to run on GPU."); - } + cc::Build::new() + .cuda(true) + .file("src/hvm.cu") + .compile("hvm-cu"); + println!("cargo:rerun-if-changed=src/hvm.cu"); + println!("cargo:rustc-cfg=feature=\"cuda\""); + } else { + println!("WARNING: CUDA compiler not found. 
HVM will not be able to run on GPU."); + } } diff --git a/src/hvm.c b/src/hvm.c index 913a0c33..ae927aaf 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -1,6 +1,9 @@ #include #include -#include +#include +#ifdef _WIN32 +#include +#endif #include #include #include @@ -113,9 +116,9 @@ typedef u32 Numb; // Numb ::= 29-bit (rounded up to u32) #define G_RBAG_LEN (TPC * RLEN) typedef struct Net { - APair node_buf[G_NODE_LEN]; // global node buffer - APort vars_buf[G_VARS_LEN]; // global vars buffer - APair rbag_buf[G_RBAG_LEN]; // global rbag buffer + APair* node_buf; // global node buffer, size = G_NODE_LEN + APort* vars_buf; // global vars buffer, size = G_VARS_LEN + APair* rbag_buf; // global rbag buffer, size = G_RBAG_LEN a64 itrs; // interaction count a32 idle; // idle thread counter } Net; @@ -268,9 +271,11 @@ static inline void swap(Port *a, Port *b) { Port x = *a; *a = *b; *b = x; } +#ifndef _WIN32 u32 min(u32 a, u32 b) { return (a < b) ? a : b; } +#endif // A simple spin-wait barrier using atomic operations a64 a_reached = 0; // number of threads that reached the current barrier @@ -284,7 +289,7 @@ void sync_threads() { } else { u32 tries = 0; while (atomic_load_explicit(&a_barrier, memory_order_acquire) == barrier_old) { - sched_yield(); + thrd_yield(); } } } @@ -300,11 +305,37 @@ u32 global_sum(u32 x) { return sum; } +#ifdef _WIN32 +static int64_t _win_time_offset = 0; +static LARGE_INTEGER _win_time_freq, _win_perf_offset; + +static inline void _win_start_timer() { + QueryPerformanceFrequency(&_win_time_freq); + QueryPerformanceCounter(&_win_perf_offset); + int64_t now; + GetSystemTimePreciseAsFileTime((FILETIME*)&now); + _win_time_offset = (u64)now * 100ULL; +} +#endif + // TODO: write a time64() function that returns the time as fast as possible as a u64 static inline u64 time64() { + +// if not on windows +#ifndef _WIN32 struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (u64)ts.tv_sec * 1000000000ULL + (u64)ts.tv_nsec; +#else + LARGE_INTEGER now; + QueryPerformanceCounter(&now); + + // Calculate the time in nanoseconds + long double diff = (long double)(now.QuadPart - _win_perf_offset.QuadPart); + diff *= 1000000000.0L; + diff /= (long double)_win_time_freq.QuadPart; + return _win_time_offset + (u64)diff; +#endif } // Ports / Pairs / Rules @@ -645,6 +676,11 @@ static inline void net_init(Net* net) { // is that needed? atomic_store(&net->itrs, 0); atomic_store(&net->idle, 0); + + // allocates global buffers + net->node_buf = malloc(G_NODE_LEN * sizeof(APair)); + net->vars_buf = malloc(G_VARS_LEN * sizeof(APort)); + net->rbag_buf = malloc(G_RBAG_LEN * sizeof(APair)); } // Allocator @@ -1153,7 +1189,7 @@ void evaluator(Net* net, TM* tm, Book* book) { } // Chill... 
- sched_yield(); + thrd_yield(); // Halt if all threads are idle if (tick % 256 == 0) { if (atomic_load_explicit(&net->idle, memory_order_relaxed) == TPC) { @@ -1179,7 +1215,7 @@ typedef struct { Book* book; } ThreadArg; -void* thread_func(void* arg) { +thrd_start_t thread_func(void* arg) { ThreadArg* data = (ThreadArg*)arg; evaluator(data->net, data->tm, data->book); return NULL; @@ -1203,14 +1239,14 @@ void normalize(Net* net, Book* book) { } // Spawns the evaluation threads - pthread_t threads[TPC]; + thrd_t threads[TPC]; for (u32 t = 0; t < TPC; ++t) { - pthread_create(&threads[t], NULL, thread_func, &thread_arg[t]); + thrd_create(&threads[t], thread_func, &thread_arg[t]); } // Wait for the threads to finish for (u32 t = 0; t < TPC; ++t) { - pthread_join(threads[t], NULL); + thrd_join(threads[t], NULL); } } @@ -1532,7 +1568,7 @@ Port io_sleep(Net* net, Book* book, u32 argc, Port* argv) { struct timespec ts; ts.tv_sec = dur_ns / 1000000000; ts.tv_nsec = dur_ns % 1000000000; - nanosleep(&ts, NULL); + thrd_sleep(&ts, NULL); // Return an eraser return new_port(ERA, 0); @@ -1956,6 +1992,11 @@ void hvm_c(u32* book_buffer, bool run_io) { book_load(book_buffer, book); } +#ifdef _WIN32 + // Initializes the timer on Windows + _win_start_timer(); +#endif + // Starts the timer u64 start = time64(); diff --git a/src/hvm.cu b/src/hvm.cu index cf45b423..32b4c8c4 100644 --- a/src/hvm.cu +++ b/src/hvm.cu @@ -346,13 +346,13 @@ const u32 G_RBAG_LEN = TPB * BPG * RLEN * 3; // max 4m redexes struct GNet { u32 rbag_use_A; // total rbag redex count (buffer A) u32 rbag_use_B; // total rbag redex count (buffer B) - Pair rbag_buf_A[G_RBAG_LEN]; // global redex bag (buffer A) - Pair rbag_buf_B[G_RBAG_LEN]; // global redex bag (buffer B) - Pair node_buf[G_NODE_LEN]; // global node buffer - Port vars_buf[G_VARS_LEN]; // global vars buffer - u32 node_put[TPB*BPG]; - u32 vars_put[TPB*BPG]; - u32 rbag_pos[TPB*BPG]; + Pair* rbag_buf_A; // global redex bag (buffer A), size = G_RBAG_LEN + Pair* rbag_buf_B; // global redex bag (buffer B), size = G_RBAG_LEN + Pair* node_buf; // global node buffer, size = G_NODE_LEN + Port* vars_buf; // global vars buffer, size = G_VARS_LEN + u32* node_put; // size = TPB*BPG + u32* vars_put; // size = TPB*BPG + u32* rbag_pos; // size = TPB*BPG u8 mode; // evaluation mode (curr) u64 itrs; // interaction count u64 iadd; // interaction count adder @@ -1895,10 +1895,29 @@ __global__ void evaluator(GNet* gnet) { // ------------------- GNet* gnet_create() { - GNet *gnet; - cudaMalloc((void**)&gnet, sizeof(GNet)); - cudaMemset(gnet, 0, sizeof(GNet)); - return gnet; + GNet gnet; + memset(&gnet, 0, sizeof(GNet)); + + #define ALLOCATE_HOST_POINTER(__host_pointer, __size) \ + do { \ + cudaMalloc((void**)&(__host_pointer), __size); \ + cudaMemset(__host_pointer, 0, __size); \ + } while(0) + + ALLOCATE_HOST_POINTER(gnet.rbag_buf_A, G_RBAG_LEN * sizeof(Pair)); + ALLOCATE_HOST_POINTER(gnet.rbag_buf_B, G_RBAG_LEN * sizeof(Pair)); + ALLOCATE_HOST_POINTER(gnet.node_buf, G_NODE_LEN * sizeof(Pair)); + ALLOCATE_HOST_POINTER(gnet.vars_buf, G_VARS_LEN * sizeof(Port)); + ALLOCATE_HOST_POINTER(gnet.node_put, BPG * TPB * sizeof(u32)); + ALLOCATE_HOST_POINTER(gnet.vars_put, BPG * TPB * sizeof(u32)); + ALLOCATE_HOST_POINTER(gnet.rbag_pos, BPG * TPB * sizeof(u32)); + + #undef ALLOCATE_HOST_POINTER + + GNet* gnet_d; + cudaMalloc(&gnet_d, sizeof(GNet)); + cudaMemcpy(gnet_d, &gnet, sizeof(GNet), cudaMemcpyHostToDevice); + return gnet_d; } u32 gnet_get_rlen(GNet* gnet, u32 turn) { From 
f8f559a4ffe82c4072047c3e25c214ca9f915970 Mon Sep 17 00:00:00 2001 From: Sepcnt <30561671+sepcnt@users.noreply.github.com> Date: Sun, 19 May 2024 06:21:37 +0000 Subject: [PATCH 2/2] Make time64() fast on Windows and independent of system time --- build.rs | 82 +++++++++++++++++++++++++++------------------------------ src/hvm.c | 41 ++++++++-------------------- 2 files changed, 51 insertions(+), 72 deletions(-) diff --git a/build.rs b/build.rs index 8fa7eaab..3e7fbf20 100644 --- a/build.rs +++ b/build.rs @@ -1,51 +1,49 @@ fn main() { - let cores = num_cpus::get(); - let tpcl2 = (cores as f64).log2().floor() as u32; - let mut build = cc::Build::new(); - - // if with msvc - if cfg!(target_env = "msvc") { - build.flag("/experimental:c11atomics"); - } - - match build - .file("src/hvm.c") - .opt_level(3) - .std("c11") - .warnings(false) - .define("TPC_L2", &*tpcl2.to_string()) - .try_compile("hvm-c") - { - Ok(_) => println!("cargo:rerun-if-changed=src/hvm.c"), + let cores = num_cpus::get(); + let tpcl2 = (cores as f64).log2().floor() as u32; + + println!("cargo:rerun-if-changed=src/hvm.c"); + println!("cargo:rerun-if-changed=src/hvm.cu"); + + let mut build = cc::Build::new(); + + // if with msvc + if cfg!(target_env = "msvc") { + build.flag("/experimental:c11atomics"); + } + + match build + .file("src/hvm.c") + .opt_level(3) + .std("c11") + .warnings(false) + .define("TPC_L2", &*tpcl2.to_string()) + .try_compile("hvm-c") { + Ok(_) => println!("cargo:rustc-cfg=feature=\"c\""), Err(e) => { - println!("WARNING: Failed to compile hvm.c: {}", e); - println!("Ignoring hvm.c and proceeding with build."); + println!("cargo:warning=WARNING: Failed to compile hvm.c: {}", e); + println!("cargo:warning=Ignoring hvm.c and proceeding with build. The C runtime will not be available."); } - } - - // Builds hvm.cu - if std::process::Command::new("nvcc") - .arg("--version") - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::null()) - .status() - .is_ok() - { + } + + // Builds hvm.cu + if std::process::Command::new("nvcc").arg("--version").stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().is_ok() { + if let Ok(cuda_path) = std::env::var("CUDA_HOME") { - println!("cargo:rustc-link-search=native={}/lib64", cuda_path); + println!("cargo:rustc-link-search=native={}/lib64", cuda_path); } else { - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); } - + cc::Build::new() - .cuda(true) - .file("src/hvm.cu") - .compile("hvm-cu"); - - println!("cargo:rerun-if-changed=src/hvm.cu"); + .cuda(true) + .file("src/hvm.cu") + .compile("hvm-cu"); println!("cargo:rustc-cfg=feature=\"cuda\""); - } else { - println!("WARNING: CUDA compiler not found. HVM will not be able to run on GPU."); - } -} + } + else { + println!("cargo:warning=WARNING: CUDA compiler not found. 
HVM will not be able to run on GPU."); + } + + } \ No newline at end of file diff --git a/src/hvm.c b/src/hvm.c index ae927aaf..e554b6d9 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -131,8 +131,8 @@ typedef struct Def { u32 node_len; u32 vars_len; Port root; - Pair rbag_buf[32]; - Pair node_buf[32]; + Pair rbag_buf[0xFFF]; + Pair node_buf[0xFFF]; } Def; // Book of Definitions @@ -150,8 +150,8 @@ typedef struct TM { u32 hput; // next hbag push index u32 rput; // next rbag push index u32 sidx; // steal index - u32 nloc[32]; // node allocation indices - u32 vloc[32]; // vars allocation indices + u32 nloc[0xFFF]; // node allocation indices + u32 vloc[0xFFF]; // vars allocation indices Pair hbag_buf[HLEN]; // high-priority redexes } TM; @@ -305,20 +305,8 @@ u32 global_sum(u32 x) { return sum; } -#ifdef _WIN32 -static int64_t _win_time_offset = 0; -static LARGE_INTEGER _win_time_freq, _win_perf_offset; - -static inline void _win_start_timer() { - QueryPerformanceFrequency(&_win_time_freq); - QueryPerformanceCounter(&_win_perf_offset); - int64_t now; - GetSystemTimePreciseAsFileTime((FILETIME*)&now); - _win_time_offset = (u64)now * 100ULL; -} -#endif - // TODO: write a time64() function that returns the time as fast as possible as a u64 +// The time should be in nanoseconds, but not related to UTC time static inline u64 time64() { // if not on windows @@ -327,14 +315,12 @@ static inline u64 time64() { clock_gettime(CLOCK_MONOTONIC, &ts); return (u64)ts.tv_sec * 1000000000ULL + (u64)ts.tv_nsec; #else - LARGE_INTEGER now; - QueryPerformanceCounter(&now); - - // Calculate the time in nanoseconds - long double diff = (long double)(now.QuadPart - _win_perf_offset.QuadPart); - diff *= 1000000000.0L; - diff /= (long double)_win_time_freq.QuadPart; - return _win_time_offset + (u64)diff; + // @developedby: We dont care about system time, this is just a timer. + LARGE_INTEGER freq; + LARGE_INTEGER counter; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&counter); + return (u64)((counter.QuadPart * 1000000000ULL) / freq.QuadPart); #endif } @@ -1992,11 +1978,6 @@ void hvm_c(u32* book_buffer, bool run_io) { book_load(book_buffer, book); } -#ifdef _WIN32 - // Initializes the timer on Windows - _win_start_timer(); -#endif - // Starts the timer u64 start = time64();
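A note on the C11 threads portion of the first patch: thrd_create expects an entry point of type int (*)(void*) (that is what thrd_start_t aliases), so declaring thread_func with a thrd_start_t return type and returning NULL does not match that signature. The sketch below assumes nothing from hvm.c beyond a made-up ThreadArg; it shows the portable pattern, where thrd_yield() stands in for sched_yield() and thrd_sleep() takes the same struct timespec that nanosleep() did.

#include <stdio.h>
#include <threads.h>
#include <time.h>

/* Hypothetical per-thread argument, standing in for the patch's ThreadArg. */
typedef struct { int id; } ThreadArg;

/* C11 thread entry points return int; thrd_start_t is int (*)(void*). */
static int thread_func(void* arg) {
  ThreadArg* data = (ThreadArg*)arg;
  printf("thread %d running\n", data->id);
  thrd_yield();                              /* cooperative yield, as in the evaluator loop */
  struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };
  thrd_sleep(&ts, NULL);                     /* same timespec shape nanosleep used */
  return 0;
}

int main(void) {
  enum { TPC = 4 };
  thrd_t threads[TPC];
  ThreadArg args[TPC];
  for (int t = 0; t < TPC; ++t) {
    args[t].id = t;
    thrd_create(&threads[t], thread_func, &args[t]);
  }
  for (int t = 0; t < TPC; ++t) {
    thrd_join(threads[t], NULL);             /* NULL discards the int result */
  }
  return 0;
}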
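The first patch also turns the Net buffers from static arrays into pointers and allocates them with malloc inside net_init. A hedged sketch of the same idea follows; the types and G_* sizes here are small stand-ins, and net_buffers_free has no counterpart in the patch. calloc keeps the zero-initialization the static arrays had implicitly, and a failed allocation is reported instead of being dereferenced.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the patch's atomic pair/port types and buffer lengths. */
typedef uint64_t APair;
typedef uint32_t APort;
enum { G_NODE_LEN = 1 << 16, G_VARS_LEN = 1 << 16, G_RBAG_LEN = 1 << 16 };

typedef struct Net {
  APair* node_buf;   /* global node buffer, size = G_NODE_LEN */
  APort* vars_buf;   /* global vars buffer, size = G_VARS_LEN */
  APair* rbag_buf;   /* global rbag buffer, size = G_RBAG_LEN */
} Net;

static void net_buffers_alloc(Net* net) {
  net->node_buf = calloc(G_NODE_LEN, sizeof(APair));  /* zeroed, like the old static arrays */
  net->vars_buf = calloc(G_VARS_LEN, sizeof(APort));
  net->rbag_buf = calloc(G_RBAG_LEN, sizeof(APair));
  if (!net->node_buf || !net->vars_buf || !net->rbag_buf) {
    fprintf(stderr, "hvm: failed to allocate global net buffers\n");
    exit(EXIT_FAILURE);
  }
}

static void net_buffers_free(Net* net) {
  free(net->node_buf);
  free(net->vars_buf);
  free(net->rbag_buf);
}

int main(void) {
  Net net;
  net_buffers_alloc(&net);
  net_buffers_free(&net);
  return 0;
}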
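The gnet_create change in hvm.cu follows the same shape: the GNet fields become device pointers, each buffer is cudaMalloc'd and cleared separately, and the struct full of device pointers is staged on the host before being copied to the device, presumably to avoid one very large allocation and memset of the whole struct. Below is a trimmed host-side C sketch of that pattern, with stand-in types and sizes and an illustrative gnet_destroy that the patch does not include.

#include <cuda_runtime_api.h>
#include <stdint.h>
#include <string.h>

/* Stand-ins for the real HVM types and sizes. */
typedef uint64_t Pair;
typedef uint32_t Port;
enum { G_NODE_LEN = 1 << 20, G_RBAG_LEN = 1 << 20, TPB = 128, BPG = 128 };

typedef struct {
  Pair* rbag_buf_A;   /* device pointer, size = G_RBAG_LEN */
  Pair* node_buf;     /* device pointer, size = G_NODE_LEN */
  uint32_t* node_put; /* device pointer, size = TPB*BPG */
} GNet;

/* Allocate one zeroed device buffer, storing the device pointer in the host-side struct. */
static cudaError_t dev_alloc(void** ptr, size_t size) {
  cudaError_t err = cudaMalloc(ptr, size);
  if (err != cudaSuccess) return err;
  return cudaMemset(*ptr, 0, size);
}

GNet* gnet_create(void) {
  GNet host;                       /* staging copy: lives on the host, holds device pointers */
  memset(&host, 0, sizeof(GNet));
  if (dev_alloc((void**)&host.rbag_buf_A, G_RBAG_LEN * sizeof(Pair)) != cudaSuccess) return NULL;
  if (dev_alloc((void**)&host.node_buf,   G_NODE_LEN * sizeof(Pair)) != cudaSuccess) return NULL;
  if (dev_alloc((void**)&host.node_put,   (size_t)TPB * BPG * sizeof(uint32_t)) != cudaSuccess) return NULL;
  GNet* dev = NULL;                /* copy the table of device pointers to the device */
  if (cudaMalloc((void**)&dev, sizeof(GNet)) != cudaSuccess) return NULL;
  cudaMemcpy(dev, &host, sizeof(GNet), cudaMemcpyHostToDevice);
  return dev;
}

void gnet_destroy(GNet* dev) {
  GNet host;                       /* pull the device pointers back before freeing them */
  cudaMemcpy(&host, dev, sizeof(GNet), cudaMemcpyDeviceToHost);
  cudaFree(host.rbag_buf_A);
  cudaFree(host.node_buf);
  cudaFree(host.node_put);
  cudaFree(dev);
}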
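On the second patch's time64(): multiplying counter.QuadPart by 1000000000 before dividing by the frequency can overflow 64 bits once the counter grows; with the 10 MHz QueryPerformanceFrequency common on recent Windows, that happens after roughly half an hour of uptime. The hedged sketch below splits whole seconds from the remainder so the intermediate product stays in range; it is an alternative under the same intent (a fast monotonic nanosecond counter), not what the patch ships.

#ifdef _WIN32
#include <windows.h>
#endif
#include <stdint.h>
#include <stdio.h>

#ifdef _WIN32
/* Monotonic nanosecond timer sketch: divide first, then scale the remainder,
   so counter * 1e9 never has to fit in 64 bits. */
static inline uint64_t time64(void) {
  LARGE_INTEGER freq, counter;
  QueryPerformanceFrequency(&freq);   /* ticks per second, constant after boot */
  QueryPerformanceCounter(&counter);  /* monotonic tick count, unrelated to wall-clock time */
  uint64_t ticks = (uint64_t)counter.QuadPart;
  uint64_t hz    = (uint64_t)freq.QuadPart;
  uint64_t sec   = ticks / hz;
  uint64_t rem   = ticks % hz;
  return sec * 1000000000ULL + (rem * 1000000000ULL) / hz;
}
#endif

int main(void) {
#ifdef _WIN32
  printf("%llu\n", (unsigned long long)time64());
#endif
  return 0;
}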