From ed8d6e75503fcf75a3b62c249982e697385a4678 Mon Sep 17 00:00:00 2001 From: Sepcnt <30561671+sepcnt@users.noreply.github.com> Date: Sat, 18 May 2024 08:53:59 +0000 Subject: [PATCH 1/2] - Make run-c available on Windows - CUDA allocation fix for Windows - Monotonic High-Resolution time64() for Windows - Use thrd_start_t --- build.rs | 64 +++++++++++++++++++++++++++++++----------------------- src/hvm.c | 63 +++++++++++++++++++++++++++++++++++++++++++---------- src/hvm.cu | 41 ++++++++++++++++++++++++---------- 3 files changed, 119 insertions(+), 49 deletions(-) diff --git a/build.rs b/build.rs index 884bf589..8fa7eaab 100644 --- a/build.rs +++ b/build.rs @@ -1,41 +1,51 @@ fn main() { - let cores = num_cpus::get(); let tpcl2 = (cores as f64).log2().floor() as u32; - match cc::Build::new() + let mut build = cc::Build::new(); + + // if with msvc + if cfg!(target_env = "msvc") { + build.flag("/experimental:c11atomics"); + } + + match build .file("src/hvm.c") .opt_level(3) + .std("c11") .warnings(false) .define("TPC_L2", &*tpcl2.to_string()) - .try_compile("hvm-c") { - Ok(_) => println!("cargo:rerun-if-changed=src/hvm.c"), - Err(e) => { - println!("WARNING: Failed to compile hvm.c: {}", e); - println!("Ignoring hvm.c and proceeding with build."); - } + .try_compile("hvm-c") + { + Ok(_) => println!("cargo:rerun-if-changed=src/hvm.c"), + Err(e) => { + println!("WARNING: Failed to compile hvm.c: {}", e); + println!("Ignoring hvm.c and proceeding with build."); + } } - // Builds hvm.cu - if std::process::Command::new("nvcc").arg("--version").stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().is_ok() { - - if let Ok(cuda_path) = std::env::var("CUDA_HOME") { - println!("cargo:rustc-link-search=native={}/lib64", cuda_path); - } else { - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - } + if std::process::Command::new("nvcc") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok() + { + if let Ok(cuda_path) = std::env::var("CUDA_HOME") { + println!("cargo:rustc-link-search=native={}/lib64", cuda_path); + } else { + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + } - cc::Build::new() - .cuda(true) - .file("src/hvm.cu") - .compile("hvm-cu"); - - println!("cargo:rerun-if-changed=src/hvm.cu"); - println!("cargo:rustc-cfg=feature=\"cuda\""); - } - else { - println!("WARNING: CUDA compiler not found. HVM will not be able to run on GPU."); - } + cc::Build::new() + .cuda(true) + .file("src/hvm.cu") + .compile("hvm-cu"); + println!("cargo:rerun-if-changed=src/hvm.cu"); + println!("cargo:rustc-cfg=feature=\"cuda\""); + } else { + println!("WARNING: CUDA compiler not found. 
HVM will not be able to run on GPU."); + } } diff --git a/src/hvm.c b/src/hvm.c index 913a0c33..ae927aaf 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -1,6 +1,9 @@ #include #include -#include +#include +#ifdef _WIN32 +#include +#endif #include #include #include @@ -113,9 +116,9 @@ typedef u32 Numb; // Numb ::= 29-bit (rounded up to u32) #define G_RBAG_LEN (TPC * RLEN) typedef struct Net { - APair node_buf[G_NODE_LEN]; // global node buffer - APort vars_buf[G_VARS_LEN]; // global vars buffer - APair rbag_buf[G_RBAG_LEN]; // global rbag buffer + APair* node_buf; // global node buffer, size = G_NODE_LEN + APort* vars_buf; // global vars buffer, size = G_VARS_LEN + APair* rbag_buf; // global rbag buffer, size = G_RBAG_LEN a64 itrs; // interaction count a32 idle; // idle thread counter } Net; @@ -268,9 +271,11 @@ static inline void swap(Port *a, Port *b) { Port x = *a; *a = *b; *b = x; } +#ifndef _WIN32 u32 min(u32 a, u32 b) { return (a < b) ? a : b; } +#endif // A simple spin-wait barrier using atomic operations a64 a_reached = 0; // number of threads that reached the current barrier @@ -284,7 +289,7 @@ void sync_threads() { } else { u32 tries = 0; while (atomic_load_explicit(&a_barrier, memory_order_acquire) == barrier_old) { - sched_yield(); + thrd_yield(); } } } @@ -300,11 +305,37 @@ u32 global_sum(u32 x) { return sum; } +#ifdef _WIN32 +static int64_t _win_time_offset = 0; +static LARGE_INTEGER _win_time_freq, _win_perf_offset; + +static inline void _win_start_timer() { + QueryPerformanceFrequency(&_win_time_freq); + QueryPerformanceCounter(&_win_perf_offset); + int64_t now; + GetSystemTimePreciseAsFileTime((FILETIME*)&now); + _win_time_offset = (u64)now * 100ULL; +} +#endif + // TODO: write a time64() function that returns the time as fast as possible as a u64 static inline u64 time64() { + +// if not on windows +#ifndef _WIN32 struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (u64)ts.tv_sec * 1000000000ULL + (u64)ts.tv_nsec; +#else + LARGE_INTEGER now; + QueryPerformanceCounter(&now); + + // Calculate the time in nanoseconds + long double diff = (long double)(now.QuadPart - _win_perf_offset.QuadPart); + diff *= 1000000000.0L; + diff /= (long double)_win_time_freq.QuadPart; + return _win_time_offset + (u64)diff; +#endif } // Ports / Pairs / Rules @@ -645,6 +676,11 @@ static inline void net_init(Net* net) { // is that needed? atomic_store(&net->itrs, 0); atomic_store(&net->idle, 0); + + // allocates global buffers + net->node_buf = malloc(G_NODE_LEN * sizeof(APair)); + net->vars_buf = malloc(G_VARS_LEN * sizeof(APort)); + net->rbag_buf = malloc(G_RBAG_LEN * sizeof(APair)); } // Allocator @@ -1153,7 +1189,7 @@ void evaluator(Net* net, TM* tm, Book* book) { } // Chill... 
- sched_yield(); + thrd_yield(); // Halt if all threads are idle if (tick % 256 == 0) { if (atomic_load_explicit(&net->idle, memory_order_relaxed) == TPC) { @@ -1179,7 +1215,7 @@ typedef struct { Book* book; } ThreadArg; -void* thread_func(void* arg) { +thrd_start_t thread_func(void* arg) { ThreadArg* data = (ThreadArg*)arg; evaluator(data->net, data->tm, data->book); return NULL; @@ -1203,14 +1239,14 @@ void normalize(Net* net, Book* book) { } // Spawns the evaluation threads - pthread_t threads[TPC]; + thrd_t threads[TPC]; for (u32 t = 0; t < TPC; ++t) { - pthread_create(&threads[t], NULL, thread_func, &thread_arg[t]); + thrd_create(&threads[t], thread_func, &thread_arg[t]); } // Wait for the threads to finish for (u32 t = 0; t < TPC; ++t) { - pthread_join(threads[t], NULL); + thrd_join(threads[t], NULL); } } @@ -1532,7 +1568,7 @@ Port io_sleep(Net* net, Book* book, u32 argc, Port* argv) { struct timespec ts; ts.tv_sec = dur_ns / 1000000000; ts.tv_nsec = dur_ns % 1000000000; - nanosleep(&ts, NULL); + thrd_sleep(&ts, NULL); // Return an eraser return new_port(ERA, 0); @@ -1956,6 +1992,11 @@ void hvm_c(u32* book_buffer, bool run_io) { book_load(book_buffer, book); } +#ifdef _WIN32 + // Initializes the timer on Windows + _win_start_timer(); +#endif + // Starts the timer u64 start = time64(); diff --git a/src/hvm.cu b/src/hvm.cu index cf45b423..32b4c8c4 100644 --- a/src/hvm.cu +++ b/src/hvm.cu @@ -346,13 +346,13 @@ const u32 G_RBAG_LEN = TPB * BPG * RLEN * 3; // max 4m redexes struct GNet { u32 rbag_use_A; // total rbag redex count (buffer A) u32 rbag_use_B; // total rbag redex count (buffer B) - Pair rbag_buf_A[G_RBAG_LEN]; // global redex bag (buffer A) - Pair rbag_buf_B[G_RBAG_LEN]; // global redex bag (buffer B) - Pair node_buf[G_NODE_LEN]; // global node buffer - Port vars_buf[G_VARS_LEN]; // global vars buffer - u32 node_put[TPB*BPG]; - u32 vars_put[TPB*BPG]; - u32 rbag_pos[TPB*BPG]; + Pair* rbag_buf_A; // global redex bag (buffer A), size = G_RBAG_LEN + Pair* rbag_buf_B; // global redex bag (buffer B), size = G_RBAG_LEN + Pair* node_buf; // global node buffer, size = G_NODE_LEN + Port* vars_buf; // global vars buffer, size = G_VARS_LEN + u32* node_put; // size = TPB*BPG + u32* vars_put; // size = TPB*BPG + u32* rbag_pos; // size = TPB*BPG u8 mode; // evaluation mode (curr) u64 itrs; // interaction count u64 iadd; // interaction count adder @@ -1895,10 +1895,29 @@ __global__ void evaluator(GNet* gnet) { // ------------------- GNet* gnet_create() { - GNet *gnet; - cudaMalloc((void**)&gnet, sizeof(GNet)); - cudaMemset(gnet, 0, sizeof(GNet)); - return gnet; + GNet gnet; + memset(&gnet, 0, sizeof(GNet)); + + #define ALLOCATE_HOST_POINTER(__host_pointer, __size) \ + do { \ + cudaMalloc((void**)&(__host_pointer), __size); \ + cudaMemset(__host_pointer, 0, __size); \ + } while(0) + + ALLOCATE_HOST_POINTER(gnet.rbag_buf_A, G_RBAG_LEN * sizeof(Pair)); + ALLOCATE_HOST_POINTER(gnet.rbag_buf_B, G_RBAG_LEN * sizeof(Pair)); + ALLOCATE_HOST_POINTER(gnet.node_buf, G_NODE_LEN * sizeof(Pair)); + ALLOCATE_HOST_POINTER(gnet.vars_buf, G_VARS_LEN * sizeof(Port)); + ALLOCATE_HOST_POINTER(gnet.node_put, BPG * TPB * sizeof(u32)); + ALLOCATE_HOST_POINTER(gnet.vars_put, BPG * TPB * sizeof(u32)); + ALLOCATE_HOST_POINTER(gnet.rbag_pos, BPG * TPB * sizeof(u32)); + + #undef ALLOCATE_HOST_POINTER + + GNet* gnet_d; + cudaMalloc(&gnet_d, sizeof(GNet)); + cudaMemcpy(gnet_d, &gnet, sizeof(GNet), cudaMemcpyHostToDevice); + return gnet_d; } u32 gnet_get_rlen(GNet* gnet, u32 turn) { From 
f8f559a4ffe82c4072047c3e25c214ca9f915970 Mon Sep 17 00:00:00 2001 From: Sepcnt <30561671+sepcnt@users.noreply.github.com> Date: Sun, 19 May 2024 06:21:37 +0000 Subject: [PATCH 2/2] Make time64() fast on Windows and independent of system time --- build.rs | 82 +++++++++++++++++++++++++++------------------------------ src/hvm.c | 41 ++++++++-------------------- 2 files changed, 51 insertions(+), 72 deletions(-) diff --git a/build.rs b/build.rs index 8fa7eaab..3e7fbf20 100644 --- a/build.rs +++ b/build.rs @@ -1,51 +1,49 @@ fn main() { - let cores = num_cpus::get(); - let tpcl2 = (cores as f64).log2().floor() as u32; - let mut build = cc::Build::new(); - - // if with msvc - if cfg!(target_env = "msvc") { - build.flag("/experimental:c11atomics"); - } - - match build - .file("src/hvm.c") - .opt_level(3) - .std("c11") - .warnings(false) - .define("TPC_L2", &*tpcl2.to_string()) - .try_compile("hvm-c") - { - Ok(_) => println!("cargo:rerun-if-changed=src/hvm.c"), + let cores = num_cpus::get(); + let tpcl2 = (cores as f64).log2().floor() as u32; + + println!("cargo:rerun-if-changed=src/hvm.c"); + println!("cargo:rerun-if-changed=src/hvm.cu"); + + let mut build = cc::Build::new(); + + // if with msvc + if cfg!(target_env = "msvc") { + build.flag("/experimental:c11atomics"); + } + + match build + .file("src/hvm.c") + .opt_level(3) + .std("c11") + .warnings(false) + .define("TPC_L2", &*tpcl2.to_string()) + .try_compile("hvm-c") { + Ok(_) => println!("cargo:rustc-cfg=feature=\"c\""), Err(e) => { - println!("WARNING: Failed to compile hvm.c: {}", e); - println!("Ignoring hvm.c and proceeding with build."); + println!("cargo:warning=WARNING: Failed to compile hvm.c: {}", e); + println!("cargo:warning=Ignoring hvm.c and proceeding with build. The C runtime will not be available."); } - } - - // Builds hvm.cu - if std::process::Command::new("nvcc") - .arg("--version") - .stdout(std::process::Stdio::null()) - .stderr(std::process::Stdio::null()) - .status() - .is_ok() - { + } + + // Builds hvm.cu + if std::process::Command::new("nvcc").arg("--version").stdout(std::process::Stdio::null()).stderr(std::process::Stdio::null()).status().is_ok() { + if let Ok(cuda_path) = std::env::var("CUDA_HOME") { - println!("cargo:rustc-link-search=native={}/lib64", cuda_path); + println!("cargo:rustc-link-search=native={}/lib64", cuda_path); } else { - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); } - + cc::Build::new() - .cuda(true) - .file("src/hvm.cu") - .compile("hvm-cu"); - - println!("cargo:rerun-if-changed=src/hvm.cu"); + .cuda(true) + .file("src/hvm.cu") + .compile("hvm-cu"); println!("cargo:rustc-cfg=feature=\"cuda\""); - } else { - println!("WARNING: CUDA compiler not found. HVM will not be able to run on GPU."); - } -} + } + else { + println!("cargo:warning=WARNING: CUDA compiler not found. 
HVM will not be able to run on GPU."); + } + + } \ No newline at end of file diff --git a/src/hvm.c b/src/hvm.c index ae927aaf..e554b6d9 100644 --- a/src/hvm.c +++ b/src/hvm.c @@ -131,8 +131,8 @@ typedef struct Def { u32 node_len; u32 vars_len; Port root; - Pair rbag_buf[32]; - Pair node_buf[32]; + Pair rbag_buf[0xFFF]; + Pair node_buf[0xFFF]; } Def; // Book of Definitions @@ -150,8 +150,8 @@ typedef struct TM { u32 hput; // next hbag push index u32 rput; // next rbag push index u32 sidx; // steal index - u32 nloc[32]; // node allocation indices - u32 vloc[32]; // vars allocation indices + u32 nloc[0xFFF]; // node allocation indices + u32 vloc[0xFFF]; // vars allocation indices Pair hbag_buf[HLEN]; // high-priority redexes } TM; @@ -305,20 +305,8 @@ u32 global_sum(u32 x) { return sum; } -#ifdef _WIN32 -static int64_t _win_time_offset = 0; -static LARGE_INTEGER _win_time_freq, _win_perf_offset; - -static inline void _win_start_timer() { - QueryPerformanceFrequency(&_win_time_freq); - QueryPerformanceCounter(&_win_perf_offset); - int64_t now; - GetSystemTimePreciseAsFileTime((FILETIME*)&now); - _win_time_offset = (u64)now * 100ULL; -} -#endif - // TODO: write a time64() function that returns the time as fast as possible as a u64 +// The time should be in nanoseconds, but not related to UTC time static inline u64 time64() { // if not on windows @@ -327,14 +315,12 @@ static inline u64 time64() { clock_gettime(CLOCK_MONOTONIC, &ts); return (u64)ts.tv_sec * 1000000000ULL + (u64)ts.tv_nsec; #else - LARGE_INTEGER now; - QueryPerformanceCounter(&now); - - // Calculate the time in nanoseconds - long double diff = (long double)(now.QuadPart - _win_perf_offset.QuadPart); - diff *= 1000000000.0L; - diff /= (long double)_win_time_freq.QuadPart; - return _win_time_offset + (u64)diff; + // @developedby: We dont care about system time, this is just a timer. + LARGE_INTEGER freq; + LARGE_INTEGER counter; + QueryPerformanceFrequency(&freq); + QueryPerformanceCounter(&counter); + return (u64)((counter.QuadPart * 1000000000ULL) / freq.QuadPart); #endif } @@ -1992,11 +1978,6 @@ void hvm_c(u32* book_buffer, bool run_io) { book_load(book_buffer, book); } -#ifdef _WIN32 - // Initializes the timer on Windows - _win_start_timer(); -#endif - // Starts the timer u64 start = time64();
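A note on the C11 threads portion of the first patch: thrd_create expects an entry point of type int (*)(void*) (that is what thrd_start_t aliases), so declaring thread_func with a thrd_start_t return type and returning NULL does not match that signature. The sketch below assumes nothing from hvm.c beyond a made-up ThreadArg; it shows the portable pattern, where thrd_yield() stands in for sched_yield() and thrd_sleep() takes the same struct timespec that nanosleep() did.

#include <stdio.h>
#include <threads.h>
#include <time.h>

/* Hypothetical per-thread argument, standing in for the patch's ThreadArg. */
typedef struct { int id; } ThreadArg;

/* C11 thread entry points return int; thrd_start_t is int (*)(void*). */
static int thread_func(void* arg) {
  ThreadArg* data = (ThreadArg*)arg;
  printf("thread %d running\n", data->id);
  thrd_yield();                              /* cooperative yield, as in the evaluator loop */
  struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };
  thrd_sleep(&ts, NULL);                     /* same timespec shape nanosleep used */
  return 0;
}

int main(void) {
  enum { TPC = 4 };
  thrd_t threads[TPC];
  ThreadArg args[TPC];
  for (int t = 0; t < TPC; ++t) {
    args[t].id = t;
    thrd_create(&threads[t], thread_func, &args[t]);
  }
  for (int t = 0; t < TPC; ++t) {
    thrd_join(threads[t], NULL);             /* NULL discards the int result */
  }
  return 0;
}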
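The first patch also turns the Net buffers from static arrays into pointers and allocates them with malloc inside net_init. A hedged sketch of the same idea follows; the types and G_* sizes here are small stand-ins, and net_buffers_free has no counterpart in the patch. calloc keeps the zero-initialization the static arrays had implicitly, and a failed allocation is reported instead of being dereferenced.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the patch's atomic pair/port types and buffer lengths. */
typedef uint64_t APair;
typedef uint32_t APort;
enum { G_NODE_LEN = 1 << 16, G_VARS_LEN = 1 << 16, G_RBAG_LEN = 1 << 16 };

typedef struct Net {
  APair* node_buf;   /* global node buffer, size = G_NODE_LEN */
  APort* vars_buf;   /* global vars buffer, size = G_VARS_LEN */
  APair* rbag_buf;   /* global rbag buffer, size = G_RBAG_LEN */
} Net;

static void net_buffers_alloc(Net* net) {
  net->node_buf = calloc(G_NODE_LEN, sizeof(APair));  /* zeroed, like the old static arrays */
  net->vars_buf = calloc(G_VARS_LEN, sizeof(APort));
  net->rbag_buf = calloc(G_RBAG_LEN, sizeof(APair));
  if (!net->node_buf || !net->vars_buf || !net->rbag_buf) {
    fprintf(stderr, "hvm: failed to allocate global net buffers\n");
    exit(EXIT_FAILURE);
  }
}

static void net_buffers_free(Net* net) {
  free(net->node_buf);
  free(net->vars_buf);
  free(net->rbag_buf);
}

int main(void) {
  Net net;
  net_buffers_alloc(&net);
  net_buffers_free(&net);
  return 0;
}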
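The gnet_create change in hvm.cu follows the same shape: the GNet fields become device pointers, each buffer is cudaMalloc'd and cleared separately, and the struct full of device pointers is staged on the host before being copied to the device, presumably to avoid one very large allocation and memset of the whole struct. Below is a trimmed host-side C sketch of that pattern, with stand-in types and sizes and an illustrative gnet_destroy that the patch does not include.

#include <cuda_runtime_api.h>
#include <stdint.h>
#include <string.h>

/* Stand-ins for the real HVM types and sizes. */
typedef uint64_t Pair;
typedef uint32_t Port;
enum { G_NODE_LEN = 1 << 20, G_RBAG_LEN = 1 << 20, TPB = 128, BPG = 128 };

typedef struct {
  Pair* rbag_buf_A;   /* device pointer, size = G_RBAG_LEN */
  Pair* node_buf;     /* device pointer, size = G_NODE_LEN */
  uint32_t* node_put; /* device pointer, size = TPB*BPG */
} GNet;

/* Allocate one zeroed device buffer, storing the device pointer in the host-side struct. */
static cudaError_t dev_alloc(void** ptr, size_t size) {
  cudaError_t err = cudaMalloc(ptr, size);
  if (err != cudaSuccess) return err;
  return cudaMemset(*ptr, 0, size);
}

GNet* gnet_create(void) {
  GNet host;                       /* staging copy: lives on the host, holds device pointers */
  memset(&host, 0, sizeof(GNet));
  if (dev_alloc((void**)&host.rbag_buf_A, G_RBAG_LEN * sizeof(Pair)) != cudaSuccess) return NULL;
  if (dev_alloc((void**)&host.node_buf,   G_NODE_LEN * sizeof(Pair)) != cudaSuccess) return NULL;
  if (dev_alloc((void**)&host.node_put,   (size_t)TPB * BPG * sizeof(uint32_t)) != cudaSuccess) return NULL;
  GNet* dev = NULL;                /* copy the table of device pointers to the device */
  if (cudaMalloc((void**)&dev, sizeof(GNet)) != cudaSuccess) return NULL;
  cudaMemcpy(dev, &host, sizeof(GNet), cudaMemcpyHostToDevice);
  return dev;
}

void gnet_destroy(GNet* dev) {
  GNet host;                       /* pull the device pointers back before freeing them */
  cudaMemcpy(&host, dev, sizeof(GNet), cudaMemcpyDeviceToHost);
  cudaFree(host.rbag_buf_A);
  cudaFree(host.node_buf);
  cudaFree(host.node_put);
  cudaFree(dev);
}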
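On the second patch's time64(): multiplying counter.QuadPart by 1000000000 before dividing by the frequency can overflow 64 bits once the counter grows; with the 10 MHz QueryPerformanceFrequency common on recent Windows, that happens after roughly half an hour of uptime. The hedged sketch below splits whole seconds from the remainder so the intermediate product stays in range; it is an alternative under the same intent (a fast monotonic nanosecond counter), not what the patch ships.

#ifdef _WIN32
#include <windows.h>
#endif
#include <stdint.h>
#include <stdio.h>

#ifdef _WIN32
/* Monotonic nanosecond timer sketch: divide first, then scale the remainder,
   so counter * 1e9 never has to fit in 64 bits. */
static inline uint64_t time64(void) {
  LARGE_INTEGER freq, counter;
  QueryPerformanceFrequency(&freq);   /* ticks per second, constant after boot */
  QueryPerformanceCounter(&counter);  /* monotonic tick count, unrelated to wall-clock time */
  uint64_t ticks = (uint64_t)counter.QuadPart;
  uint64_t hz    = (uint64_t)freq.QuadPart;
  uint64_t sec   = ticks / hz;
  uint64_t rem   = ticks % hz;
  return sec * 1000000000ULL + (rem * 1000000000ULL) / hz;
}
#endif

int main(void) {
#ifdef _WIN32
  printf("%llu\n", (unsigned long long)time64());
#endif
  return 0;
}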