From 9abda64272bd2c903e2ccc25263bdae92c016d6d Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 23 Aug 2023 20:56:12 +0000 Subject: [PATCH 01/72] tcpdirect initial commit --- Makefile | 28 ++- check_all_options.c | 8 + define_all_flags.c | 8 + flags.c | 4 + flow.c | 16 ++ flow.h | 17 ++ lib.h | 8 + socket.c | 4 + stream.c | 36 +++ tcpdirect.cu | 596 ++++++++++++++++++++++++++++++++++++++++++++ tcpdirect.h | 26 ++ 11 files changed, 747 insertions(+), 4 deletions(-) create mode 100644 tcpdirect.cu create mode 100644 tcpdirect.h diff --git a/Makefile b/Makefile index c3b6041..d600eec 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ all: binaries -CFLAGS = -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA +CFLAGS = -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DNDEBUG=1 -DWITH_TCPDIRECT lib := \ check_all_options.o \ @@ -48,7 +48,12 @@ lib := \ tcp_rr-objs := tcp_rr_main.o tcp_rr.o rr.o $(lib) -tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o $(lib) +tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o tcpdirect.o $(lib) + +tcp_stream-cuda-objs := tcp_stream_main_cuda.o tcp_stream.o stream.o tcpdirect.o $(lib) + +tcp_stream-cuda2-objs := tcp_stream_main.o tcp_stream.o stream.o tcpdirect.o $(lib) +# tcp_stream-cuda3-objs := tcp_stream_main.cu.o tcp_stream.o stream.o tcpdirect.o $(lib) tcp_crr-objs := tcp_crr_main.o tcp_crr.o rr.o $(lib) @@ -64,12 +69,27 @@ psp_rr-objs := psp_rr_main.o psp_rr.o rr.o psp_lib.o $(lib) ext-libs := -lm -lrt -lpthread +tcpdirect.o: tcpdirect.cu + nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDIRECT -c -o $@ $^ + +tcp_stream_main_cuda.o: tcp_stream_main.cu + nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -c -o $@ $^ + +tcp_stream_main.cu.o: tcp_stream_main.c + nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDIRECT -c -o $@ $^ + tcp_rr: $(tcp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) tcp_stream: $(tcp_stream-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) +tcp_stream_cuda2: 
$(tcp_stream-cuda2-objs) + g++ $(LDFLAGS) -o $@ $^ $(ext-libs) -lc -L/usr/local/cuda/lib64 -lcudart -lcuda + +tcp_stream_cuda: $(tcp_stream-cuda-objs) + g++ $(LDFLAGS) -o $@ $^ $(ext-libs) -lc -L/usr/local/cuda/lib64 -lcudart -lcuda + tcp_crr: $(tcp_crr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) @@ -88,7 +108,7 @@ psp_crr: $(psp_crr-objs) psp_rr: $(psp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) -binaries: tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr +binaries: tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr tcp_stream_cuda tcp_stream_cuda2 clean: - rm -f *.o tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr + rm -f *.o tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr tcp_stream_cuda tcp_stream_cuda2 diff --git a/check_all_options.c b/check_all_options.c index a5a5179..e4630bf 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -101,6 +101,14 @@ void check_options_tcp_rr(struct options *opts, struct callbacks *cb) void check_options_tcp_stream(struct options *opts, struct callbacks *cb) { + if (opts->tcpd_gpu_pci_addr) { + CHECK(cb, opts->tcpd_nic_pci_addr, + "Must provide NIC PCI address if GPU PCI address was provided."); + CHECK(cb, opts->tcpdirect_phys_len > 0, + "Must provide non-zero --tcpdirect-phys-len flag if GPU PCI address was provided."); + // TODO check page-alignment + // CHECK((CUdeviceptr)gpu_tx_mem_ % PAGE_SIZE == 0); + } } void check_options_udp_rr(struct options *opts, struct callbacks *cb) diff --git a/define_all_flags.c b/define_all_flags.c index 6f31a8e..ee7cabb 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -141,6 +141,14 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, bool, enable_write, false, 'w', "Write to flows? Enabled by default for the client"); DEFINE_FLAG(fp, bool, enable_tcp_maerts, false, 'M', "Enables TCP_MAERTS test (server writes and client reads). 
It overrides enable_read, and enable_write"); DEFINE_FLAG(fp, bool, async_connect, false, 0, "use non blocking connect"); +#ifdef WITH_TCPDIRECT + DEFINE_FLAG(fp, const char *, tcpd_nic_pci_addr, 0, 0, "NIC PCI addr, e.x. 0000:06:00.0"); + DEFINE_FLAG(fp, const char *, tcpd_gpu_pci_addr, 0, 0, "GPU PCI addr, e.x. 0000:04:00.0"); + DEFINE_FLAG(fp, unsigned long long, tcpdirect_phys_addr, 0, 0, "Set the remote memory physical address for tcpdirect, e.x. 0000:06:00.0"); + DEFINE_FLAG(fp, unsigned long long, tcpdirect_phys_len, 0, 0, "Set the remote memory length for tcpdirect"); + DEFINE_FLAG(fp, const char *, tcpdirect_src_ip, 0, 0, "Set the src ip address for tcpdirect"); + DEFINE_FLAG(fp, const char *, tcpdirect_dst_ip, 0, 0, "Set the dst ip address for tcpdirect"); +#endif /* Return the updated fp */ return (fp); diff --git a/flags.c b/flags.c index dd801d5..8d1f5d3 100644 --- a/flags.c +++ b/flags.c @@ -157,6 +157,8 @@ static void default_parser(const char *type, char *arg, void *out, *(unsigned long *)out = strtoul(arg, NULL, 0); else if (strcmp(type, "double") == 0) *(double *)out = atof(arg); + else if (strcmp(type, "unsigned long long") == 0) + *(unsigned long long *)out = strtoull(arg, NULL, 0); else LOG_ERROR(cb, "Unknown type `%s' for arg `%s'.", type, arg); } @@ -339,6 +341,8 @@ static void print_flag(const struct flag *flag, struct callbacks *cb) PRINT(cb, name, "%f", *(double *)var); else if (strcmp(type, "long long") == 0) PRINT(cb, name, "%lld", *(long long *)var); + else if (strcmp(type, "unsigned long long") == 0) + PRINT(cb, name, "%llu", *(unsigned long long *)var); else LOG_ERROR(cb, "Unknown type `%s' for variable %s", type, name); } diff --git a/flow.c b/flow.c index 97dc5ac..7dd054f 100644 --- a/flow.c +++ b/flow.c @@ -19,6 +19,9 @@ #include "socket.h" #include "thread.h" #include "stats.h" +#ifdef WITH_TCPDIRECT +#include "tcpdirect.h" +#endif /* * We define the flow struct locally to this file to force outside users to go @@ -250,6 +253,19 
@@ void flow_delete(struct flow *f) thread_clear_flow_or_die(f->f_thread, f); } +#ifdef WITH_TCPDIRECT + if (flow_thread(f)->opts->tcpd_gpu_pci_addr) { + cuda_flow_cleanup(f->f_mbuf); + } else if (flow_thread(f)->opts->tcpd_nic_pci_addr) { + struct tcpdirect_udma_mbuf *t_mbuf = (struct tcpdirect_udma_mbuf *)f->f_mbuf; + + close(t_mbuf->buf_pages); + close(t_mbuf->buf); + close(t_mbuf->memfd); + close(t_mbuf->devfd); + } +#endif + /* TODO: need to free the stat struct here for crr tests */ free(f->f_opaque); /* Right now the test is always false, but let's leave it in case diff --git a/flow.h b/flow.h index c56691a..d4488e2 100644 --- a/flow.h +++ b/flow.h @@ -26,6 +26,23 @@ struct flow; /* note: struct is defined opaquely within flow.c */ struct neper_stat; struct thread; +struct tcpdirect_udma_mbuf { + struct msghdr msg; + int dmabuf_fd; + int pages_fd; + + int devfd; + int memfd; + int buf; + int buf_pages; +}; + +struct tcpdirect_cuda_mbuf { + int gpu_mem_fd_; + int dma_buf_fd_; + void *gpu_tx_mem_; +}; + typedef void (*flow_handler)(struct flow *, uint32_t); /* Simple accessors. 
*/ diff --git a/lib.h b/lib.h index bf6fffe..07d26fb 100644 --- a/lib.h +++ b/lib.h @@ -106,6 +106,14 @@ struct options { bool async_connect; /* tcp_stream */ +#ifdef WITH_TCPDIRECT + const char *tcpd_nic_pci_addr; + const char *tcpd_gpu_pci_addr; + unsigned long long tcpdirect_phys_addr; + unsigned long long tcpdirect_phys_len; + const char *tcpdirect_src_ip; + const char *tcpdirect_dst_ip; +#endif bool enable_read; bool enable_write; bool enable_tcp_maerts; diff --git a/socket.c b/socket.c index fb6122b..55d342f 100644 --- a/socket.c +++ b/socket.c @@ -67,6 +67,10 @@ static void socket_init_not_established(struct thread *t, int s) if (err) PLOG_ERROR(t->cb, "setsockopt(SO_LINGER)"); } +#ifdef WITH_TCPDIRECT + if (opts->tcpdirect_phys_addr) + tcpdirect_setup_socket(s); +#endif } /* diff --git a/stream.c b/stream.c index d3e049a..fc973ff 100644 --- a/stream.c +++ b/stream.c @@ -23,11 +23,32 @@ #include "socket.h" #include "stats.h" #include "thread.h" +#ifdef WITH_TCPDIRECT +#include "tcpdirect.h" +#endif static void *stream_alloc(struct thread *t) { const struct options *opts = t->opts; +#ifdef WITH_TCPDIRECT + if (!t->f_mbuf && t->opts->tcpd_gpu_pci_addr) { + if (tcpdirect_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { + LOG_ERROR(t->cb, "%s: failed to setup tcpdirect CUDA socket", + __func__); + exit(1); + } + } + + if (!t->f_mbuf && t->opts->tcpd_nic_pci_addr) { + if (udmabuf_setup_alloc(t->opts, &t->f_mbuf)) { + LOG_ERROR(t->cb, "%s: failed to setup tcpdirect UDMA socket", + __func__); + exit(1); + } + } +#endif + if (!t->f_mbuf) { t->f_mbuf = malloc_or_die(opts->buffer_size, t->cb); if (opts->enable_write) @@ -85,6 +106,13 @@ void stream_handler(struct flow *f, uint32_t events) if (events & EPOLLIN) do { do { +#ifdef WITH_TCPDIRECT + if (t->opts->tcpd_nic_pci_addr) + n = tcpdirect_recv(fd, mbuf, + opts->buffer_size, + opts->recv_flags); + else +#endif n = recv(fd, mbuf, opts->buffer_size, opts->recv_flags); } while(n == -1 && errno == EINTR); @@ -102,6 +130,14 
@@ void stream_handler(struct flow *f, uint32_t events) if (events & EPOLLOUT) do { +#ifdef WITH_TCPDIRECT + if (t->opts->tcpd_gpu_pci_addr) { + n = tcpdirect_send(fd, mbuf, opts->buffer_size, opts->send_flags); + }else if (t->opts->tcpd_nic_pci_addr) { + n = tcpdirect_udma_send(fd, mbuf, + opts->buffer_size, opts->send_flags); + } else +#endif n = send(fd, mbuf, opts->buffer_size, opts->send_flags); if (n == -1) { if (errno != EAGAIN) diff --git a/tcpdirect.cu b/tcpdirect.cu new file mode 100644 index 0000000..3f0e80d --- /dev/null +++ b/tcpdirect.cu @@ -0,0 +1,596 @@ +#ifdef WITH_TCPDIRECT +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "tcpdirect.h" +#include "logging.h" +#include "flow.h" +#include "thread.h" + +#define MIN_RX_BUFFER_TOTAL_SIZE (1 << 28) +#define GPUMEM_ALIGNMENT (1UL << 21) +#define GPUMEM_MINSZ 0x400000 +#define PAGE_SHIFT (12) +#define PAGE_SIZE (1 << PAGE_SHIFT) + +#define multiplier (1 << 16) + +#define SO_DEVMEM_OFFSET 99 +#define SCM_DEVMEM_OFFSET SO_DEVMEM_OFFSET + +#define TEST_PREFIX "ncdevmem" +#define NUM_PAGES 16000 + +/* missing definitions in mman-linux.h */ +#ifndef MFD_ALLOW_SEALING +#define MFD_ALLOW_SEALING 2U +#endif + +/* GRTE libraries from google3 already define the following */ +#ifndef F_SEAL_SHRINK +#define F_SEAL_SHRINK 2U +#endif +#ifndef F_ADD_SEALS +#define F_ADD_SEALS 1033U +#endif +#ifndef F_GET_SEALS +#define F_GET_SEALS 1034U +#endif + +#define MSG_SOCK_DEVMEM 0x2000000 +#define SO_DEVMEM_DONTNEED 97 +#define SO_DEVMEM_HEADER 98 +#define SCM_DEVMEM_HEADER SO_DEVMEM_HEADER +#define SO_DEVMEM_OFFSET 99 +#define SCM_DEVMEM_OFFSET SO_DEVMEM_OFFSET + +struct dma_buf_create_pages_info { + __u64 pci_bdf[3]; + __s32 dma_buf_fd; + __s32 create_page_pool; +}; + +struct dma_buf_pages_bind_rx_queue { + char ifname[IFNAMSIZ]; + __u32 rxq_idx; +}; + +#define DMA_BUF_CREATE_PAGES \ + 
_IOW(DMA_BUF_BASE, 2, struct dma_buf_create_pages_info) + +#define DMA_BUF_PAGES_BIND_RX \ + _IOW(DMA_BUF_BASE, 3, struct dma_buf_pages_bind_rx_queue) + +// devmemvec represents a fragment of payload that is received on the socket. +struct devmemvec { + // frag_offset is the offset in the registered memory. + __u32 frag_offset; + // frag size is the size of the payload. + __u32 frag_size; + // frag_token is an identifier for this fragment and it can be used to return + // the memory back to kernel. + __u32 frag_token; +}; + +// devmemtoken represents a range of tokens. It is used to return the fragment +// memory back to the kernel. +struct devmemtoken { + __u32 token_start; + __u32 token_count; +}; + +struct udmabuf_create { + uint32_t memfd; + uint32_t flags; + uint64_t offset; + uint64_t size; +}; +#define UDMABUF_CREATE _IOW('u', 0x42, struct udmabuf_create) + +int memfd_create(const char *name, unsigned int flags) +{ + return syscall(__NR_memfd_create, name, flags); +} + +int tcpdirect_setup_socket(int socket) { + const int one = 1; + if (setsockopt(socket, SOL_SOCKET, + SO_REUSEADDR | SO_REUSEPORT | SO_ZEROCOPY, + &one, + sizeof(one))) { + perror("tcpdirect_setup_socket"); + exit(EXIT_FAILURE); + } + + return 0; +} + +int get_gpumem_dmabuf_pages_fd(const std::string& gpu_pci_addr, + const std::string& nic_pci_addr, void* gpu_mem, + size_t gpu_mem_sz, int* dma_buf_fd) { + int err, ret; + + cuMemGetHandleForAddressRange((void*)dma_buf_fd, (CUdeviceptr)gpu_mem, + gpu_mem_sz, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + 0); + + if (*dma_buf_fd < 0) { + perror("cuMemGetHandleForAddressRange() failed!: "); + exit(EXIT_FAILURE); + } + + printf("Registered dmabuf region 0x%p of %lu Bytes\n", + gpu_mem, gpu_mem_sz); + + struct dma_buf_create_pages_info frags_create_info; + frags_create_info.dma_buf_fd = *dma_buf_fd; + frags_create_info.create_page_pool = 0; + + uint16_t pci_bdf[3]; + ret = sscanf(nic_pci_addr.c_str(), "0000:%hx:%hx.%hx", &pci_bdf[0], + &pci_bdf[1], 
&pci_bdf[2]); + frags_create_info.pci_bdf[0] = pci_bdf[0]; + frags_create_info.pci_bdf[1] = pci_bdf[1]; + frags_create_info.pci_bdf[2] = pci_bdf[2]; + if (ret != 3) { + err = -EINVAL; + goto err_close_dmabuf; + } + + ret = ioctl(*dma_buf_fd, DMA_BUF_CREATE_PAGES, &frags_create_info); + if (ret < 0) { + perror("Error getting dma_buf frags: "); + err = -EIO; + goto err_close_dmabuf; + } + return ret; + +err_close_dmabuf: + close(*dma_buf_fd); + return err; +} + +int tcpdirect_setup_rx_socket(const struct options *opts, struct thread *t) +{ + int j; + char *eth_device = "eth1"; // TODO: hard-coded for now + + /* Need to trigger the NIC to reallocate its RX pages, otherwise the + * bind doesn't take effect. + */ + system("sudo ethtool --set-priv-flags eth1 enable-header-split off"); + system("sudo ethtool --set-priv-flags eth1 enable-header-split on"); + + sleep(2); + + // TODO hardcoded + for (j = 0; j < t->flow_limit; j++) { + char command[256]; + sleep(1); + + int flow_idx = (t->flow_first + t->flow_count); + int src_port = flow_idx + t->opts->source_port; + + int n = t->opts->num_ports ? 
t->opts->num_ports : 1; + int i = (t->flow_first + t->flow_count) % n; + int dst_port = atoi(t->opts->port) + i; + + // TODO hard-coded + char *src_ip = "192.169.1.6", *dst_ip = "192.168.1.4"; + + snprintf( + command, sizeof(command), + "sudo ethtool -N eth1 flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue 15", + src_ip, dst_ip, src_port, dst_port); + + printf("bound %s %i %s %i\n", src_ip, src_port, dst_ip, dst_port); + } +} + +int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) +{ + bool is_client = opts->client; + int ret; + void *gpu_tx_mem_; + int gpu_mem_fd_; + int dma_buf_fd_; + std::unique_ptr buf_; + struct tcpdirect_cuda_mbuf *tmbuf; + const char *gpu_pci_addr = opts->tcpd_gpu_pci_addr; // "0000:04:00.0" + const char *nic_pci_addr = opts->tcpd_nic_pci_addr; // "0000:06:00.0" + size_t message_size_ = 4096000; // TODO param this + size_t alloc_size = opts->tcpdirect_phys_len; // std::max(message_size_, (unsigned long)GPUMEM_MINSZ) + + tmbuf = + (struct tcpdirect_cuda_mbuf *)calloc(1, sizeof(struct tcpdirect_udma_mbuf)); + if (!tmbuf) { + exit(EXIT_FAILURE); + } + + if (alloc_size % GPUMEM_ALIGNMENT != 0) { + alloc_size += GPUMEM_ALIGNMENT - (alloc_size % GPUMEM_ALIGNMENT); + } + + cudaMalloc(&gpu_tx_mem_, alloc_size); + unsigned int flag = 1; + cuPointerSetAttribute(&flag, + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, + (CUdeviceptr)gpu_tx_mem_); + buf_.reset(new char[message_size_]); + + gpu_mem_fd_ = get_gpumem_dmabuf_pages_fd(gpu_pci_addr, nic_pci_addr, + gpu_tx_mem_, alloc_size, + &dma_buf_fd_); + + if (gpu_mem_fd_ < 0) { + printf("get_gpumem_dmabuf_pages_fd() failed!: "); + exit(71); + } + + // if (!is_client) { + // tcpdirect_setup_rx_socket(opts, t); + // } + + *f_mbuf = tmbuf; + tmbuf->gpu_mem_fd_ = gpu_mem_fd_; + tmbuf->dma_buf_fd_ = dma_buf_fd_; + tmbuf->gpu_tx_mem_ = gpu_tx_mem_; + return 0; +} + +int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) { + bool is_client = opts->client; + 
std::unique_ptr buf_; + int devfd; + int memfd; + int buf; + int buf_pages; + int ret; + size_t size = opts->tcpdirect_phys_len; + + struct tcpdirect_udma_mbuf *tmbuf; + struct dma_buf_create_pages_info pages_create_info; + struct udmabuf_create create; + + if (f_mbuf == NULL) return ENOMEM; + + if (*f_mbuf) return 0; + + tmbuf = (struct tcpdirect_udma_mbuf *)calloc(1, sizeof(struct tcpdirect_udma_mbuf)); + if (!tmbuf) { + exit(EXIT_FAILURE); + } + + devfd = open("/dev/udmabuf", O_RDWR); + if (devfd < 0) { + printf("%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", + TEST_PREFIX); + exit(70); + } + + memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); + if (memfd < 0) { + printf("%s: [skip,no-memfd]\n", TEST_PREFIX); + exit(72); + } + + ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK); + if (ret < 0) { + printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); + exit(73); + } + + ret = ftruncate(memfd, size); + if (ret == -1) { + printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); + exit(74); + } + + memset(&create, 0, sizeof(create)); + + create.memfd = memfd; + create.offset = 0; + create.size = size; + printf("size=%lu\n", size); + buf = ioctl(devfd, UDMABUF_CREATE, &create); + if (buf < 0) { + printf("%s: [FAIL, create udmabuf]\n", TEST_PREFIX); + exit(75); + } + + pages_create_info.dma_buf_fd = buf; + pages_create_info.create_page_pool = is_client ? 
0 : 1; + + /* TODO: hardcoded NIC pci address */ + // "0000:06:00.0" + ret = sscanf(opts->tcpd_nic_pci_addr, "0000:%llx:%llx.%llx", + &pages_create_info.pci_bdf[0], + &pages_create_info.pci_bdf[1], + &pages_create_info.pci_bdf[2]); + + if (ret != 3) { + printf("%s: [FAIL, parse fail]\n", TEST_PREFIX); + exit(76); + } + + buf_pages = ioctl(buf, DMA_BUF_CREATE_PAGES, &pages_create_info); + if (buf_pages < 0) { + perror("ioctl DMA_BUF_CREATE_PAGES: [FAIL, create pages fail]\n"); + exit(77); + } + + if (!is_client) { + /* TODO hardcoded num_queues */ + int num_queues = 15; + struct dma_buf_pages_bind_rx_queue bind_cmd; + + strcpy(bind_cmd.ifname, "eth1"); + bind_cmd.rxq_idx = num_queues; + + ret = ioctl(buf_pages, DMA_BUF_PAGES_BIND_RX, &bind_cmd); + if (ret < 0) { + printf("%s: [FAIL, bind fail queue=%d]\n", TEST_PREFIX, + num_queues); + exit(78); + } + + // tcpdirect_setup_rx_socket(opts, t); + } + + struct dma_buf_sync sync = { 0 }; + sync.flags = DMA_BUF_SYNC_WRITE | DMA_BUF_SYNC_START; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + *f_mbuf = tmbuf; + + tmbuf->devfd = devfd; + tmbuf->memfd = memfd; + tmbuf->buf = buf; + tmbuf->buf_pages = buf_pages; + return 0; +} + +int tcpdirect_udma_send(int socket, void *f_mbuf, size_t n, int flags) { + int buf_pages, buf; + struct iovec iov; + struct msghdr *msg; + struct cmsghdr *cmsg; + char buf_dummy[n]; + char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; + struct tcpdirect_udma_mbuf *tmbuf; + + if (!f_mbuf) return -1; + + tmbuf = (struct tcpdirect_udma_mbuf *)f_mbuf; + buf_pages = tmbuf->buf_pages; + buf = tmbuf->buf; + msg = &tmbuf->msg; + + struct dma_buf_sync sync = { 0 }; + sync.flags = DMA_BUF_SYNC_WRITE | DMA_BUF_SYNC_START; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + char *buf_mem = NULL; + buf_mem = (char *)mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_SHARED, buf, 0); + if (buf_mem == MAP_FAILED) { + perror("mmap()"); + exit(1); + } + + memcpy(buf_mem, buf_dummy, n); + + sync.flags = DMA_BUF_SYNC_WRITE | 
DMA_BUF_SYNC_END; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + munmap(buf_mem, n); + + memset(msg, 0, sizeof(struct msghdr)); + // memset(cmsg, 0, sizeof(struct cmsghdr)); + + iov.iov_base = buf_dummy; + iov.iov_len = n; + + msg->msg_iov = &iov; + msg->msg_iovlen = 1; + + msg->msg_control = offsetbuf; + msg->msg_controllen = sizeof(offsetbuf); + + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_OFFSET; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * 2); + *((int*)CMSG_DATA(cmsg)) = buf_pages; + ((int*)CMSG_DATA(cmsg))[1] = 0; + + ssize_t bytes_sent = sendmsg(socket, msg, MSG_ZEROCOPY); + if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) { + perror("sendmsg() error: "); + exit(EXIT_FAILURE); + } + + if (bytes_sent == 0) { + perror("sendmsg() sent 0 bytes. Something is wrong.\n"); + exit(EXIT_FAILURE); + } + + return bytes_sent; +} + +int tcpdirect_send(int socket, void *buf, size_t n, int flags) { + int gpu_mem_fd_; + struct iovec iov; + struct msghdr *msg; + struct cmsghdr *cmsg; + char buf_dummy[n]; + char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; + struct tcpdirect_udma_mbuf *tmbuf; + + if (!buf) return -1; + + tmbuf = (struct tcpdirect_udma_mbuf *)buf; + gpu_mem_fd_ = tmbuf->pages_fd; + msg = &tmbuf->msg; + + memset(msg, 0, sizeof(struct msghdr)); + // memset(cmsg, 0, sizeof(struct cmsghdr)); + + iov.iov_base = buf_dummy; + iov.iov_len = n; + + msg->msg_iov = &iov; + msg->msg_iovlen = 1; + + msg->msg_control = offsetbuf; + msg->msg_controllen = sizeof(offsetbuf); + + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_OFFSET; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * 2); + *((int*)CMSG_DATA(cmsg)) = gpu_mem_fd_; + + ssize_t bytes_sent = sendmsg(socket, msg, MSG_ZEROCOPY | MSG_DONTWAIT); + if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) { + perror("sendmsg() error: "); + exit(EXIT_FAILURE); + } + + if (bytes_sent == 0) { + perror("sendmsg() sent 0 bytes. 
Something is wrong.\n"); + exit(EXIT_FAILURE); + } + + return bytes_sent; +} + +int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { + struct iovec iov; + struct msghdr msg_local; + struct msghdr *msg; + struct tcpdirect_udma_mbuf *tmbuf; + int buf, ret, client_fd; + size_t total_received = 0; + + if (!f_mbuf) return -1; + + tmbuf = (struct tcpdirect_udma_mbuf *)f_mbuf; + + buf = tmbuf->buf; + client_fd = socket; + + char buf_dummy[n]; + // char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 128)]; + char offsetbuf[CMSG_SPACE(sizeof(int) * 1000)]; + msg = &msg_local; + + memset(msg, 0, sizeof(struct msghdr)); + + iov.iov_base = buf_dummy; + iov.iov_len = n; + msg->msg_iov = &iov; + msg->msg_iovlen = 1; + + msg->msg_control = offsetbuf; + msg->msg_controllen = sizeof(offsetbuf); + + char *buf_mem = NULL; + + if (msg->msg_flags & MSG_CTRUNC) { + printf("fatal, cmsg truncated, current msg_controllen\n"); + } + + ssize_t received = recvmsg(socket, msg, MSG_SOCK_DEVMEM | MSG_DONTWAIT); + if (received < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + } else if (received < 0) { + printf("%s %d\n", __func__, __LINE__); + return -1; + } else if (received == 0) { + printf("Client exited\n"); + } + + struct cmsghdr *cm = NULL; + struct devmemvec *devmemvec = NULL; + for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) { + if (cm->cmsg_level != SOL_SOCKET || + (cm->cmsg_type != SCM_DEVMEM_OFFSET && + cm->cmsg_type != SCM_DEVMEM_HEADER)) { + continue; + } + + devmemvec = (struct devmemvec *)CMSG_DATA(cm); + + if (cm->cmsg_type == SCM_DEVMEM_HEADER) { + // TODO: process data copied from skb's linear + // buffer. + fprintf(stderr, "\n\nSCM_DEVMEM_HEADER. 
devmemvec->frag_size=%u\n", + devmemvec->frag_size); + exit(1); + } + + struct devmemtoken token = { devmemvec->frag_token, 1 }; + + // struct dma_buf_sync sync = { 0 }; + // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; + // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + // buf_mem = (char *)mmap(NULL, n, PROT_READ | PROT_WRITE, + // MAP_SHARED, buf, 0); + // if (buf_mem == MAP_FAILED) { + // perror("mmap()"); + // exit(1); + // } + total_received += devmemvec->frag_size; + printf("\n\nreceived frag_page=%u, in_page_offset=%u," + " frag_offset=%u, frag_size=%u, token=%u" + " total_received=%lu\n", + devmemvec->frag_offset >> PAGE_SHIFT, + devmemvec->frag_offset % PAGE_SIZE, + devmemvec->frag_offset, devmemvec->frag_size, + devmemvec->frag_token, + total_received); + + // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; + // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + // ret = setsockopt(client_fd, SOL_SOCKET, + // SO_DEVMEM_DONTNEED, &token, + // sizeof(token)); + // if (ret) { + // perror("DONTNEED failed"); + // exit(1); + // } + + // munmap(buf_mem, n); + + return total_received; + } + return 0; +} + +int cuda_flow_cleanup(void *f_mbuf) { + struct tcpdirect_cuda_mbuf *t_mbuf = (struct tcpdirect_cuda_mbuf *)f_mbuf; + + close(t_mbuf->gpu_mem_fd_); + close(t_mbuf->dma_buf_fd_); + cudaFree(t_mbuf->gpu_tx_mem_); + return 0; +} +#endif /* #ifdef WITH_TCPDIRECT */ \ No newline at end of file diff --git a/tcpdirect.h b/tcpdirect.h new file mode 100644 index 0000000..943c8d0 --- /dev/null +++ b/tcpdirect.h @@ -0,0 +1,26 @@ +#ifndef THIRD_PARTY_NEPER_TCPDIRECT_H_ +#define THIRD_PARTY_NEPER_TCPDIRECT_H_ + +#if __cplusplus +extern "C" { +#endif + +#include + +#include "common.h" +#include "flags.h" +#include "lib.h" + +int tcpdirect_setup_socket(int socket); +int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t); +int cuda_flow_cleanup(void *f_mbuf); +int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf); +int 
tcpdirect_send(int socket, void *buf, size_t n, int flags); +int tcpdirect_udma_send(int fd, void *buf, size_t n, int flags); +int tcpdirect_recv(int fd, void *f_mbuf, size_t n, int flags); + +#if __cplusplus +} +#endif + +#endif // THIRD_PARTY_NEPER_TCPDIRECT_H_ From 1ceae04ff4de0ff945b88c7fb680f3df09f473fb Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Fri, 25 Aug 2023 17:28:52 +0000 Subject: [PATCH 02/72] tcpdirect: discard frags and bind cuda rx bufs --- socket.c | 5 ++++- tcpdirect.cu | 33 ++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/socket.c b/socket.c index 55d342f..b252a52 100644 --- a/socket.c +++ b/socket.c @@ -18,6 +18,9 @@ #include "flow.h" #include "socket.h" #include "thread.h" +#ifdef WITH_TCPDIRECT +#include "tcpdirect.h" +#endif #ifndef NO_LIBNUMA #include "third_party/libnuma/numa.h" @@ -68,7 +71,7 @@ static void socket_init_not_established(struct thread *t, int s) PLOG_ERROR(t->cb, "setsockopt(SO_LINGER)"); } #ifdef WITH_TCPDIRECT - if (opts->tcpdirect_phys_addr) + if (opts->tcpd_nic_pci_addr) tcpdirect_setup_socket(s); #endif } diff --git a/tcpdirect.cu b/tcpdirect.cu index 3f0e80d..ffd6f23 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -244,9 +244,21 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct exit(71); } - // if (!is_client) { - // tcpdirect_setup_rx_socket(opts, t); - // } + if (!is_client) { + /* TODO hardcoded num_queues */ + int num_queues = 15; + struct dma_buf_pages_bind_rx_queue bind_cmd; + + strcpy(bind_cmd.ifname, "eth1"); + bind_cmd.rxq_idx = num_queues; + + ret = ioctl(gpu_mem_fd_, DMA_BUF_PAGES_BIND_RX, &bind_cmd); + if (ret < 0) { + printf("%s: [FAIL, bind fail queue=%d]\n", TEST_PREFIX, + num_queues); + exit(78); + } + } *f_mbuf = tmbuf; tmbuf->gpu_mem_fd_ = gpu_mem_fd_; @@ -257,7 +269,6 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) 
{ bool is_client = opts->client; - std::unique_ptr buf_; int devfd; int memfd; int buf; @@ -570,13 +581,13 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); - // ret = setsockopt(client_fd, SOL_SOCKET, - // SO_DEVMEM_DONTNEED, &token, - // sizeof(token)); - // if (ret) { - // perror("DONTNEED failed"); - // exit(1); - // } + ret = setsockopt(client_fd, SOL_SOCKET, + SO_DEVMEM_DONTNEED, &token, + sizeof(token)); + if (ret) { + perror("DONTNEED failed"); + exit(1); + } // munmap(buf_mem, n); From 15b5cee2ffb145857ad4e33bfa8ed3db5edd974e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 12 Sep 2023 18:08:45 +0000 Subject: [PATCH 03/72] tcpd: create page pool for host --- tcpdirect.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index ffd6f23..ff76524 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -141,7 +141,7 @@ int get_gpumem_dmabuf_pages_fd(const std::string& gpu_pci_addr, struct dma_buf_create_pages_info frags_create_info; frags_create_info.dma_buf_fd = *dma_buf_fd; - frags_create_info.create_page_pool = 0; + frags_create_info.create_page_pool = is_client ? 
0 : 1; uint16_t pci_bdf[3]; ret = sscanf(nic_pci_addr.c_str(), "0000:%hx:%hx.%hx", &pci_bdf[0], From 82737ea374ba1d35a0036d25662d37daa9e4f287 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 12 Sep 2023 21:40:47 +0000 Subject: [PATCH 04/72] tcpd: create page_pool for cuda host --- tcpdirect.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index ff76524..50743cc 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -124,7 +124,7 @@ int tcpdirect_setup_socket(int socket) { int get_gpumem_dmabuf_pages_fd(const std::string& gpu_pci_addr, const std::string& nic_pci_addr, void* gpu_mem, - size_t gpu_mem_sz, int* dma_buf_fd) { + size_t gpu_mem_sz, int* dma_buf_fd, bool is_client) { int err, ret; cuMemGetHandleForAddressRange((void*)dma_buf_fd, (CUdeviceptr)gpu_mem, @@ -237,7 +237,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct gpu_mem_fd_ = get_gpumem_dmabuf_pages_fd(gpu_pci_addr, nic_pci_addr, gpu_tx_mem_, alloc_size, - &dma_buf_fd_); + &dma_buf_fd_, is_client); if (gpu_mem_fd_ < 0) { printf("get_gpumem_dmabuf_pages_fd() failed!: "); From 23cb22a1f3eaab0558166433fb5a24b4ede66359 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 12 Sep 2023 23:21:20 +0000 Subject: [PATCH 05/72] tcpd: specify link to use with cuda tcpdirect --- define_all_flags.c | 1 + lib.h | 1 + tcpdirect.cu | 45 ++------------------------------------------- 3 files changed, 4 insertions(+), 43 deletions(-) diff --git a/define_all_flags.c b/define_all_flags.c index ee7cabb..ee2eb94 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -148,6 +148,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, unsigned long long, tcpdirect_phys_len, 0, 0, "Set the remote memory length for tcpdirect"); DEFINE_FLAG(fp, const char *, tcpdirect_src_ip, 0, 0, "Set the src ip address for tcpdirect"); DEFINE_FLAG(fp, const char *, tcpdirect_dst_ip, 0, 0, "Set the dst ip address for 
tcpdirect"); + DEFINE_FLAG(fp, const char *, tcpdirect_link_name, "eth1", 0, "Link name to bind DMA buffer_pages for Rx"); #endif /* Return the updated fp */ diff --git a/lib.h b/lib.h index 07d26fb..87e4f1c 100644 --- a/lib.h +++ b/lib.h @@ -113,6 +113,7 @@ struct options { unsigned long long tcpdirect_phys_len; const char *tcpdirect_src_ip; const char *tcpdirect_dst_ip; + const char *tcpdirect_link_name; #endif bool enable_read; bool enable_write; diff --git a/tcpdirect.cu b/tcpdirect.cu index 50743cc..356c8b5 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -167,43 +167,6 @@ err_close_dmabuf: return err; } -int tcpdirect_setup_rx_socket(const struct options *opts, struct thread *t) -{ - int j; - char *eth_device = "eth1"; // TODO: hard-coded for now - - /* Need to trigger the NIC to reallocate its RX pages, otherwise the - * bind doesn't take effect. - */ - system("sudo ethtool --set-priv-flags eth1 enable-header-split off"); - system("sudo ethtool --set-priv-flags eth1 enable-header-split on"); - - sleep(2); - - // TODO hardcoded - for (j = 0; j < t->flow_limit; j++) { - char command[256]; - sleep(1); - - int flow_idx = (t->flow_first + t->flow_count); - int src_port = flow_idx + t->opts->source_port; - - int n = t->opts->num_ports ? 
t->opts->num_ports : 1; - int i = (t->flow_first + t->flow_count) % n; - int dst_port = atoi(t->opts->port) + i; - - // TODO hard-coded - char *src_ip = "192.169.1.6", *dst_ip = "192.168.1.4"; - - snprintf( - command, sizeof(command), - "sudo ethtool -N eth1 flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue 15", - src_ip, dst_ip, src_port, dst_port); - - printf("bound %s %i %s %i\n", src_ip, src_port, dst_ip, dst_port); - } -} - int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) { bool is_client = opts->client; @@ -249,7 +212,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct int num_queues = 15; struct dma_buf_pages_bind_rx_queue bind_cmd; - strcpy(bind_cmd.ifname, "eth1"); + strcpy(bind_cmd.ifname, opts->tcpdirect_link_name); bind_cmd.rxq_idx = num_queues; ret = ioctl(gpu_mem_fd_, DMA_BUF_PAGES_BIND_RX, &bind_cmd); @@ -361,8 +324,6 @@ int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) { num_queues); exit(78); } - - // tcpdirect_setup_rx_socket(opts, t); } struct dma_buf_sync sync = { 0 }; @@ -590,10 +551,8 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { } // munmap(buf_mem, n); - - return total_received; } - return 0; + return total_received; } int cuda_flow_cleanup(void *f_mbuf) { From 342846fe17da6a71302671575a97d6eb24995aa9 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 14 Sep 2023 00:27:58 +0000 Subject: [PATCH 06/72] tcpd: add multi_neper.py Usage: Tx ./multi_neper.py --client --hosts 192.168.1.46 \ --devices eth1 --buffer-size 409600 \ --flows 1 --threads 1 --length 10 Rx ./multi_neper.py --hosts 192.168.1.46 \ --devices eth1 --src-ips 192.168.1.47 \ --flows 1 --threads 1 --length 10 \ --buffer-size 409600 ./multi_neper.py -h to view other flags --- multi_neper.py | 176 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100755 multi_neper.py diff --git a/multi_neper.py 
b/multi_neper.py new file mode 100755 index 0000000..7ca4c3f --- /dev/null +++ b/multi_neper.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 + +import argparse, sys, os, subprocess +from logging import debug,info,warning,error,critical,basicConfig + +parser=argparse.ArgumentParser() + +link_to_gpu_pci_addr = { + "eth1": "0000:04:00.0", # GPU0 + "eth2": "0000:0a:00.0", # GPU2 + "eth3": "0000:84:00.0", # GPU4 + "eth4": "0000:8a:00.0" # GPU6 +} + +link_to_nic_pci_addr = { + "eth1": "0000:06:00.0", + "eth2": "0000:0c:00.0", + "eth3": "0000:86:00.0", + "eth4": "0000:8c:00.0" +} + +# adds flow-steering rules, e.x. +# ethtool -N eth1 flow-type tcp4 ... +def install_flow_steer_rules(dev, threads: int, src_port, port, src_ip, dst_ip)->list: + subprocesses, rules = [], [] + + for i in range(threads): + flow_steering_cmd = f"ethtool -N {dev} flow-type tcp4 src-ip {src_ip} dst-ip {dst_ip} src-port {src_port + i} dst-port {port} queue 15" + sp = subprocess.run(flow_steering_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + subprocesses.append(sp) + + line = sp.stdout.strip() + # the expected output will be similar to: + # "Added rule with ID 19989" + if "Added rule with ID" in line: + rule = line.split()[-1] + debug(f"[{dev}] added rule {rule}: {src_ip} {dst_ip} {src_port + i} {port}") + rules.append(rule) + + return rules + + +# deletes flow-steering rules, given a list of rules and a link name +def del_flow_steer_rules(dev: str, rules: list): + for rule in rules: + del_cmd = f"ethtool -N {dev} delete {rule}" + debug(f"[{dev}] deleting rule {rule}") + subprocess.run(del_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + + +def build_neper_cmd(neper_dir: str, is_client: bool, link: str, + threads: int, flows: int, + cpu_list, buffer_size: int, phys_len: int, + nic_pci: str, gpu_pci: str, + control_port, source_port, port, length, host_ip=None)->str: + + # TODO tcp_stream_cuda2 -> tcp_stream eventually + cmd = (f"taskset --cpu-list 
{cpu_list} {neper_dir}/tcp_stream_cuda2 -T {threads} -F {flows} --tcpdirect-phys-len {phys_len}" + f" --port {port} --source-port {source_port} --control-port {control_port}" + f" --buffer-size {buffer_size} --tcpd-nic-pci-addr {nic_pci} --tcpd-gpu-pci-addr {gpu_pci} -l {length}") + + if is_client: + cmd += f" -c -H {host_ip}" + else: + cmd += f" --tcpdirect-link-name {dev}" + + return cmd + +# returns a CPU range for taskset +# e.x. returns 4-7 provided 0, 4, 1 as arguments +def get_cpu_range(starting_cpu:int, interval: int, idx: int)->str: + cpu_start = idx * interval + starting_cpu + cpu_end = cpu_start + interval - 1 + return f"{cpu_start}-{cpu_end}" + +def run_cmds(cmds: list)->list: + sp_list = [] + for cmd in cmds: + popen = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + sp_list.append(popen) + + return sp_list + +def parse_subprocess_outputs(subprocesses): + output_dicts = [] + + for sp in subprocesses: + cur_hash = dict() + + sp.wait() + for line in sp.stdout.read().split("\n"): + stripped_line = line.strip() + if "=" in stripped_line: + parsed_line = stripped_line.split("=") + cur_hash[parsed_line[0]] = parsed_line[1] + if cur_hash: + output_dicts.append(cur_hash) + + return output_dicts + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--neper-dir", help="directory containing Neper binaries", default="/tmp/neper") + parser.add_argument("--threads", help="number of threads per Neper instance", default="4", type=int) + parser.add_argument("--flows", help="number of flows per Neper instance", default="4", type=int) + parser.add_argument("--source-port", default="12345", type=int) + parser.add_argument("--port", default="12345", type=int) + parser.add_argument("--control-port", default="12866", type=int) + parser.add_argument("--devices", help="comma-delimited list of links to run Neper on, i.e. 
eth1,eth2,eth3", + default="eth1") + parser.add_argument("--phys-len", default=4294967296) + parser.add_argument("--buffer-size", default=4096*120) + + parser.add_argument("-c", "--client", action="store_true") + parser.add_argument("--src-ips", required="--client" not in sys.argv and "-c" not in sys.argv, + help="required for Host to install/remove flow-steering rules, comma-delimited list of client IP addresses") + parser.add_argument("-H", "--hosts", required=True, + help="comma-delimited list of host IP addresses") + + parser.add_argument("-l", "--length", default=10) + parser.add_argument("--log", default="WARNING") + + args = parser.parse_args() + + basicConfig(level=args.log.upper()) + + devices = args.devices.split(",") + hosts = args.hosts.split(",") + + dev_to_rule = dict() + # setup flow_steering rules + if not args.client: + info("setting up flow-steering rules") + src_ips = args.src_ips.split(",") + + for i in range(len(devices)): + control_port = args.control_port + i + starting_port = i * args.threads + args.source_port + dev = devices[i] + src_ip, dst_ip = src_ips[i], hosts[i] + + rules = install_flow_steer_rules(dev, args.threads, starting_port, args.port, src_ip, dst_ip) + dev_to_rule[dev] = rules + + cmds = [] + debug(f"running on {devices}") + for i, dev in enumerate(devices): + nic_pci = link_to_nic_pci_addr[dev] + gpu_pci = link_to_gpu_pci_addr[dev] + + ctrl_port = int(args.control_port) + i + src_port = int(args.source_port) + i*int(args.flows) + is_client = args.client + host_ip = hosts[i] if is_client else None + cpu_range = get_cpu_range(2, 3, i) + + cmd = build_neper_cmd(args.neper_dir, is_client, dev, + args.threads, args.flows, cpu_range, args.buffer_size, + args.phys_len, nic_pci, gpu_pci, + ctrl_port, src_port, args.port, args.length, host_ip) + + cmds.append(cmd) + + debug(cmds) + sp_list = run_cmds(cmds) + debug("parsing subprocesses outputs") + for i in parse_subprocess_outputs(sp_list): + if not args.client: + debug(f"throughput 
(Mb/s): {i['throughput']}") + + # delete flow-steering rules + if not args.client: + info("deleting flow-steering rules") + for dev in dev_to_rule: + del_flow_steer_rules(dev, dev_to_rule[dev]) From d70af0050da5b907becc4ee0b6a1f64d841f96c6 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 14 Sep 2023 00:40:44 +0000 Subject: [PATCH 07/72] tcpd-multi:print throughputs of each link --- multi_neper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/multi_neper.py b/multi_neper.py index 7ca4c3f..5e46378 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -165,9 +165,9 @@ def parse_subprocess_outputs(subprocesses): debug(cmds) sp_list = run_cmds(cmds) debug("parsing subprocesses outputs") - for i in parse_subprocess_outputs(sp_list): + for dev, i in zip(devices, parse_subprocess_outputs(sp_list)): if not args.client: - debug(f"throughput (Mb/s): {i['throughput']}") + print(f"[{dev}] Throughput (Mb/s): {i['throughput']}") # delete flow-steering rules if not args.client: From cebb6612a550ad41cbc99e4093ae0def13eaecfd Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Fri, 15 Sep 2023 02:01:17 +0000 Subject: [PATCH 08/72] tcpd: force device index when allocating CUDA bufs set CUDA_VISIBLE_DEVICES for each Neper call, and call cudaSetDevice to force cudaMalloc to allocate buffers on the correct GPU --- define_all_flags.c | 1 + lib.h | 1 + multi_neper.py | 27 ++++++++++++++++++--------- tcpdirect.cu | 6 ++++++ 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/define_all_flags.c b/define_all_flags.c index ee2eb94..b9ace92 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -149,6 +149,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, const char *, tcpdirect_src_ip, 0, 0, "Set the src ip address for tcpdirect"); DEFINE_FLAG(fp, const char *, tcpdirect_dst_ip, 0, 0, "Set the dst ip address for tcpdirect"); DEFINE_FLAG(fp, const char *, tcpdirect_link_name, "eth1", 0, "Link name to bind DMA 
buffer_pages for Rx"); + DEFINE_FLAG(fp, int, tcpdirect_gpu_idx, 1, 0, "GPU index provided to cudaSetDevice"); #endif /* Return the updated fp */ diff --git a/lib.h b/lib.h index 87e4f1c..8c11f91 100644 --- a/lib.h +++ b/lib.h @@ -114,6 +114,7 @@ struct options { const char *tcpdirect_src_ip; const char *tcpdirect_dst_ip; const char *tcpdirect_link_name; + int tcpdirect_gpu_idx; #endif bool enable_read; bool enable_write; diff --git a/multi_neper.py b/multi_neper.py index 5e46378..a7440ea 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -19,6 +19,13 @@ "eth4": "0000:8c:00.0" } +link_to_gpu_index = { + "eth1": "0", + "eth2": "2", + "eth3": "4", + "eth4": "6" +} + # adds flow-steering rules, e.x. # ethtool -N eth1 flow-type tcp4 ... def install_flow_steer_rules(dev, threads: int, src_port, port, src_ip, dst_ip)->list: @@ -47,8 +54,8 @@ def del_flow_steer_rules(dev: str, rules: list): debug(f"[{dev}] deleting rule {rule}") subprocess.run(del_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - -def build_neper_cmd(neper_dir: str, is_client: bool, link: str, +# returns a 2-tuple of a Neper command and a dict of env vars +def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, threads: int, flows: int, cpu_list, buffer_size: int, phys_len: int, nic_pci: str, gpu_pci: str, @@ -56,15 +63,17 @@ def build_neper_cmd(neper_dir: str, is_client: bool, link: str, # TODO tcp_stream_cuda2 -> tcp_stream eventually cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream_cuda2 -T {threads} -F {flows} --tcpdirect-phys-len {phys_len}" - f" --port {port} --source-port {source_port} --control-port {control_port}" + f" --port {port} --source-port {source_port} --control-port {control_port} --tcpdirect-gpu-idx {link_to_gpu_index[dev]}" f" --buffer-size {buffer_size} --tcpd-nic-pci-addr {nic_pci} --tcpd-gpu-pci-addr {gpu_pci} -l {length}") + env = None if is_client: cmd += f" -c -H {host_ip}" else: - cmd += f" --tcpdirect-link-name {dev}" + cmd = cmd + 
f" --tcpdirect-link-name {dev}" + env = {"CUDA_VISIBLE_DEVICES": link_to_gpu_index[dev]} - return cmd + return (cmd, env) # returns a CPU range for taskset # e.x. returns 4-7 provided 0, 4, 1 as arguments @@ -75,8 +84,8 @@ def get_cpu_range(starting_cpu:int, interval: int, idx: int)->str: def run_cmds(cmds: list)->list: sp_list = [] - for cmd in cmds: - popen = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + for cmd, env in cmds: + popen = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, env=env) sp_list.append(popen) return sp_list @@ -155,12 +164,12 @@ def parse_subprocess_outputs(subprocesses): host_ip = hosts[i] if is_client else None cpu_range = get_cpu_range(2, 3, i) - cmd = build_neper_cmd(args.neper_dir, is_client, dev, + cmd_env = build_neper_cmd(args.neper_dir, is_client, dev, args.threads, args.flows, cpu_range, args.buffer_size, args.phys_len, nic_pci, gpu_pci, ctrl_port, src_port, args.port, args.length, host_ip) - cmds.append(cmd) + cmds.append(cmd_env) debug(cmds) sp_list = run_cmds(cmds) diff --git a/tcpdirect.cu b/tcpdirect.cu index 356c8b5..449912a 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -191,6 +191,12 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct alloc_size += GPUMEM_ALIGNMENT - (alloc_size % GPUMEM_ALIGNMENT); } + ret = cudaSetDevice(opts->tcpdirect_gpu_idx); + if (ret != 0) { + printf("cudaSetDevice failed: index %i", opts->tcpdirect_gpu_idx); + exit(70); + } + cudaMalloc(&gpu_tx_mem_, alloc_size); unsigned int flag = 1; cuPointerSetAttribute(&flag, From 67e41db2d1f8f23c7f81f4ec02da883513216a12 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 19 Sep 2023 21:42:35 +0000 Subject: [PATCH 09/72] tcpd: attempt at some basic data validation --- tcpdirect.cu | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index 449912a..01111bb 100644 
--- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -415,37 +415,38 @@ int tcpdirect_udma_send(int socket, void *f_mbuf, size_t n, int flags) { int tcpdirect_send(int socket, void *buf, size_t n, int flags) { int gpu_mem_fd_; struct iovec iov; - struct msghdr *msg; + struct msghdr msg = {0}; struct cmsghdr *cmsg; char buf_dummy[n]; char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; - struct tcpdirect_udma_mbuf *tmbuf; + struct tcpdirect_cuda_mbuf *tmbuf; if (!buf) return -1; - tmbuf = (struct tcpdirect_udma_mbuf *)buf; - gpu_mem_fd_ = tmbuf->pages_fd; - msg = &tmbuf->msg; + tmbuf = (struct tcpdirect_cuda_mbuf *)buf; + gpu_mem_fd_ = tmbuf->gpu_mem_fd_; + void *gpu_tx_mem_ = tmbuf->gpu_tx_mem_; + + cudaMemset(gpu_tx_mem_, 'a', n); - memset(msg, 0, sizeof(struct msghdr)); // memset(cmsg, 0, sizeof(struct cmsghdr)); iov.iov_base = buf_dummy; iov.iov_len = n; - msg->msg_iov = &iov; - msg->msg_iovlen = 1; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; - msg->msg_control = offsetbuf; - msg->msg_controllen = sizeof(offsetbuf); + msg.msg_control = offsetbuf; + msg.msg_controllen = sizeof(offsetbuf); - cmsg = CMSG_FIRSTHDR(msg); + cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_level = SOL_SOCKET; cmsg->cmsg_type = SCM_DEVMEM_OFFSET; cmsg->cmsg_len = CMSG_LEN(sizeof(int) * 2); *((int*)CMSG_DATA(cmsg)) = gpu_mem_fd_; - ssize_t bytes_sent = sendmsg(socket, msg, MSG_ZEROCOPY | MSG_DONTWAIT); + ssize_t bytes_sent = sendmsg(socket, &msg, MSG_ZEROCOPY | MSG_DONTWAIT); if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) { perror("sendmsg() error: "); exit(EXIT_FAILURE); @@ -463,15 +464,14 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { struct iovec iov; struct msghdr msg_local; struct msghdr *msg; - struct tcpdirect_udma_mbuf *tmbuf; + struct tcpdirect_cuda_mbuf *tmbuf; int buf, ret, client_fd; size_t total_received = 0; if (!f_mbuf) return -1; - tmbuf = (struct tcpdirect_udma_mbuf *)f_mbuf; + tmbuf = (struct tcpdirect_cuda_mbuf *)f_mbuf; - buf = tmbuf->buf; client_fd = 
socket; char buf_dummy[n]; @@ -545,6 +545,13 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { devmemvec->frag_token, total_received); + char mybuf[devmemvec->frag_size]; + cudaMemcpy(mybuf, + (char *)tmbuf->gpu_tx_mem_ + devmemvec->frag_offset, + devmemvec->frag_size, + cudaMemcpyDeviceToHost); + printf("cudaFrag: %.25s\n", mybuf); + // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); From d05bfb57dbd01b17e4c5cf3031d691413036e82f Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 19 Sep 2023 22:52:43 +0000 Subject: [PATCH 10/72] tcpd: toggle header-split on Rx --- tcpdirect.cu | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index 01111bb..6f10032 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -191,11 +191,12 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct alloc_size += GPUMEM_ALIGNMENT - (alloc_size % GPUMEM_ALIGNMENT); } - ret = cudaSetDevice(opts->tcpdirect_gpu_idx); - if (ret != 0) { - printf("cudaSetDevice failed: index %i", opts->tcpdirect_gpu_idx); - exit(70); - } + // unnecessary if CUDA_VISIBLE_DEVICES env var is set + // ret = cudaSetDevice(opts->tcpdirect_gpu_idx); + // if (ret != 0) { + // printf("cudaSetDevice failed: index %i", opts->tcpdirect_gpu_idx); + // exit(70); + // } cudaMalloc(&gpu_tx_mem_, alloc_size); unsigned int flag = 1; @@ -216,6 +217,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct if (!is_client) { /* TODO hardcoded num_queues */ int num_queues = 15; + printf("Bind to queue %i\n", num_queues); struct dma_buf_pages_bind_rx_queue bind_cmd; strcpy(bind_cmd.ifname, opts->tcpdirect_link_name); @@ -330,6 +332,12 @@ int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) { num_queues); exit(78); } + + system("ethtool --set-priv-flags eth1 enable-header-split on"); + system("ethtool --set-priv-flags eth1 enable-header-split 
off"); + system("ethtool --set-priv-flags eth1 enable-header-split on"); + sleep(1); + printf("toggled header-split\n"); } struct dma_buf_sync sync = { 0 }; From 8fd9b35b50acbf7b2c17afbbd6b357d84c9e9aa7 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 20 Sep 2023 00:12:26 +0000 Subject: [PATCH 11/72] tcpd: toggle header-split --- tcpdirect.cu | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index 6f10032..27f7c29 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -220,7 +220,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct printf("Bind to queue %i\n", num_queues); struct dma_buf_pages_bind_rx_queue bind_cmd; - strcpy(bind_cmd.ifname, opts->tcpdirect_link_name); + strcpy(bind_cmd.ifname, "eth1"); // opts->tcpdirect_link_name bind_cmd.rxq_idx = num_queues; ret = ioctl(gpu_mem_fd_, DMA_BUF_PAGES_BIND_RX, &bind_cmd); @@ -229,6 +229,13 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct num_queues); exit(78); } + + system("ethtool --set-priv-flags eth1 enable-strict-header-split on"); + system("ethtool --set-rxfh-indir eth4 equal 8"); + system("ethtool -N eth1 flow-type tcp4 src-ip 192.168.1.198 dst-ip 192.168.1.46 src-port 12345 dst-port 12345 queue 15"); + printf("sleeping 1...\n"); + sleep(1); + printf("toggled header-split\n"); } *f_mbuf = tmbuf; From 62778410fd4ecf2246bc5f89d387ca52a1d21fa7 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 20 Sep 2023 00:20:00 +0000 Subject: [PATCH 12/72] tcpd: minor fix --- tcpdirect.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index 27f7c29..5cb7cca 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -231,7 +231,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct } system("ethtool --set-priv-flags eth1 enable-strict-header-split on"); - system("ethtool --set-rxfh-indir eth4 equal 8"); + system("ethtool --set-rxfh-indir eth1 
equal 8"); system("ethtool -N eth1 flow-type tcp4 src-ip 192.168.1.198 dst-ip 192.168.1.46 src-port 12345 dst-port 12345 queue 15"); printf("sleeping 1...\n"); sleep(1); From 032b6f7aed207603e281fa97d222895b4f90066c Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 20 Sep 2023 14:28:06 +0000 Subject: [PATCH 13/72] tcpd: allocate gpu buffer earlier --- socket.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/socket.c b/socket.c index b252a52..7347d0f 100644 --- a/socket.c +++ b/socket.c @@ -71,6 +71,13 @@ static void socket_init_not_established(struct thread *t, int s) PLOG_ERROR(t->cb, "setsockopt(SO_LINGER)"); } #ifdef WITH_TCPDIRECT + if (!t->f_mbuf && opts->tcpd_gpu_pci_addr) { + if (tcpdirect_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { + LOG_ERROR(t->cb, "%s: failed to setup tcpdirect CUDA socket", + __func__); + exit(1); + } + } if (opts->tcpd_nic_pci_addr) tcpdirect_setup_socket(s); #endif From 4fb36d3a3399cefa4adc3a4828a31ede47a60951 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 20 Sep 2023 19:20:20 +0000 Subject: [PATCH 14/72] tcpd: fill client cuda buf with a char earlier --- tcpdirect.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index 5cb7cca..54c68a2 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -199,6 +199,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct // } cudaMalloc(&gpu_tx_mem_, alloc_size); + if (is_client) cudaMemset(gpu_tx_mem_, 'a', alloc_size); unsigned int flag = 1; cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, @@ -442,8 +443,6 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { gpu_mem_fd_ = tmbuf->gpu_mem_fd_; void *gpu_tx_mem_ = tmbuf->gpu_tx_mem_; - cudaMemset(gpu_tx_mem_, 'a', n); - // memset(cmsg, 0, sizeof(struct cmsghdr)); iov.iov_base = buf_dummy; From 5398c56cf99a0363cd6a8873a03ca9177b677992 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 20 Sep 2023 21:26:40 +0000 Subject: [PATCH 
15/72] tcpd: install flow-steer after cudaMalloc --- check_all_options.c | 7 +++++++ define_all_flags.c | 2 ++ tcpdirect.cu | 30 +++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/check_all_options.c b/check_all_options.c index e4630bf..73f1cf1 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -108,6 +108,13 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) "Must provide non-zero --tcpdirect-phys-len flag if GPU PCI address was provided."); // TODO check page-alignment // CHECK((CUdeviceptr)gpu_tx_mem_ % PAGE_SIZE == 0); + + if (!opts->client) { + CHECK(cb, opts->tcpdirect_src_ip, + "Must provide source IP address for TCPDirect host."); + CHECK(cb, opts->tcpdirect_dst_ip, + "Must provide destination IP address for TCPDirect host."); + } } } diff --git a/define_all_flags.c b/define_all_flags.c index b9ace92..d9ef633 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -150,6 +150,8 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, const char *, tcpdirect_dst_ip, 0, 0, "Set the dst ip address for tcpdirect"); DEFINE_FLAG(fp, const char *, tcpdirect_link_name, "eth1", 0, "Link name to bind DMA buffer_pages for Rx"); DEFINE_FLAG(fp, int, tcpdirect_gpu_idx, 1, 0, "GPU index provided to cudaSetDevice"); + DEFINE_FLAG(fp, int, queue_start, 8, 0, "Queue to start flow-steering at"); + DEFINE_FLAG(fp, int, queue_num, 4, 0, "Number of queues to flow-steer to"); #endif /* Return the updated fp */ diff --git a/tcpdirect.cu b/tcpdirect.cu index 54c68a2..63bcd79 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -174,6 +174,8 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct void *gpu_tx_mem_; int gpu_mem_fd_; int dma_buf_fd_; + int q_start = opts->queue_start; + int q_num = opts->queue_num; std::unique_ptr buf_; struct tcpdirect_cuda_mbuf *tmbuf; const char *gpu_pci_addr = opts->tcpd_gpu_pci_addr; // "0000:04:00.0" @@ -216,12 
+218,11 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct } if (!is_client) { - /* TODO hardcoded num_queues */ - int num_queues = 15; + int num_queues = q_start + (t->index % q_num); printf("Bind to queue %i\n", num_queues); struct dma_buf_pages_bind_rx_queue bind_cmd; - strcpy(bind_cmd.ifname, "eth1"); // opts->tcpdirect_link_name + strcpy(bind_cmd.ifname, opts->tcpdirect_link_name); bind_cmd.rxq_idx = num_queues; ret = ioctl(gpu_mem_fd_, DMA_BUF_PAGES_BIND_RX, &bind_cmd); @@ -231,12 +232,23 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct exit(78); } - system("ethtool --set-priv-flags eth1 enable-strict-header-split on"); - system("ethtool --set-rxfh-indir eth1 equal 8"); - system("ethtool -N eth1 flow-type tcp4 src-ip 192.168.1.198 dst-ip 192.168.1.46 src-port 12345 dst-port 12345 queue 15"); - printf("sleeping 1...\n"); - sleep(1); - printf("toggled header-split\n"); + // copied from socket.c#socket_connect_one() + int flow_idx = (t->flow_first + t->flow_count); + int source_port = flow_idx + opts->source_port; + char flow_steer_cmd[512]; + sprintf(flow_steer_cmd, + "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %s queue %i", + opts->tcpdirect_link_name, opts->tcpdirect_src_ip, opts->tcpdirect_dst_ip, source_port, opts->port, num_queues); + ret = system(flow_steer_cmd); + + // only running the below ethtool commands after last thread/flow is setup + if (flow_idx + flow_limit >= opts->num_flows) { + ret = ret | system("ethtool --set-priv-flags eth1 enable-strict-header-split on"); + ret = ret | system("ethtool --set-priv-flags eth1 enable-header-split on"); + ret = ret | system("ethtool --set-rxfh-indir eth1 equal 8"); + printf("ethtool cmds returned %i, sleeping 1...\n", ret); + sleep(1); + } } *f_mbuf = tmbuf; From 0b1feefd2dc5563f2fac3041859f3255b26d6f9a Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 20 Sep 2023 23:56:56 +0000 Subject: [PATCH 16/72] tcpd: reset 
device state before running neper --- multi_neper.py | 97 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/multi_neper.py b/multi_neper.py index a7440ea..9bd56d6 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -26,13 +26,32 @@ "eth4": "6" } +def run_pre_neper_cmds(dev: str): + cmds = [ + f"ethtool --set-priv-flags {dev} enable-strict-header-split on", + f"ethtool --set-priv-flags {dev} enable-strict-header-split off", + f"ethtool --set-priv-flags {dev} enable-header-split off", + f"ethtool --set-rxfh-indir {dev} equal 16", + f"ethtool -K {dev} ntuple off", + f"ethtool --set-priv-flags {dev} enable-strict-header-split off", + f"ethtool --set-priv-flags {dev} enable-header-split off", + f"ethtool -K {dev} ntuple off", + f"ethtool --set-priv-flags {dev} enable-max-rx-buffer-size on", + f"ethtool -K {dev} ntuple on" + ] + + for cmd in cmds: + subprocess.run(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + # adds flow-steering rules, e.x. # ethtool -N eth1 flow-type tcp4 ... 
-def install_flow_steer_rules(dev, threads: int, src_port, port, src_ip, dst_ip)->list: +def install_flow_steer_rules(dev, threads: int, src_port, port, src_ip, dst_ip, q_start, q_num)->list: subprocesses, rules = [], [] for i in range(threads): - flow_steering_cmd = f"ethtool -N {dev} flow-type tcp4 src-ip {src_ip} dst-ip {dst_ip} src-port {src_port + i} dst-port {port} queue 15" + queue = q_start + (i % q_num) + flow_steering_cmd = f"ethtool -N {dev} flow-type tcp4 src-ip {src_ip} dst-ip {dst_ip} src-port {src_port + i} dst-port {port} queue {queue}" + debug(flow_steering_cmd) sp = subprocess.run(flow_steering_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) subprocesses.append(sp) @@ -52,25 +71,25 @@ def del_flow_steer_rules(dev: str, rules: list): for rule in rules: del_cmd = f"ethtool -N {dev} delete {rule}" debug(f"[{dev}] deleting rule {rule}") - subprocess.run(del_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + subprocess.run(del_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) # returns a 2-tuple of a Neper command and a dict of env vars def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, threads: int, flows: int, cpu_list, buffer_size: int, phys_len: int, nic_pci: str, gpu_pci: str, - control_port, source_port, port, length, host_ip=None)->str: + control_port, source_port, port, length, src_ip, dst_ip)->str: # TODO tcp_stream_cuda2 -> tcp_stream eventually cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream_cuda2 -T {threads} -F {flows} --tcpdirect-phys-len {phys_len}" - f" --port {port} --source-port {source_port} --control-port {control_port} --tcpdirect-gpu-idx {link_to_gpu_index[dev]}" + f" --port {port} --source-port {source_port} --control-port {control_port}" f" --buffer-size {buffer_size} --tcpd-nic-pci-addr {nic_pci} --tcpd-gpu-pci-addr {gpu_pci} -l {length}") env = None if is_client: - cmd += f" -c -H {host_ip}" + cmd += f" -c -H {dst_ip}" else: - cmd 
= cmd + f" --tcpdirect-link-name {dev}" + cmd = cmd + f" --tcpdirect-link-name {dev} --tcpdirect-src-ip {src_ip} --tcpdirect-dst-ip {dst_ip}" env = {"CUDA_VISIBLE_DEVICES": link_to_gpu_index[dev]} return (cmd, env) @@ -97,7 +116,9 @@ def parse_subprocess_outputs(subprocesses): cur_hash = dict() sp.wait() + debug(sp.stderr.read()) for line in sp.stdout.read().split("\n"): + debug(line) stripped_line = line.strip() if "=" in stripped_line: parsed_line = stripped_line.split("=") @@ -127,6 +148,15 @@ def parse_subprocess_outputs(subprocesses): parser.add_argument("-H", "--hosts", required=True, help="comma-delimited list of host IP addresses") + parser.add_argument("--q-start", default="8", help="starting queue for flow-steering rules", type=int) + parser.add_argument("--q-num", default="4", help=("number of queues for flow-steering rules" + " (i.e. if q-start=8 and q-num=4, 2" + " flow-steering rules each will be" + " installed for queues [8-11])"), + type=int) + + parser.add_argument("--dry-run", default=False, action="store_true") + parser.add_argument("-l", "--length", default=10) parser.add_argument("--log", default="WARNING") @@ -136,21 +166,27 @@ def parse_subprocess_outputs(subprocesses): devices = args.devices.split(",") hosts = args.hosts.split(",") + src_ips = args.src_ips.split(",") dev_to_rule = dict() # setup flow_steering rules if not args.client: info("setting up flow-steering rules") - src_ips = args.src_ips.split(",") + # src_ips = args.src_ips.split(",") - for i in range(len(devices)): - control_port = args.control_port + i - starting_port = i * args.threads + args.source_port - dev = devices[i] - src_ip, dst_ip = src_ips[i], hosts[i] + for i, dev in enumerate(devices): + if not args.dry_run: + run_pre_neper_cmds(dev) - rules = install_flow_steer_rules(dev, args.threads, starting_port, args.port, src_ip, dst_ip) - dev_to_rule[dev] = rules + # TODO flow-steering rules installed in Neper now + # control_port = args.control_port + i + # starting_port = 
i * args.threads + args.source_port + # dev = devices[i] + # src_ip, dst_ip = src_ips[i], hosts[i] + + # # TODO port_start q_start, q_num + # rules = install_flow_steer_rules(dev, args.threads, starting_port, args.port, src_ip, dst_ip, args.q_start, args.q_num) + # dev_to_rule[dev] = rules cmds = [] debug(f"running on {devices}") @@ -161,25 +197,28 @@ def parse_subprocess_outputs(subprocesses): ctrl_port = int(args.control_port) + i src_port = int(args.source_port) + i*int(args.flows) is_client = args.client - host_ip = hosts[i] if is_client else None - cpu_range = get_cpu_range(2, 3, i) + src_ip, dst_ip = src_ips[i], hosts[i] + # TODO 8 CPUs is hard-coded. Probably should change to args.flows + cpu_range = get_cpu_range(2 + (52 if i >= 2 else 0), 8, i) cmd_env = build_neper_cmd(args.neper_dir, is_client, dev, args.threads, args.flows, cpu_range, args.buffer_size, args.phys_len, nic_pci, gpu_pci, - ctrl_port, src_port, args.port, args.length, host_ip) + ctrl_port, src_port, args.port, args.length, src_ip, dst_ip) cmds.append(cmd_env) debug(cmds) - sp_list = run_cmds(cmds) - debug("parsing subprocesses outputs") - for dev, i in zip(devices, parse_subprocess_outputs(sp_list)): - if not args.client: - print(f"[{dev}] Throughput (Mb/s): {i['throughput']}") - - # delete flow-steering rules - if not args.client: - info("deleting flow-steering rules") - for dev in dev_to_rule: - del_flow_steer_rules(dev, dev_to_rule[dev]) + if not args.dry_run: + sp_list = run_cmds(cmds) + debug("parsing subprocesses outputs") + for dev, i in zip(devices, parse_subprocess_outputs(sp_list)): + if not args.client: + print(f"[{dev}] Throughput (Mb/s): {i['throughput']}") + + # TODO remove, flow-steering rules are installed via Neper now + # delete flow-steering rules + # if not args.client: + # info("deleting flow-steering rules") + # for dev in dev_to_rule: + # del_flow_steer_rules(dev, dev_to_rule[dev]) From 1f61d4d9761f497a7e06eeb687ddc903f2530d72 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji 
Date: Thu, 21 Sep 2023 16:36:56 +0000 Subject: [PATCH 17/72] tcpd: minor changes --- define_all_flags.c | 1 - lib.h | 3 ++- tcpdirect.cu | 18 ++++++++++-------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/define_all_flags.c b/define_all_flags.c index d9ef633..f2b9430 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -149,7 +149,6 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, const char *, tcpdirect_src_ip, 0, 0, "Set the src ip address for tcpdirect"); DEFINE_FLAG(fp, const char *, tcpdirect_dst_ip, 0, 0, "Set the dst ip address for tcpdirect"); DEFINE_FLAG(fp, const char *, tcpdirect_link_name, "eth1", 0, "Link name to bind DMA buffer_pages for Rx"); - DEFINE_FLAG(fp, int, tcpdirect_gpu_idx, 1, 0, "GPU index provided to cudaSetDevice"); DEFINE_FLAG(fp, int, queue_start, 8, 0, "Queue to start flow-steering at"); DEFINE_FLAG(fp, int, queue_num, 4, 0, "Number of queues to flow-steer to"); #endif diff --git a/lib.h b/lib.h index 8c11f91..6985224 100644 --- a/lib.h +++ b/lib.h @@ -114,7 +114,8 @@ struct options { const char *tcpdirect_src_ip; const char *tcpdirect_dst_ip; const char *tcpdirect_link_name; - int tcpdirect_gpu_idx; + int queue_start; + int queue_num; #endif bool enable_read; bool enable_write; diff --git a/tcpdirect.cu b/tcpdirect.cu index 63bcd79..1ace088 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -184,7 +184,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct size_t alloc_size = opts->tcpdirect_phys_len; // std::max(message_size_, (unsigned long)GPUMEM_MINSZ) tmbuf = - (struct tcpdirect_cuda_mbuf *)calloc(1, sizeof(struct tcpdirect_udma_mbuf)); + (struct tcpdirect_cuda_mbuf *)calloc(1, sizeof(struct tcpdirect_cuda_mbuf)); if (!tmbuf) { exit(EXIT_FAILURE); } @@ -242,7 +242,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct ret = system(flow_steer_cmd); // only running the below ethtool commands after last 
thread/flow is setup - if (flow_idx + flow_limit >= opts->num_flows) { + if (flow_idx + t->flow_limit >= opts->num_flows) { ret = ret | system("ethtool --set-priv-flags eth1 enable-strict-header-split on"); ret = ret | system("ethtool --set-priv-flags eth1 enable-header-split on"); ret = ret | system("ethtool --set-rxfh-indir eth1 equal 8"); @@ -571,12 +571,14 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { devmemvec->frag_token, total_received); - char mybuf[devmemvec->frag_size]; - cudaMemcpy(mybuf, - (char *)tmbuf->gpu_tx_mem_ + devmemvec->frag_offset, - devmemvec->frag_size, - cudaMemcpyDeviceToHost); - printf("cudaFrag: %.25s\n", mybuf); + // if (devmemvec->frag_token % 10 == 0) { + // char mybuf[devmemvec->frag_size]; + // cudaMemcpy(mybuf, + // (char *)tmbuf->gpu_tx_mem_ + devmemvec->frag_offset, + // devmemvec->frag_size, + // cudaMemcpyDeviceToHost); + // printf("cudaFrag: %.25s\n", mybuf); + // } // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); From 51292bada2b5bd425f652afcfb576b1271048fd8 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 21 Sep 2023 18:25:50 +0000 Subject: [PATCH 18/72] tcpd: change default neper-dir to . 
--- multi_neper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multi_neper.py b/multi_neper.py index 9bd56d6..a425d18 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -131,7 +131,7 @@ def parse_subprocess_outputs(subprocesses): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--neper-dir", help="directory containing Neper binaries", default="/tmp/neper") + parser.add_argument("--neper-dir", help="directory containing Neper binaries", default=".") parser.add_argument("--threads", help="number of threads per Neper instance", default="4", type=int) parser.add_argument("--flows", help="number of flows per Neper instance", default="4", type=int) parser.add_argument("--source-port", default="12345", type=int) From 94f61d9845bbc6dabf589b04c661c881227ca059 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 21 Sep 2023 18:46:26 +0000 Subject: [PATCH 19/72] tcpd: don't hardcode eth1, change every dev's port --- multi_neper.py | 10 ++++++---- tcpdirect.cu | 10 +++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/multi_neper.py b/multi_neper.py index a425d18..6bec40f 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -185,7 +185,8 @@ def parse_subprocess_outputs(subprocesses): # src_ip, dst_ip = src_ips[i], hosts[i] # # TODO port_start q_start, q_num - # rules = install_flow_steer_rules(dev, args.threads, starting_port, args.port, src_ip, dst_ip, args.q_start, args.q_num) + # dst_port = args.port + i + # rules = install_flow_steer_rules(dev, args.threads, starting_port, dst_port, src_ip, dst_ip, args.q_start, args.q_num) # dev_to_rule[dev] = rules cmds = [] @@ -194,8 +195,9 @@ def parse_subprocess_outputs(subprocesses): nic_pci = link_to_nic_pci_addr[dev] gpu_pci = link_to_gpu_pci_addr[dev] - ctrl_port = int(args.control_port) + i - src_port = int(args.source_port) + i*int(args.flows) + ctrl_port = args.control_port + i + src_port = args.source_port + i * args.flows + dst_port = args.port + i 
is_client = args.client src_ip, dst_ip = src_ips[i], hosts[i] # TODO 8 CPUs is hard-coded. Probably should change to args.flows @@ -204,7 +206,7 @@ def parse_subprocess_outputs(subprocesses): cmd_env = build_neper_cmd(args.neper_dir, is_client, dev, args.threads, args.flows, cpu_range, args.buffer_size, args.phys_len, nic_pci, gpu_pci, - ctrl_port, src_port, args.port, args.length, src_ip, dst_ip) + ctrl_port, src_port, dst_port, args.length, src_ip, dst_ip) cmds.append(cmd_env) diff --git a/tcpdirect.cu b/tcpdirect.cu index 1ace088..affbb32 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -243,9 +243,13 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct // only running the below ethtool commands after last thread/flow is setup if (flow_idx + t->flow_limit >= opts->num_flows) { - ret = ret | system("ethtool --set-priv-flags eth1 enable-strict-header-split on"); - ret = ret | system("ethtool --set-priv-flags eth1 enable-header-split on"); - ret = ret | system("ethtool --set-rxfh-indir eth1 equal 8"); + char ethtool_cmd[512]; + sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpdirect_link_name); + ret = ret | system(ethtool_cmd); + sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-header-split on", opts->tcpdirect_link_name); + ret = ret | system(ethtool_cmd); + sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpdirect_link_name); + ret = ret | system(ethtool_cmd); printf("ethtool cmds returned %i, sleeping 1...\n", ret); sleep(1); } From 1d4662c6a3f402e358ed30af27756b29754dfd9b Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 21 Sep 2023 19:00:13 +0000 Subject: [PATCH 20/72] tcpd: add queue_start and queue_num flags --- multi_neper.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/multi_neper.py b/multi_neper.py index 6bec40f..8b41d0c 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -78,7 +78,8 @@ def build_neper_cmd(neper_dir: str, 
is_client: bool, dev: str, threads: int, flows: int, cpu_list, buffer_size: int, phys_len: int, nic_pci: str, gpu_pci: str, - control_port, source_port, port, length, src_ip, dst_ip)->str: + control_port, source_port, port, length, + src_ip, dst_ip, queue_start, queue_num)->str: # TODO tcp_stream_cuda2 -> tcp_stream eventually cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream_cuda2 -T {threads} -F {flows} --tcpdirect-phys-len {phys_len}" @@ -89,7 +90,8 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, if is_client: cmd += f" -c -H {dst_ip}" else: - cmd = cmd + f" --tcpdirect-link-name {dev} --tcpdirect-src-ip {src_ip} --tcpdirect-dst-ip {dst_ip}" + cmd = cmd + (f" --tcpdirect-link-name {dev} --tcpdirect-src-ip {src_ip} --tcpdirect-dst-ip {dst_ip}" + f" --queue-start {queue_start} --queue-num {queue_num}") env = {"CUDA_VISIBLE_DEVICES": link_to_gpu_index[dev]} return (cmd, env) @@ -206,7 +208,8 @@ def parse_subprocess_outputs(subprocesses): cmd_env = build_neper_cmd(args.neper_dir, is_client, dev, args.threads, args.flows, cpu_range, args.buffer_size, args.phys_len, nic_pci, gpu_pci, - ctrl_port, src_port, dst_port, args.length, src_ip, dst_ip) + ctrl_port, src_port, dst_port, args.length, src_ip, dst_ip, + args.q_start, args.q_num) cmds.append(cmd_env) From ac0203be203aba05298ee3b5f63a924c9d2534cc Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Fri, 22 Sep 2023 20:35:49 +0000 Subject: [PATCH 21/72] tcpd: use tcpdirect properly on Tx --- multi_neper.py | 3 +-- tcpdirect.cu | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/multi_neper.py b/multi_neper.py index 8b41d0c..13c8b5e 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -86,13 +86,12 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, f" --port {port} --source-port {source_port} --control-port {control_port}" f" --buffer-size {buffer_size} --tcpd-nic-pci-addr {nic_pci} --tcpd-gpu-pci-addr {gpu_pci} -l {length}") - env = None + 
env = {"CUDA_VISIBLE_DEVICES": link_to_gpu_index[dev]} if is_client: cmd += f" -c -H {dst_ip}" else: cmd = cmd + (f" --tcpdirect-link-name {dev} --tcpdirect-src-ip {src_ip} --tcpdirect-dst-ip {dst_ip}" f" --queue-start {queue_start} --queue-num {queue_num}") - env = {"CUDA_VISIBLE_DEVICES": link_to_gpu_index[dev]} return (cmd, env) diff --git a/tcpdirect.cu b/tcpdirect.cu index affbb32..24cff39 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -24,6 +24,10 @@ #include "flow.h" #include "thread.h" +#ifndef MSG_ZEROCOPY +#define MSG_ZEROCOPY 0x4000000 +#endif + #define MIN_RX_BUFFER_TOTAL_SIZE (1 << 28) #define GPUMEM_ALIGNMENT (1UL << 21) #define GPUMEM_MINSZ 0x400000 @@ -32,9 +36,6 @@ #define multiplier (1 << 16) -#define SO_DEVMEM_OFFSET 99 -#define SCM_DEVMEM_OFFSET SO_DEVMEM_OFFSET - #define TEST_PREFIX "ncdevmem" #define NUM_PAGES 16000 @@ -111,10 +112,10 @@ int memfd_create(const char *name, unsigned int flags) int tcpdirect_setup_socket(int socket) { const int one = 1; - if (setsockopt(socket, SOL_SOCKET, - SO_REUSEADDR | SO_REUSEPORT | SO_ZEROCOPY, - &one, - sizeof(one))) { + if (setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) + || setsockopt(socket, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) + || setsockopt(socket, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) + ) { perror("tcpdirect_setup_socket"); exit(EXIT_FAILURE); } @@ -447,9 +448,8 @@ int tcpdirect_udma_send(int socket, void *f_mbuf, size_t n, int flags) { int tcpdirect_send(int socket, void *buf, size_t n, int flags) { int gpu_mem_fd_; struct iovec iov; - struct msghdr msg = {0}; + struct msghdr msg; struct cmsghdr *cmsg; - char buf_dummy[n]; char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; struct tcpdirect_cuda_mbuf *tmbuf; @@ -459,9 +459,10 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { gpu_mem_fd_ = tmbuf->gpu_mem_fd_; void *gpu_tx_mem_ = tmbuf->gpu_tx_mem_; + memset(&msg, 0, sizeof(msg)); // memset(cmsg, 0, sizeof(struct cmsghdr)); - iov.iov_base = buf_dummy; + 
iov.iov_base = NULL; iov.iov_len = n; msg.msg_iov = &iov; @@ -475,6 +476,7 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { cmsg->cmsg_type = SCM_DEVMEM_OFFSET; cmsg->cmsg_len = CMSG_LEN(sizeof(int) * 2); *((int*)CMSG_DATA(cmsg)) = gpu_mem_fd_; + ((int *)CMSG_DATA(cmsg))[1] = 0; ssize_t bytes_sent = sendmsg(socket, &msg, MSG_ZEROCOPY | MSG_DONTWAIT); if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) { From a1f6bc858d04e8e2c714e619e39146b0f80f5402 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Fri, 22 Sep 2023 20:49:18 +0000 Subject: [PATCH 22/72] tcpd: don't print out each frag received taskset same number of CPUs as number of threads --- multi_neper.py | 5 +++-- tcpdirect.cu | 16 ++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/multi_neper.py b/multi_neper.py index 13c8b5e..32daf06 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -201,8 +201,9 @@ def parse_subprocess_outputs(subprocesses): dst_port = args.port + i is_client = args.client src_ip, dst_ip = src_ips[i], hosts[i] - # TODO 8 CPUs is hard-coded. Probably should change to args.flows - cpu_range = get_cpu_range(2 + (52 if i >= 2 else 0), 8, i) + + # TODO should CPU range be configurable by the user? 
+ cpu_range = get_cpu_range(2 + (52 if i >= 2 else 0), args.threads, i) cmd_env = build_neper_cmd(args.neper_dir, is_client, dev, args.threads, args.flows, cpu_range, args.buffer_size, diff --git a/tcpdirect.cu b/tcpdirect.cu index 24cff39..64ede8a 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -568,14 +568,14 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { // exit(1); // } total_received += devmemvec->frag_size; - printf("\n\nreceived frag_page=%u, in_page_offset=%u," - " frag_offset=%u, frag_size=%u, token=%u" - " total_received=%lu\n", - devmemvec->frag_offset >> PAGE_SHIFT, - devmemvec->frag_offset % PAGE_SIZE, - devmemvec->frag_offset, devmemvec->frag_size, - devmemvec->frag_token, - total_received); + // printf("\n\nreceived frag_page=%u, in_page_offset=%u," + // " frag_offset=%u, frag_size=%u, token=%u" + // " total_received=%lu\n", + // devmemvec->frag_offset >> PAGE_SHIFT, + // devmemvec->frag_offset % PAGE_SIZE, + // devmemvec->frag_offset, devmemvec->frag_size, + // devmemvec->frag_token, + // total_received); // if (devmemvec->frag_token % 10 == 0) { // char mybuf[devmemvec->frag_size]; From af783cfcb4cab860889f79715cdd2554a5d52388 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Mon, 2 Oct 2023 21:55:38 +0000 Subject: [PATCH 23/72] fill tx cuda buffer with [1-111] repeating --- tcpdirect.cu | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index 64ede8a..b6e9c65 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -110,6 +110,30 @@ int memfd_create(const char *name, unsigned int flags) return syscall(__NR_memfd_create, name, flags); } +/* Fills buf of size n with a repeating sequence of 1 to 111 inclusive + */ +void fill_buffer(void *buf, size_t n) { +#define BUFSIZE 3996 + unsigned char src_buf[BUFSIZE]; + int ptr = 0, i = 0; + + while (i < BUFSIZE) { + src_buf[i] = (i % LAST_PRIME) + 1; + i++; + } + + while (ptr*BUFSIZE + BUFSIZE < n) { + cudaMemcpy((char 
*)buf + ptr*BUFSIZE, &src_buf, BUFSIZE, cudaMemcpyHostToDevice); + ptr++; + } + + i = ptr*BUFSIZE; + while (i < n) { + cudaMemset((char *)buf + i, (i % LAST_PRIME) + 1, 1); + i++; + } +} + int tcpdirect_setup_socket(int socket) { const int one = 1; if (setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) @@ -177,12 +201,10 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct int dma_buf_fd_; int q_start = opts->queue_start; int q_num = opts->queue_num; - std::unique_ptr buf_; struct tcpdirect_cuda_mbuf *tmbuf; const char *gpu_pci_addr = opts->tcpd_gpu_pci_addr; // "0000:04:00.0" const char *nic_pci_addr = opts->tcpd_nic_pci_addr; // "0000:06:00.0" - size_t message_size_ = 4096000; // TODO param this - size_t alloc_size = opts->tcpdirect_phys_len; // std::max(message_size_, (unsigned long)GPUMEM_MINSZ) + size_t alloc_size = opts->tcpdirect_phys_len; tmbuf = (struct tcpdirect_cuda_mbuf *)calloc(1, sizeof(struct tcpdirect_cuda_mbuf)); @@ -202,12 +224,13 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct // } cudaMalloc(&gpu_tx_mem_, alloc_size); - if (is_client) cudaMemset(gpu_tx_mem_, 'a', alloc_size); + if (is_client) { + fill_buffer(gpu_tx_mem_, alloc_size); + } unsigned int flag = 1; cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)gpu_tx_mem_); - buf_.reset(new char[message_size_]); gpu_mem_fd_ = get_gpumem_dmabuf_pages_fd(gpu_pci_addr, nic_pci_addr, gpu_tx_mem_, alloc_size, From 8e11a8c308fa62b8c79252eea02fad9208997617 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 3 Oct 2023 00:08:54 +0000 Subject: [PATCH 24/72] adding rx-buffer-cpy and rx-data-validation --- check_all_options.c | 7 ++++- define_all_flags.c | 2 ++ flow.h | 3 ++ lib.h | 2 ++ multi_neper.py | 11 +++++-- stream.c | 3 +- tcpdirect.cu | 72 +++++++++++++++++++++++++++++++++++---------- tcpdirect.h | 2 +- 8 files changed, 81 insertions(+), 21 deletions(-) diff --git a/check_all_options.c 
b/check_all_options.c index 73f1cf1..c0e5523 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -109,7 +109,12 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) // TODO check page-alignment // CHECK((CUdeviceptr)gpu_tx_mem_ % PAGE_SIZE == 0); - if (!opts->client) { + if (opts->client) { + CHECK(cb, !opts->tcpd_validate, + "Validation only allowed on hosts."); + CHECK(cb, !opts->tcpd_rx_cpy, + "Copying CUDA buffer to userspace only allowed on hosts."); + } else { CHECK(cb, opts->tcpdirect_src_ip, "Must provide source IP address for TCPDirect host."); CHECK(cb, opts->tcpdirect_dst_ip, diff --git a/define_all_flags.c b/define_all_flags.c index f2b9430..5602e1a 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -142,6 +142,8 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, bool, enable_tcp_maerts, false, 'M', "Enables TCP_MAERTS test (server writes and client reads). It overrides enable_read, and enable_write"); DEFINE_FLAG(fp, bool, async_connect, false, 0, "use non blocking connect"); #ifdef WITH_TCPDIRECT + DEFINE_FLAG(fp, bool, tcpd_validate, false, 0, "Validates that received data is a repeating sequence of 1 to 111 inclusive"); + DEFINE_FLAG(fp, bool, tcpd_rx_cpy, false, 0, "After the CUDA buffer is filled to buffer_size, calls cudaMemcpy to a userspace buffer"); DEFINE_FLAG(fp, const char *, tcpd_nic_pci_addr, 0, 0, "NIC PCI addr, e.x. 0000:06:00.0"); DEFINE_FLAG(fp, const char *, tcpd_gpu_pci_addr, 0, 0, "GPU PCI addr, e.x. 0000:04:00.0"); DEFINE_FLAG(fp, unsigned long long, tcpdirect_phys_addr, 0, 0, "Set the remote memory physical address for tcpdirect, e.x. 
0000:06:00.0"); diff --git a/flow.h b/flow.h index d4488e2..9e19ce2 100644 --- a/flow.h +++ b/flow.h @@ -41,6 +41,9 @@ struct tcpdirect_cuda_mbuf { int gpu_mem_fd_; int dma_buf_fd_; void *gpu_tx_mem_; + void *cpy_buffer; + size_t bytes_received; + void *tokens; }; typedef void (*flow_handler)(struct flow *, uint32_t); diff --git a/lib.h b/lib.h index 6985224..579b0d9 100644 --- a/lib.h +++ b/lib.h @@ -107,6 +107,8 @@ struct options { /* tcp_stream */ #ifdef WITH_TCPDIRECT + bool tcpd_validate; + bool tcpd_rx_cpy; const char *tcpd_nic_pci_addr; const char *tcpd_gpu_pci_addr; unsigned long long tcpdirect_phys_addr; diff --git a/multi_neper.py b/multi_neper.py index 32daf06..2f4ef9f 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -79,7 +79,8 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, cpu_list, buffer_size: int, phys_len: int, nic_pci: str, gpu_pci: str, control_port, source_port, port, length, - src_ip, dst_ip, queue_start, queue_num)->str: + src_ip, dst_ip, queue_start, queue_num, + tcpd_validate, tcpd_rx_cpy)->str: # TODO tcp_stream_cuda2 -> tcp_stream eventually cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream_cuda2 -T {threads} -F {flows} --tcpdirect-phys-len {phys_len}" @@ -92,6 +93,10 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, else: cmd = cmd + (f" --tcpdirect-link-name {dev} --tcpdirect-src-ip {src_ip} --tcpdirect-dst-ip {dst_ip}" f" --queue-start {queue_start} --queue-num {queue_num}") + if tcpd_validate: + cmd += " --tcpd-validate" + if tcpd_rx_cpy: + cmd += " --tcpd-rx-cpy" return (cmd, env) @@ -155,6 +160,8 @@ def parse_subprocess_outputs(subprocesses): " flow-steering rules each will be" " installed for queues [8-11])"), type=int) + parser.add_argument("--tcpd-validate", action="store_true") + parser.add_argument("--tcpd-rx-cpy", action="store_true") parser.add_argument("--dry-run", default=False, action="store_true") @@ -209,7 +216,7 @@ def parse_subprocess_outputs(subprocesses): args.threads, 
args.flows, cpu_range, args.buffer_size, args.phys_len, nic_pci, gpu_pci, ctrl_port, src_port, dst_port, args.length, src_ip, dst_ip, - args.q_start, args.q_num) + args.q_start, args.q_num, args.tcpd_validate, args.tcpd_rx_cpy) cmds.append(cmd_env) diff --git a/stream.c b/stream.c index fc973ff..8867431 100644 --- a/stream.c +++ b/stream.c @@ -110,7 +110,8 @@ void stream_handler(struct flow *f, uint32_t events) if (t->opts->tcpd_nic_pci_addr) n = tcpdirect_recv(fd, mbuf, opts->buffer_size, - opts->recv_flags); + opts->recv_flags, + t); else #endif n = recv(fd, mbuf, opts->buffer_size, diff --git a/tcpdirect.cu b/tcpdirect.cu index b6e9c65..68fe01d 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include "tcpdirect.h" #include "logging.h" @@ -28,6 +29,8 @@ #define MSG_ZEROCOPY 0x4000000 #endif +#define LAST_PRIME 111 + #define MIN_RX_BUFFER_TOTAL_SIZE (1 << 28) #define GPUMEM_ALIGNMENT (1UL << 21) #define GPUMEM_MINSZ 0x400000 @@ -283,6 +286,8 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct tmbuf->gpu_mem_fd_ = gpu_mem_fd_; tmbuf->dma_buf_fd_ = dma_buf_fd_; tmbuf->gpu_tx_mem_ = gpu_tx_mem_; + tmbuf->cpy_buffer = malloc(opts->buffer_size); + tmbuf->tokens = new std::vector(); return 0; } @@ -515,17 +520,23 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { return bytes_sent; } -int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { +int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { struct iovec iov; struct msghdr msg_local; struct msghdr *msg; struct tcpdirect_cuda_mbuf *tmbuf; int buf, ret, client_fd; + int buffer_size = n; size_t total_received = 0; + unsigned char *cpy_buffer; + const struct options *opts = t->opts; + std::vector *tokens; if (!f_mbuf) return -1; tmbuf = (struct tcpdirect_cuda_mbuf *)f_mbuf; + cpy_buffer = (unsigned char *)tmbuf->cpy_buffer; + tokens = (std::vector *)tmbuf->tokens; client_fd = 
socket; @@ -537,7 +548,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { memset(msg, 0, sizeof(struct msghdr)); iov.iov_base = buf_dummy; - iov.iov_len = n; + iov.iov_len = n - tmbuf->bytes_received; msg->msg_iov = &iov; msg->msg_iovlen = 1; @@ -600,37 +611,66 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags) { // devmemvec->frag_token, // total_received); - // if (devmemvec->frag_token % 10 == 0) { - // char mybuf[devmemvec->frag_size]; - // cudaMemcpy(mybuf, - // (char *)tmbuf->gpu_tx_mem_ + devmemvec->frag_offset, - // devmemvec->frag_size, - // cudaMemcpyDeviceToHost); - // printf("cudaFrag: %.25s\n", mybuf); - // } - // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + tokens->push_back(token); + // munmap(buf_mem, n); + } + + tmbuf->bytes_received += received; + + /* Once we've received fragments totaling buffer_size, we can copy from the + * CUDA buffer to a user-space buffer, and free the fragments in the CUDA + * buffer. 
+ */ + if (tmbuf->bytes_received == buffer_size) { + /* There is a performance impact when we cudaMemcpy from the CUDA buffer to + * the userspace buffer, so it's gated by a flag + */ + if (opts->tcpd_rx_cpy || opts->tcpd_validate) { + cudaMemcpy(cpy_buffer, tmbuf->gpu_tx_mem_, buffer_size, cudaMemcpyDeviceToHost); + + /* Ensure the sequence is what we expect: + * a repeating sequence of 1 to LAST_PRIME inclusive + */ + if (opts->tcpd_validate) { + cudaDeviceSynchronize(); + int i = 0; + int expected_val; + while (i < buffer_size) { + expected_val = (i % LAST_PRIME) + 1; + if (cpy_buffer[i] != expected_val) { + printf("Thread %i - incorrect byte %i, expected %i, got %i\n", + t->index, + i, + expected_val, + cpy_buffer[i]); + } + i++; + } + } + } + ret = setsockopt(client_fd, SOL_SOCKET, - SO_DEVMEM_DONTNEED, &token, - sizeof(token)); + SO_DEVMEM_DONTNEED, tokens->data(), + tokens->size() * sizeof(devmemtoken)); if (ret) { perror("DONTNEED failed"); exit(1); } - - // munmap(buf_mem, n); + tokens->clear(); + tmbuf->bytes_received = 0; } return total_received; } int cuda_flow_cleanup(void *f_mbuf) { struct tcpdirect_cuda_mbuf *t_mbuf = (struct tcpdirect_cuda_mbuf *)f_mbuf; - close(t_mbuf->gpu_mem_fd_); close(t_mbuf->dma_buf_fd_); cudaFree(t_mbuf->gpu_tx_mem_); + free(t_mbuf->cpy_buffer); return 0; } #endif /* #ifdef WITH_TCPDIRECT */ \ No newline at end of file diff --git a/tcpdirect.h b/tcpdirect.h index 943c8d0..fd0157d 100644 --- a/tcpdirect.h +++ b/tcpdirect.h @@ -17,7 +17,7 @@ int cuda_flow_cleanup(void *f_mbuf); int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf); int tcpdirect_send(int socket, void *buf, size_t n, int flags); int tcpdirect_udma_send(int fd, void *buf, size_t n, int flags); -int tcpdirect_recv(int fd, void *f_mbuf, size_t n, int flags); +int tcpdirect_recv(int fd, void *f_mbuf, size_t n, int flags, struct thread *t); #if __cplusplus } From a8d203e75ced0ceb2ebfe57390c8907f5eb09088 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: 
Tue, 3 Oct 2023 22:25:02 +0000 Subject: [PATCH 25/72] copy each fragment to cp_buffer --- flow.h | 1 + tcpdirect.cu | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/flow.h b/flow.h index 9e19ce2..f6598d4 100644 --- a/flow.h +++ b/flow.h @@ -44,6 +44,7 @@ struct tcpdirect_cuda_mbuf { void *cpy_buffer; size_t bytes_received; void *tokens; + void *vectors; }; typedef void (*flow_handler)(struct flow *, uint32_t); diff --git a/tcpdirect.cu b/tcpdirect.cu index 68fe01d..04d2d5c 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -287,6 +287,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct tmbuf->dma_buf_fd_ = dma_buf_fd_; tmbuf->gpu_tx_mem_ = gpu_tx_mem_; tmbuf->cpy_buffer = malloc(opts->buffer_size); + tmbuf->vectors = new std::vector(); tmbuf->tokens = new std::vector(); return 0; } @@ -530,12 +531,14 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread size_t total_received = 0; unsigned char *cpy_buffer; const struct options *opts = t->opts; + std::vector *vectors; std::vector *tokens; if (!f_mbuf) return -1; tmbuf = (struct tcpdirect_cuda_mbuf *)f_mbuf; cpy_buffer = (unsigned char *)tmbuf->cpy_buffer; + vectors = (std::vector *)tmbuf->vectors; tokens = (std::vector *)tmbuf->tokens; client_fd = socket; @@ -613,7 +616,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); - + vectors->emplace_back(*devmemvec); tokens->push_back(token); // munmap(buf_mem, n); } @@ -629,7 +632,19 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread * the userspace buffer, so it's gated by a flag */ if (opts->tcpd_rx_cpy || opts->tcpd_validate) { - cudaMemcpy(cpy_buffer, tmbuf->gpu_tx_mem_, buffer_size, cudaMemcpyDeviceToHost); + for (int idx = 0; idx < vectors->size(); idx++) { + struct devmemvec vec = (*vectors)[idx]; + struct 
devmemtoken token = (*tokens)[idx]; + + /* copy each fragment to the cpy_buffer in order, i.e. + * 1st fragment will occuply bytes [0-4095], 2nd fragment will + * occupy bytes [4096-8191], etc. + */ + cudaMemcpy(cpy_buffer + (vec.frag_token - 1) * PAGE_SIZE, + (char *)tmbuf->gpu_tx_mem_ + vec.frag_offset, + vec.frag_size, + cudaMemcpyDeviceToHost); + } /* Ensure the sequence is what we expect: * a repeating sequence of 1 to LAST_PRIME inclusive @@ -646,6 +661,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread i, expected_val, cpy_buffer[i]); + break; } i++; } @@ -659,6 +675,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread perror("DONTNEED failed"); exit(1); } + vectors->clear(); tokens->clear(); tmbuf->bytes_received = 0; } @@ -671,6 +688,8 @@ int cuda_flow_cleanup(void *f_mbuf) { close(t_mbuf->dma_buf_fd_); cudaFree(t_mbuf->gpu_tx_mem_); free(t_mbuf->cpy_buffer); + free(t_mbuf->tokens); + free(t_mbuf->vectors); return 0; } #endif /* #ifdef WITH_TCPDIRECT */ \ No newline at end of file From fb7805184a7ef67d8a24ed0c32996bcfe3883118 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 4 Oct 2023 14:32:49 +0000 Subject: [PATCH 26/72] keep track of bytes_sent for tcpdirect Tx --- flow.h | 1 + tcpdirect.cu | 15 +++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/flow.h b/flow.h index f6598d4..6787def 100644 --- a/flow.h +++ b/flow.h @@ -43,6 +43,7 @@ struct tcpdirect_cuda_mbuf { void *gpu_tx_mem_; void *cpy_buffer; size_t bytes_received; + size_t bytes_sent; void *tokens; void *vectors; }; diff --git a/tcpdirect.cu b/tcpdirect.cu index 04d2d5c..6b57f71 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -115,7 +115,7 @@ int memfd_create(const char *name, unsigned int flags) /* Fills buf of size n with a repeating sequence of 1 to 111 inclusive */ -void fill_buffer(void *buf, size_t n) { +void fill_tx_buffer(void *buf, size_t n) { #define BUFSIZE 3996 unsigned char 
src_buf[BUFSIZE]; int ptr = 0, i = 0; @@ -228,7 +228,8 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct cudaMalloc(&gpu_tx_mem_, alloc_size); if (is_client) { - fill_buffer(gpu_tx_mem_, alloc_size); + fill_tx_buffer(gpu_tx_mem_, alloc_size); + cudaDeviceSynchronize(); } unsigned int flag = 1; cuPointerSetAttribute(&flag, @@ -289,6 +290,8 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct tmbuf->cpy_buffer = malloc(opts->buffer_size); tmbuf->vectors = new std::vector(); tmbuf->tokens = new std::vector(); + tmbuf->bytes_received = 0; + tmbuf->bytes_sent = 0; return 0; } @@ -492,7 +495,7 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { // memset(cmsg, 0, sizeof(struct cmsghdr)); iov.iov_base = NULL; - iov.iov_len = n; + iov.iov_len = n - tmbuf->bytes_sent; msg.msg_iov = &iov; msg.msg_iovlen = 1; @@ -505,7 +508,7 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { cmsg->cmsg_type = SCM_DEVMEM_OFFSET; cmsg->cmsg_len = CMSG_LEN(sizeof(int) * 2); *((int*)CMSG_DATA(cmsg)) = gpu_mem_fd_; - ((int *)CMSG_DATA(cmsg))[1] = 0; + ((int *)CMSG_DATA(cmsg))[1] = (int)tmbuf->bytes_sent; ssize_t bytes_sent = sendmsg(socket, &msg, MSG_ZEROCOPY | MSG_DONTWAIT); if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) { @@ -518,6 +521,10 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { exit(EXIT_FAILURE); } + tmbuf->bytes_sent += bytes_sent; + if (tmbuf->bytes_sent == n) + tmbuf->bytes_sent = 0; + return bytes_sent; } From 8665354b781e760400e8471643787dd12402ab3e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 5 Oct 2023 19:10:41 +0000 Subject: [PATCH 27/72] co-opt num_ports option for flow-steer compat Also force 1:1 thread:flow ratio Because flow-steering required for TCPDirect, force incrementing threads to listen on incrementing ports, i.e. thread_0 listens on port x thread_1 listens on port x+1 thread_2 listens on port x_2 etc. 
--- check_all_options.c | 6 ++++-- multi_neper.py | 46 +++++++++++++++++++++++++++++++-------------- socket.c | 29 ++++++++++++++++++++++++++++ tcpdirect.cu | 14 ++++++++------ 4 files changed, 73 insertions(+), 22 deletions(-) diff --git a/check_all_options.c b/check_all_options.c index c0e5523..922f0e5 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -110,8 +110,6 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) // CHECK((CUdeviceptr)gpu_tx_mem_ % PAGE_SIZE == 0); if (opts->client) { - CHECK(cb, !opts->tcpd_validate, - "Validation only allowed on hosts."); CHECK(cb, !opts->tcpd_rx_cpy, "Copying CUDA buffer to userspace only allowed on hosts."); } else { @@ -120,6 +118,10 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) CHECK(cb, opts->tcpdirect_dst_ip, "Must provide destination IP address for TCPDirect host."); } + CHECK(cb, opts->num_flows == opts->num_threads, + "Thread/Flow count must be equal when running in TCPDirect mode."); + CHECK(cb, opts->num_flows == opts->num_ports, + "Number of ports should equal number of flows when running in TCPDirect mode."); } } diff --git a/multi_neper.py b/multi_neper.py index 2f4ef9f..f35f1a5 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -80,24 +80,33 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, nic_pci: str, gpu_pci: str, control_port, source_port, port, length, src_ip, dst_ip, queue_start, queue_num, - tcpd_validate, tcpd_rx_cpy)->str: + tcpd_validate, tcpd_rx_cpy)->tuple: # TODO tcp_stream_cuda2 -> tcp_stream eventually - cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream_cuda2 -T {threads} -F {flows} --tcpdirect-phys-len {phys_len}" - f" --port {port} --source-port {source_port} --control-port {control_port}" - f" --buffer-size {buffer_size} --tcpd-nic-pci-addr {nic_pci} --tcpd-gpu-pci-addr {gpu_pci} -l {length}") + cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream_cuda2" + f" -T {threads} -F {flows} 
--tcpdirect-phys-len {phys_len}" + f" --port {port} --source-port {source_port}" + f" --control-port {control_port}" + f" --buffer-size {buffer_size} --tcpd-nic-pci-addr {nic_pci}" + f" --tcpd-gpu-pci-addr {gpu_pci} -l {length}" + f" --num-ports {flows}") + + if tcpd_validate: + cmd += " --tcpd-validate" - env = {"CUDA_VISIBLE_DEVICES": link_to_gpu_index[dev]} if is_client: cmd += f" -c -H {dst_ip}" else: - cmd = cmd + (f" --tcpdirect-link-name {dev} --tcpdirect-src-ip {src_ip} --tcpdirect-dst-ip {dst_ip}" - f" --queue-start {queue_start} --queue-num {queue_num}") - if tcpd_validate: - cmd += " --tcpd-validate" + cmd = cmd + (f" --tcpdirect-link-name {dev}" + f" --tcpdirect-src-ip {src_ip}" + f" --tcpdirect-dst-ip {dst_ip}" + f" --queue-start {queue_start}" + f" --queue-num {queue_num}") if tcpd_rx_cpy: cmd += " --tcpd-rx-cpy" + env = {"CUDA_VISIBLE_DEVICES": link_to_gpu_index[dev]} + return (cmd, env) # returns a CPU range for taskset @@ -199,14 +208,18 @@ def parse_subprocess_outputs(subprocesses): cmds = [] debug(f"running on {devices}") + is_client = args.client + for i, dev in enumerate(devices): nic_pci = link_to_nic_pci_addr[dev] gpu_pci = link_to_gpu_pci_addr[dev] + # increment control port by 1, and src/dst ports by flow_count + # for each additional link we're running Neper on ctrl_port = args.control_port + i - src_port = args.source_port + i * args.flows - dst_port = args.port + i - is_client = args.client + src_port = i * args.flows + args.source_port + dst_port = i * args.flows + args.port + src_ip, dst_ip = src_ips[i], hosts[i] # TODO should CPU range be configurable by the user? 
@@ -220,13 +233,18 @@ def parse_subprocess_outputs(subprocesses): cmds.append(cmd_env) - debug(cmds) + for cmd in cmds: + debug(cmd) + if not args.dry_run: sp_list = run_cmds(cmds) debug("parsing subprocesses outputs") for dev, i in zip(devices, parse_subprocess_outputs(sp_list)): if not args.client: - print(f"[{dev}] Throughput (Mb/s): {i['throughput']}") + try: + print(f"[{dev}] Throughput (Mb/s): {i['throughput']}") + except KeyError: + print(f"[{dev}] Throughput (Mb/s): NA") # TODO remove, flow-steering rules are installed via Neper now # delete flow-steering rules diff --git a/socket.c b/socket.c index 7347d0f..e3cdfb2 100644 --- a/socket.c +++ b/socket.c @@ -254,6 +254,24 @@ void socket_listen(struct thread *t) struct addrinfo *ai = getaddrinfo_or_die(opts->host, opts->port, &hints, cb); int port = atoi(opts->port); +#ifdef WITH_TCPDIRECT + /* TCPDirect: + * Since each thread has a CUDA buffer, and + * flow-steering rules are required, threads, TCP connections, and + * CUDA buffers need to be 1:1:1. + * + * We enforce that by co-opting the num_ports option. + * + * thread/flow 0 will listen on port x, and use thread_0's buf + * thread_1/flow_1 listen on x+1 -> thread_1->f_mbuf + * etc... + */ + if (opts->tcpd_gpu_pci_addr) { + port += t->index; + reset_port(ai, port, cb); + } +#endif + int i, n, s; struct flow_create_args args = { @@ -269,6 +287,17 @@ void socket_listen(struct thread *t) switch (ai->ai_socktype) { case SOCK_STREAM: n = opts->num_ports ? opts->num_ports : 1; +#ifdef WITH_TCPDIRECT + /* TCPDirect: + * See TCPDirect comment above^ + * + * We are co-opting the num_ports option, so each thread/flow + * listens on a port that's 1 larger than the previous thread's + * port. 
+ */ + if (opts->tcpd_gpu_pci_addr) + n = 1; +#endif for (i = 0; i < n; i++) { s = socket_bind_listener(t, ai); socket_init_not_established(t, s); diff --git a/tcpdirect.cu b/tcpdirect.cu index 6b57f71..566d6fd 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -227,9 +227,9 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct // } cudaMalloc(&gpu_tx_mem_, alloc_size); - if (is_client) { - fill_tx_buffer(gpu_tx_mem_, alloc_size); - cudaDeviceSynchronize(); + if (is_client && opts->tcpd_validate) { + fill_tx_buffer(gpu_tx_mem_, alloc_size); + cudaDeviceSynchronize(); } unsigned int flag = 1; cuPointerSetAttribute(&flag, @@ -262,11 +262,13 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct // copied from socket.c#socket_connect_one() int flow_idx = (t->flow_first + t->flow_count); - int source_port = flow_idx + opts->source_port; + int src_port = flow_idx + opts->source_port; + int dst_port = flow_idx + atoi(opts->port); + char flow_steer_cmd[512]; sprintf(flow_steer_cmd, "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %s queue %i", - opts->tcpdirect_link_name, opts->tcpdirect_src_ip, opts->tcpdirect_dst_ip, source_port, opts->port, num_queues); + opts->tcpdirect_link_name, opts->tcpdirect_src_ip, opts->tcpdirect_dst_ip, src_port, dst_port, num_queues); ret = system(flow_steer_cmd); // only running the below ethtool commands after last thread/flow is setup @@ -699,4 +701,4 @@ int cuda_flow_cleanup(void *f_mbuf) { free(t_mbuf->vectors); return 0; } -#endif /* #ifdef WITH_TCPDIRECT */ \ No newline at end of file +#endif /* #ifdef WITH_TCPDIRECT */ From 1ed0f555ad517fb71538d29ea470e973f2f74f0f Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 5 Oct 2023 19:24:46 +0000 Subject: [PATCH 28/72] fixing segfault --- tcpdirect.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcpdirect.cu b/tcpdirect.cu index 566d6fd..bf938d8 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ 
-267,7 +267,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct char flow_steer_cmd[512]; sprintf(flow_steer_cmd, - "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %s queue %i", + "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue %i", opts->tcpdirect_link_name, opts->tcpdirect_src_ip, opts->tcpdirect_dst_ip, src_port, dst_port, num_queues); ret = system(flow_steer_cmd); From c36aa25101c0136eabdb32b723897a51ec8023bc Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 10 Oct 2023 15:34:25 +0000 Subject: [PATCH 29/72] clean up Makefile and build target small nits regarding build warnings build via `make tcp_stream WITH_TCPDIRECT=1` instead of `make tcp_stream_cuda2` --- Makefile | 36 +++++++++++++++--------------------- check_all_options.c | 2 ++ multi_neper.py | 3 +-- tcpdirect.cu | 5 ++--- 4 files changed, 20 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index d600eec..754b62c 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,11 @@ all: binaries -CFLAGS = -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DNDEBUG=1 -DWITH_TCPDIRECT +CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DNDEBUG=1 + +ifdef WITH_TCPDIRECT + CFLAGS += -DWITH_TCPDIRECT +endif lib := \ check_all_options.o \ @@ -48,12 +52,10 @@ lib := \ tcp_rr-objs := tcp_rr_main.o tcp_rr.o rr.o $(lib) -tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o tcpdirect.o $(lib) - -tcp_stream-cuda-objs := tcp_stream_main_cuda.o tcp_stream.o stream.o tcpdirect.o $(lib) - -tcp_stream-cuda2-objs := tcp_stream_main.o tcp_stream.o stream.o tcpdirect.o $(lib) -# tcp_stream-cuda3-objs := tcp_stream_main.cu.o tcp_stream.o stream.o tcpdirect.o $(lib) +tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o $(lib) +ifdef WITH_TCPDIRECT + tcp_stream-objs += tcpdirect.o +endif tcp_crr-objs := tcp_crr_main.o tcp_crr.o rr.o $(lib) @@ -72,23 +74,15 @@ ext-libs := -lm -lrt -lpthread tcpdirect.o: tcpdirect.cu 
nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDIRECT -c -o $@ $^ -tcp_stream_main_cuda.o: tcp_stream_main.cu - nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -c -o $@ $^ - -tcp_stream_main.cu.o: tcp_stream_main.c - nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDIRECT -c -o $@ $^ - tcp_rr: $(tcp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) tcp_stream: $(tcp_stream-objs) - $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) - -tcp_stream_cuda2: $(tcp_stream-cuda2-objs) - g++ $(LDFLAGS) -o $@ $^ $(ext-libs) -lc -L/usr/local/cuda/lib64 -lcudart -lcuda - -tcp_stream_cuda: $(tcp_stream-cuda-objs) +ifdef WITH_TCPDIRECT g++ $(LDFLAGS) -o $@ $^ $(ext-libs) -lc -L/usr/local/cuda/lib64 -lcudart -lcuda +else + $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) +endif tcp_crr: $(tcp_crr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) @@ -108,7 +102,7 @@ psp_crr: $(psp_crr-objs) psp_rr: $(psp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) -binaries: tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr tcp_stream_cuda tcp_stream_cuda2 +binaries: tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr clean: - rm -f *.o tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr tcp_stream_cuda tcp_stream_cuda2 + rm -f *.o tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr diff --git a/check_all_options.c b/check_all_options.c index 922f0e5..f2fe602 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -101,6 +101,7 @@ void check_options_tcp_rr(struct options *opts, struct callbacks *cb) void check_options_tcp_stream(struct options *opts, struct callbacks *cb) { +#ifdef WITH_TCPDIRECT if (opts->tcpd_gpu_pci_addr) { CHECK(cb, opts->tcpd_nic_pci_addr, "Must provide NIC PCI address if GPU PCI address was provided."); @@ -123,6 +124,7 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) CHECK(cb, opts->num_flows == opts->num_ports, "Number of ports should equal number of flows when running 
in TCPDirect mode."); } +#endif /* WITH_TCPDIRECT */ } void check_options_udp_rr(struct options *opts, struct callbacks *cb) diff --git a/multi_neper.py b/multi_neper.py index f35f1a5..bd35734 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -82,8 +82,7 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, src_ip, dst_ip, queue_start, queue_num, tcpd_validate, tcpd_rx_cpy)->tuple: - # TODO tcp_stream_cuda2 -> tcp_stream eventually - cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream_cuda2" + cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream" f" -T {threads} -F {flows} --tcpdirect-phys-len {phys_len}" f" --port {port} --source-port {source_port}" f" --control-port {control_port}" diff --git a/tcpdirect.cu b/tcpdirect.cu index bf938d8..5b32cb1 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -491,7 +491,6 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { tmbuf = (struct tcpdirect_cuda_mbuf *)buf; gpu_mem_fd_ = tmbuf->gpu_mem_fd_; - void *gpu_tx_mem_ = tmbuf->gpu_tx_mem_; memset(&msg, 0, sizeof(msg)); // memset(cmsg, 0, sizeof(struct cmsghdr)); @@ -535,7 +534,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread struct msghdr msg_local; struct msghdr *msg; struct tcpdirect_cuda_mbuf *tmbuf; - int buf, ret, client_fd; + int ret, client_fd; // buf int buffer_size = n; size_t total_received = 0; unsigned char *cpy_buffer; @@ -567,7 +566,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread msg->msg_control = offsetbuf; msg->msg_controllen = sizeof(offsetbuf); - char *buf_mem = NULL; + // char *buf_mem = NULL; if (msg->msg_flags & MSG_CTRUNC) { printf("fatal, cmsg truncated, current msg_controllen\n"); From 7087697c320787b0351aaaf659e21a0d95f8ca87 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 12 Oct 2023 18:24:41 +0000 Subject: [PATCH 30/72] rx-copy using block/thread CUDA programming --- check_all_options.c | 2 +- flow.h | 6 +- tcpdirect.cu | 132 
+++++++++++++++++++++++++++++++++++--------- 3 files changed, 113 insertions(+), 27 deletions(-) diff --git a/check_all_options.c b/check_all_options.c index f2fe602..c730d6b 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -108,7 +108,7 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) CHECK(cb, opts->tcpdirect_phys_len > 0, "Must provide non-zero --tcpdirect-phys-len flag if GPU PCI address was provided."); // TODO check page-alignment - // CHECK((CUdeviceptr)gpu_tx_mem_ % PAGE_SIZE == 0); + // CHECK((CUdeviceptr)gpu_gen_mem_ % PAGE_SIZE == 0); if (opts->client) { CHECK(cb, !opts->tcpd_rx_cpy, diff --git a/flow.h b/flow.h index 6787def..fb7b755 100644 --- a/flow.h +++ b/flow.h @@ -40,7 +40,11 @@ struct tcpdirect_udma_mbuf { struct tcpdirect_cuda_mbuf { int gpu_mem_fd_; int dma_buf_fd_; - void *gpu_tx_mem_; + void *gpu_gen_mem_; + void *gpu_rx_mem_; + void *gpu_scatter_list_; + void *scattered_data_; + void *rx_blks_; void *cpy_buffer; size_t bytes_received; size_t bytes_sent; diff --git a/tcpdirect.cu b/tcpdirect.cu index 5b32cb1..7763e60 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -100,6 +100,12 @@ struct devmemtoken { __u32 token_count; }; +struct TcpDirectRxBlock { + uint64_t gpu_offset; + size_t size; + uint64_t paddr; +}; + struct udmabuf_create { uint32_t memfd; uint32_t flags; @@ -137,6 +143,47 @@ void fill_tx_buffer(void *buf, size_t n) { } } +__global__ void scatter_copy_kernel(long3* scatter_list, uint8_t* dst, + uint8_t* src) { + int block_idx = blockIdx.x; + long3 blk = scatter_list[block_idx]; + long dst_off = blk.x; + long src_off = blk.y; + long sz = blk.z; + + int thread_sz = sz / blockDim.x; + int rem = sz % blockDim.x; + bool extra = (threadIdx.x < rem); + int thread_offset = sz / blockDim.x * threadIdx.x; + thread_offset += (extra) ? 
threadIdx.x : rem; + + for (int i = 0; i < thread_sz; i++) { + dst[dst_off + thread_offset + i] = src[src_off + thread_offset + i]; + } + if (extra) { + dst[dst_off + thread_offset + thread_sz] = + src[src_off + thread_offset + thread_sz]; + } +} + +void GatherRxData(struct tcpdirect_cuda_mbuf *tmbuf) { + int ret; + void *gpu_scatter_list_ = tmbuf->gpu_scatter_list_; + std::vector *scattered_data_ = (std::vector *)tmbuf->scattered_data_; + void *gpu_rx_mem_ = tmbuf->gpu_rx_mem_; + void *rx_buff_ = tmbuf->gpu_gen_mem_; + + ret = cudaMemcpyAsync(gpu_scatter_list_, + scattered_data_->data(), + scattered_data_->size() * sizeof(long3), + cudaMemcpyHostToDevice); + if (ret) + return; + + scatter_copy_kernel<<size(), 256, 0>>>( + (long3*)gpu_scatter_list_, (uint8_t*)gpu_rx_mem_, (uint8_t*)rx_buff_); +} + int tcpdirect_setup_socket(int socket) { const int one = 1; if (setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) @@ -199,7 +246,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct { bool is_client = opts->client; int ret; - void *gpu_tx_mem_; + void *gpu_gen_mem_; int gpu_mem_fd_; int dma_buf_fd_; int q_start = opts->queue_start; @@ -226,18 +273,18 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct // exit(70); // } - cudaMalloc(&gpu_tx_mem_, alloc_size); + cudaMalloc(&gpu_gen_mem_, alloc_size); if (is_client && opts->tcpd_validate) { - fill_tx_buffer(gpu_tx_mem_, alloc_size); + fill_tx_buffer(gpu_gen_mem_, alloc_size); cudaDeviceSynchronize(); } unsigned int flag = 1; cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, - (CUdeviceptr)gpu_tx_mem_); + (CUdeviceptr)gpu_gen_mem_); gpu_mem_fd_ = get_gpumem_dmabuf_pages_fd(gpu_pci_addr, nic_pci_addr, - gpu_tx_mem_, alloc_size, + gpu_gen_mem_, alloc_size, &dma_buf_fd_, is_client); if (gpu_mem_fd_ < 0) { @@ -288,12 +335,17 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct *f_mbuf = tmbuf; 
tmbuf->gpu_mem_fd_ = gpu_mem_fd_; tmbuf->dma_buf_fd_ = dma_buf_fd_; - tmbuf->gpu_tx_mem_ = gpu_tx_mem_; + tmbuf->gpu_gen_mem_ = gpu_gen_mem_; tmbuf->cpy_buffer = malloc(opts->buffer_size); tmbuf->vectors = new std::vector(); tmbuf->tokens = new std::vector(); tmbuf->bytes_received = 0; tmbuf->bytes_sent = 0; + + cudaMalloc(&tmbuf->gpu_rx_mem_, opts->buffer_size); + cudaMalloc(&tmbuf->gpu_scatter_list_, opts->buffer_size); + tmbuf->rx_blks_ = new std::vector(); + tmbuf->scattered_data_ = new std::vector(); return 0; } @@ -541,6 +593,8 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread const struct options *opts = t->opts; std::vector *vectors; std::vector *tokens; + std::vector *rx_blks_; + std::vector *scattered_data_; if (!f_mbuf) return -1; @@ -548,6 +602,8 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread cpy_buffer = (unsigned char *)tmbuf->cpy_buffer; vectors = (std::vector *)tmbuf->vectors; tokens = (std::vector *)tmbuf->tokens; + rx_blks_ = (std::vector *)tmbuf->rx_blks_; + scattered_data_ = (std::vector *)tmbuf->scattered_data_; client_fd = socket; @@ -570,7 +626,9 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread if (msg->msg_flags & MSG_CTRUNC) { printf("fatal, cmsg truncated, current msg_controllen\n"); - } + } + + rx_blks_->clear(); ssize_t received = recvmsg(socket, msg, MSG_SOCK_DEVMEM | MSG_DONTWAIT); if (received < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { @@ -601,6 +659,11 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread } struct devmemtoken token = { devmemvec->frag_token, 1 }; + struct TcpDirectRxBlock blk; + + blk.gpu_offset = (uint64_t)devmemvec->frag_offset; + blk.size = devmemvec->frag_size; + rx_blks_->emplace_back(blk); // struct dma_buf_sync sync = { 0 }; // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; @@ -629,6 +692,15 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct 
thread // munmap(buf_mem, n); } + size_t dst_offset = tmbuf->bytes_received; + for (int i = 0; i < rx_blks_->size(); i++) { + struct TcpDirectRxBlock blk = rx_blks_->at(i); + size_t off = (size_t)blk.gpu_offset; + scattered_data_->emplace_back( + make_long3((long)dst_offset, (long)off, (long)blk.size)); + + dst_offset += blk.size; + } tmbuf->bytes_received += received; /* Once we've received fragments totaling buffer_size, we can copy from the @@ -636,10 +708,14 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread * buffer. */ if (tmbuf->bytes_received == buffer_size) { + if (opts->tcpd_rx_cpy) { + GatherRxData(tmbuf); + cudaDeviceSynchronize(); + } /* There is a performance impact when we cudaMemcpy from the CUDA buffer to * the userspace buffer, so it's gated by a flag */ - if (opts->tcpd_rx_cpy || opts->tcpd_validate) { + if (opts->tcpd_validate) { for (int idx = 0; idx < vectors->size(); idx++) { struct devmemvec vec = (*vectors)[idx]; struct devmemtoken token = (*tokens)[idx]; @@ -649,7 +725,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread * occupy bytes [4096-8191], etc. 
*/ cudaMemcpy(cpy_buffer + (vec.frag_token - 1) * PAGE_SIZE, - (char *)tmbuf->gpu_tx_mem_ + vec.frag_offset, + (char *)tmbuf->gpu_gen_mem_ + vec.frag_offset, vec.frag_size, cudaMemcpyDeviceToHost); } @@ -657,22 +733,21 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread /* Ensure the sequence is what we expect: * a repeating sequence of 1 to LAST_PRIME inclusive */ - if (opts->tcpd_validate) { - cudaDeviceSynchronize(); - int i = 0; - int expected_val; - while (i < buffer_size) { - expected_val = (i % LAST_PRIME) + 1; - if (cpy_buffer[i] != expected_val) { - printf("Thread %i - incorrect byte %i, expected %i, got %i\n", - t->index, - i, - expected_val, - cpy_buffer[i]); - break; - } - i++; + cudaDeviceSynchronize(); + int i = 0; + int expected_val; + while (i < buffer_size) { + expected_val = (i % LAST_PRIME) + 1; + if (cpy_buffer[i] != expected_val) { + LOG_WARN(t->cb, + "Thread %i - incorrect byte %i, expected %i, got %i", + t->index, + i, + expected_val, + cpy_buffer[i]); + break; } + i++; } } @@ -685,6 +760,8 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread } vectors->clear(); tokens->clear(); + rx_blks_->clear(); + scattered_data_->clear(); tmbuf->bytes_received = 0; } return total_received; @@ -694,10 +771,15 @@ int cuda_flow_cleanup(void *f_mbuf) { struct tcpdirect_cuda_mbuf *t_mbuf = (struct tcpdirect_cuda_mbuf *)f_mbuf; close(t_mbuf->gpu_mem_fd_); close(t_mbuf->dma_buf_fd_); - cudaFree(t_mbuf->gpu_tx_mem_); + cudaFree(t_mbuf->gpu_gen_mem_); free(t_mbuf->cpy_buffer); free(t_mbuf->tokens); free(t_mbuf->vectors); + + cudaFree(t_mbuf->gpu_rx_mem_); + cudaFree(t_mbuf->gpu_scatter_list_); + free(t_mbuf->rx_blks_); + free(t_mbuf->scattered_data_); return 0; } #endif /* #ifdef WITH_TCPDIRECT */ From b44fef3ff778d9475d0030027b3f9b38bf52c475 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 12 Oct 2023 21:03:56 +0000 Subject: [PATCH 31/72] change WITH_TCPDIRECT to WITH_TCPDEVMEM --- Makefile | 12 
++++++------ check_all_options.c | 4 ++-- define_all_flags.c | 4 ++-- flow.c | 4 ++-- lib.h | 4 ++-- socket.c | 12 ++++++------ stream.c | 8 ++++---- tcpdirect.cu | 4 ++-- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index 754b62c..522cb3c 100644 --- a/Makefile +++ b/Makefile @@ -18,10 +18,10 @@ all: binaries -CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DNDEBUG=1 +CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -ifdef WITH_TCPDIRECT - CFLAGS += -DWITH_TCPDIRECT +ifdef WITH_TCPDEVMEM + CFLAGS += -DWITH_TCPDEVMEM endif lib := \ @@ -53,7 +53,7 @@ lib := \ tcp_rr-objs := tcp_rr_main.o tcp_rr.o rr.o $(lib) tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o $(lib) -ifdef WITH_TCPDIRECT +ifdef WITH_TCPDEVMEM tcp_stream-objs += tcpdirect.o endif @@ -72,13 +72,13 @@ psp_rr-objs := psp_rr_main.o psp_rr.o rr.o psp_lib.o $(lib) ext-libs := -lm -lrt -lpthread tcpdirect.o: tcpdirect.cu - nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDIRECT -c -o $@ $^ + nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM -c -o $@ $^ tcp_rr: $(tcp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) tcp_stream: $(tcp_stream-objs) -ifdef WITH_TCPDIRECT +ifdef WITH_TCPDEVMEM g++ $(LDFLAGS) -o $@ $^ $(ext-libs) -lc -L/usr/local/cuda/lib64 -lcudart -lcuda else $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) diff --git a/check_all_options.c b/check_all_options.c index c730d6b..d65ce65 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -101,7 +101,7 @@ void check_options_tcp_rr(struct options *opts, struct callbacks *cb) void check_options_tcp_stream(struct options *opts, struct callbacks *cb) { -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM if (opts->tcpd_gpu_pci_addr) { CHECK(cb, opts->tcpd_nic_pci_addr, "Must provide NIC PCI address if GPU PCI address was provided."); @@ -124,7 +124,7 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) CHECK(cb, opts->num_flows == 
opts->num_ports, "Number of ports should equal number of flows when running in TCPDirect mode."); } -#endif /* WITH_TCPDIRECT */ +#endif /* WITH_TCPDEVMEM */ } void check_options_udp_rr(struct options *opts, struct callbacks *cb) diff --git a/define_all_flags.c b/define_all_flags.c index 5602e1a..5a755d8 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -141,7 +141,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, bool, enable_write, false, 'w', "Write to flows? Enabled by default for the client"); DEFINE_FLAG(fp, bool, enable_tcp_maerts, false, 'M', "Enables TCP_MAERTS test (server writes and client reads). It overrides enable_read, and enable_write"); DEFINE_FLAG(fp, bool, async_connect, false, 0, "use non blocking connect"); -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM DEFINE_FLAG(fp, bool, tcpd_validate, false, 0, "Validates that received data is a repeating sequence of 1 to 111 inclusive"); DEFINE_FLAG(fp, bool, tcpd_rx_cpy, false, 0, "After the CUDA buffer is filled to buffer_size, calls cudaMemcpy to a userspace buffer"); DEFINE_FLAG(fp, const char *, tcpd_nic_pci_addr, 0, 0, "NIC PCI addr, e.x. 
0000:06:00.0"); @@ -153,7 +153,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, const char *, tcpdirect_link_name, "eth1", 0, "Link name to bind DMA buffer_pages for Rx"); DEFINE_FLAG(fp, int, queue_start, 8, 0, "Queue to start flow-steering at"); DEFINE_FLAG(fp, int, queue_num, 4, 0, "Number of queues to flow-steer to"); -#endif +#endif /* WITH_TCPDEVMEM */ /* Return the updated fp */ return (fp); diff --git a/flow.c b/flow.c index 7dd054f..f24b855 100644 --- a/flow.c +++ b/flow.c @@ -19,7 +19,7 @@ #include "socket.h" #include "thread.h" #include "stats.h" -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM #include "tcpdirect.h" #endif @@ -253,7 +253,7 @@ void flow_delete(struct flow *f) thread_clear_flow_or_die(f->f_thread, f); } -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM if (flow_thread(f)->opts->tcpd_gpu_pci_addr) { cuda_flow_cleanup(f->f_mbuf); } else if (flow_thread(f)->opts->tcpd_nic_pci_addr) { diff --git a/lib.h b/lib.h index 579b0d9..78a187c 100644 --- a/lib.h +++ b/lib.h @@ -106,7 +106,7 @@ struct options { bool async_connect; /* tcp_stream */ -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM bool tcpd_validate; bool tcpd_rx_cpy; const char *tcpd_nic_pci_addr; @@ -118,7 +118,7 @@ struct options { const char *tcpdirect_link_name; int queue_start; int queue_num; -#endif +#endif /* WITH_TCPDEVMEM */ bool enable_read; bool enable_write; bool enable_tcp_maerts; diff --git a/socket.c b/socket.c index e3cdfb2..b33100e 100644 --- a/socket.c +++ b/socket.c @@ -18,7 +18,7 @@ #include "flow.h" #include "socket.h" #include "thread.h" -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM #include "tcpdirect.h" #endif @@ -70,7 +70,7 @@ static void socket_init_not_established(struct thread *t, int s) if (err) PLOG_ERROR(t->cb, "setsockopt(SO_LINGER)"); } -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM if (!t->f_mbuf && opts->tcpd_gpu_pci_addr) { if (tcpdirect_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { LOG_ERROR(t->cb, "%s: failed to setup 
tcpdirect CUDA socket", @@ -80,7 +80,7 @@ static void socket_init_not_established(struct thread *t, int s) } if (opts->tcpd_nic_pci_addr) tcpdirect_setup_socket(s); -#endif +#endif /* WITH_TCPDEVMEM */ } /* @@ -254,7 +254,7 @@ void socket_listen(struct thread *t) struct addrinfo *ai = getaddrinfo_or_die(opts->host, opts->port, &hints, cb); int port = atoi(opts->port); -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM /* TCPDirect: * Since each thread has a CUDA buffer, and * flow-steering rules are required, threads, TCP connections, and @@ -287,7 +287,7 @@ void socket_listen(struct thread *t) switch (ai->ai_socktype) { case SOCK_STREAM: n = opts->num_ports ? opts->num_ports : 1; -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM /* TCPDirect: * See TCPDirect comment above^ * @@ -297,7 +297,7 @@ void socket_listen(struct thread *t) */ if (opts->tcpd_gpu_pci_addr) n = 1; -#endif +#endif /* WITH_TCPDEVMEM */ for (i = 0; i < n; i++) { s = socket_bind_listener(t, ai); socket_init_not_established(t, s); diff --git a/stream.c b/stream.c index 8867431..eb0f831 100644 --- a/stream.c +++ b/stream.c @@ -23,7 +23,7 @@ #include "socket.h" #include "stats.h" #include "thread.h" -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM #include "tcpdirect.h" #endif @@ -31,7 +31,7 @@ static void *stream_alloc(struct thread *t) { const struct options *opts = t->opts; -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM if (!t->f_mbuf && t->opts->tcpd_gpu_pci_addr) { if (tcpdirect_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { LOG_ERROR(t->cb, "%s: failed to setup tcpdirect CUDA socket", @@ -106,7 +106,7 @@ void stream_handler(struct flow *f, uint32_t events) if (events & EPOLLIN) do { do { -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM if (t->opts->tcpd_nic_pci_addr) n = tcpdirect_recv(fd, mbuf, opts->buffer_size, @@ -131,7 +131,7 @@ void stream_handler(struct flow *f, uint32_t events) if (events & EPOLLOUT) do { -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM if (t->opts->tcpd_gpu_pci_addr) { n = 
tcpdirect_send(fd, mbuf, opts->buffer_size, opts->send_flags); }else if (t->opts->tcpd_nic_pci_addr) { diff --git a/tcpdirect.cu b/tcpdirect.cu index 7763e60..bd44618 100644 --- a/tcpdirect.cu +++ b/tcpdirect.cu @@ -1,4 +1,4 @@ -#ifdef WITH_TCPDIRECT +#ifdef WITH_TCPDEVMEM #include #include @@ -782,4 +782,4 @@ int cuda_flow_cleanup(void *f_mbuf) { free(t_mbuf->scattered_data_); return 0; } -#endif /* #ifdef WITH_TCPDIRECT */ +#endif /* #ifdef WITH_TCPDEVMEM */ From 5366563e13eb430d066bfbda29737d44d05637ea Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 12 Oct 2023 21:22:50 +0000 Subject: [PATCH 32/72] tcpdirect filename to tcpdevmem filename remove some extraneous comments tcpdirect -> tcpd (tcpdevmem) --- Makefile | 4 +- check_all_options.c | 18 +++--- define_all_flags.c | 21 ++++--- flow.c | 4 +- flow.h | 26 --------- lib.h | 9 ++- multi_neper.py | 56 ++----------------- socket.c | 14 ++--- stream.c | 14 ++--- tcpdirect.cu => tcpdevmem.cu | 105 ++++++++++++----------------------- tcpdevmem.h | 52 +++++++++++++++++ tcpdirect.h | 26 --------- 12 files changed, 133 insertions(+), 216 deletions(-) rename tcpdirect.cu => tcpdevmem.cu (86%) create mode 100644 tcpdevmem.h delete mode 100644 tcpdirect.h diff --git a/Makefile b/Makefile index 522cb3c..e64ecea 100644 --- a/Makefile +++ b/Makefile @@ -54,7 +54,7 @@ tcp_rr-objs := tcp_rr_main.o tcp_rr.o rr.o $(lib) tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o $(lib) ifdef WITH_TCPDEVMEM - tcp_stream-objs += tcpdirect.o + tcp_stream-objs += tcpdevmem.o endif tcp_crr-objs := tcp_crr_main.o tcp_crr.o rr.o $(lib) @@ -71,7 +71,7 @@ psp_rr-objs := psp_rr_main.o psp_rr.o rr.o psp_lib.o $(lib) ext-libs := -lm -lrt -lpthread -tcpdirect.o: tcpdirect.cu +tcpdevmem.o: tcpdevmem.cu nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM -c -o $@ $^ tcp_rr: $(tcp_rr-objs) diff --git a/check_all_options.c b/check_all_options.c index d65ce65..0f667e7 100644 --- a/check_all_options.c +++ b/check_all_options.c 
@@ -105,24 +105,22 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) if (opts->tcpd_gpu_pci_addr) { CHECK(cb, opts->tcpd_nic_pci_addr, "Must provide NIC PCI address if GPU PCI address was provided."); - CHECK(cb, opts->tcpdirect_phys_len > 0, - "Must provide non-zero --tcpdirect-phys-len flag if GPU PCI address was provided."); - // TODO check page-alignment - // CHECK((CUdeviceptr)gpu_gen_mem_ % PAGE_SIZE == 0); + CHECK(cb, opts->tcpd_phys_len > 0, + "Must provide non-zero --tcpd-phys-len flag if GPU PCI address was provided."); if (opts->client) { CHECK(cb, !opts->tcpd_rx_cpy, "Copying CUDA buffer to userspace only allowed on hosts."); } else { - CHECK(cb, opts->tcpdirect_src_ip, - "Must provide source IP address for TCPDirect host."); - CHECK(cb, opts->tcpdirect_dst_ip, - "Must provide destination IP address for TCPDirect host."); + CHECK(cb, opts->tcpd_src_ip, + "Must provide source IP address for devmem TCP host."); + CHECK(cb, opts->tcpd_dst_ip, + "Must provide destination IP address for devmem TCP host."); } CHECK(cb, opts->num_flows == opts->num_threads, - "Thread/Flow count must be equal when running in TCPDirect mode."); + "Thread/Flow count must be equal when running in devmem TCP mode."); CHECK(cb, opts->num_flows == opts->num_ports, - "Number of ports should equal number of flows when running in TCPDirect mode."); + "Number of ports should equal number of flows when running in devmem TCP mode."); } #endif /* WITH_TCPDEVMEM */ } diff --git a/define_all_flags.c b/define_all_flags.c index 5a755d8..3bbcf15 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -142,17 +142,16 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, bool, enable_tcp_maerts, false, 'M', "Enables TCP_MAERTS test (server writes and client reads). 
It overrides enable_read, and enable_write"); DEFINE_FLAG(fp, bool, async_connect, false, 0, "use non blocking connect"); #ifdef WITH_TCPDEVMEM - DEFINE_FLAG(fp, bool, tcpd_validate, false, 0, "Validates that received data is a repeating sequence of 1 to 111 inclusive"); - DEFINE_FLAG(fp, bool, tcpd_rx_cpy, false, 0, "After the CUDA buffer is filled to buffer_size, calls cudaMemcpy to a userspace buffer"); - DEFINE_FLAG(fp, const char *, tcpd_nic_pci_addr, 0, 0, "NIC PCI addr, e.x. 0000:06:00.0"); - DEFINE_FLAG(fp, const char *, tcpd_gpu_pci_addr, 0, 0, "GPU PCI addr, e.x. 0000:04:00.0"); - DEFINE_FLAG(fp, unsigned long long, tcpdirect_phys_addr, 0, 0, "Set the remote memory physical address for tcpdirect, e.x. 0000:06:00.0"); - DEFINE_FLAG(fp, unsigned long long, tcpdirect_phys_len, 0, 0, "Set the remote memory length for tcpdirect"); - DEFINE_FLAG(fp, const char *, tcpdirect_src_ip, 0, 0, "Set the src ip address for tcpdirect"); - DEFINE_FLAG(fp, const char *, tcpdirect_dst_ip, 0, 0, "Set the dst ip address for tcpdirect"); - DEFINE_FLAG(fp, const char *, tcpdirect_link_name, "eth1", 0, "Link name to bind DMA buffer_pages for Rx"); - DEFINE_FLAG(fp, int, queue_start, 8, 0, "Queue to start flow-steering at"); - DEFINE_FLAG(fp, int, queue_num, 4, 0, "Number of queues to flow-steer to"); + DEFINE_FLAG(fp, bool, tcpd_validate, false, 0, "Validates that received data is a repeating sequence of 1 to 111 inclusive"); + DEFINE_FLAG(fp, bool, tcpd_rx_cpy, false, 0, "After the CUDA buffer is filled to buffer_size, calls cudaMemcpy to a userspace buffer"); + DEFINE_FLAG(fp, const char *, tcpd_nic_pci_addr, 0, 0, "NIC PCI addr, e.x. 0000:06:00.0"); + DEFINE_FLAG(fp, const char *, tcpd_gpu_pci_addr, 0, 0, "GPU PCI addr, e.x. 
0000:04:00.0"); + DEFINE_FLAG(fp, unsigned long long, tcpd_phys_len, 0, 0, "Remote memory length for tcpdevmem"); + DEFINE_FLAG(fp, const char *, tcpd_src_ip, 0, 0, "Src ip address for tcpdevmem"); + DEFINE_FLAG(fp, const char *, tcpd_dst_ip, 0, 0, "Dst ip address for tcpdevmem"); + DEFINE_FLAG(fp, const char *, tcpd_link_name, "eth1", 0, "Link name to bind DMA buffer_pages for Rx"); + DEFINE_FLAG(fp, int, queue_start, 8, 0, "Queue to start flow-steering at"); + DEFINE_FLAG(fp, int, queue_num, 4, 0, "Number of queues to flow-steer to"); #endif /* WITH_TCPDEVMEM */ /* Return the updated fp */ diff --git a/flow.c b/flow.c index f24b855..1f1042c 100644 --- a/flow.c +++ b/flow.c @@ -20,7 +20,7 @@ #include "thread.h" #include "stats.h" #ifdef WITH_TCPDEVMEM -#include "tcpdirect.h" +#include "tcpdevmem.h" #endif /* @@ -257,7 +257,7 @@ void flow_delete(struct flow *f) if (flow_thread(f)->opts->tcpd_gpu_pci_addr) { cuda_flow_cleanup(f->f_mbuf); } else if (flow_thread(f)->opts->tcpd_nic_pci_addr) { - struct tcpdirect_udma_mbuf *t_mbuf = (struct tcpdirect_udma_mbuf *)f->f_mbuf; + struct tcpdevmem_udma_mbuf *t_mbuf = (struct tcpdevmem_udma_mbuf *)f->f_mbuf; close(t_mbuf->buf_pages); close(t_mbuf->buf); diff --git a/flow.h b/flow.h index fb7b755..c56691a 100644 --- a/flow.h +++ b/flow.h @@ -26,32 +26,6 @@ struct flow; /* note: struct is defined opaquely within flow.c */ struct neper_stat; struct thread; -struct tcpdirect_udma_mbuf { - struct msghdr msg; - int dmabuf_fd; - int pages_fd; - - int devfd; - int memfd; - int buf; - int buf_pages; -}; - -struct tcpdirect_cuda_mbuf { - int gpu_mem_fd_; - int dma_buf_fd_; - void *gpu_gen_mem_; - void *gpu_rx_mem_; - void *gpu_scatter_list_; - void *scattered_data_; - void *rx_blks_; - void *cpy_buffer; - size_t bytes_received; - size_t bytes_sent; - void *tokens; - void *vectors; -}; - typedef void (*flow_handler)(struct flow *, uint32_t); /* Simple accessors. 
*/ diff --git a/lib.h b/lib.h index 78a187c..35eef24 100644 --- a/lib.h +++ b/lib.h @@ -111,11 +111,10 @@ struct options { bool tcpd_rx_cpy; const char *tcpd_nic_pci_addr; const char *tcpd_gpu_pci_addr; - unsigned long long tcpdirect_phys_addr; - unsigned long long tcpdirect_phys_len; - const char *tcpdirect_src_ip; - const char *tcpdirect_dst_ip; - const char *tcpdirect_link_name; + unsigned long long tcpd_phys_len; + const char *tcpd_src_ip; + const char *tcpd_dst_ip; + const char *tcpd_link_name; int queue_start; int queue_num; #endif /* WITH_TCPDEVMEM */ diff --git a/multi_neper.py b/multi_neper.py index bd35734..1ee87f0 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -43,36 +43,6 @@ def run_pre_neper_cmds(dev: str): for cmd in cmds: subprocess.run(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) -# adds flow-steering rules, e.x. -# ethtool -N eth1 flow-type tcp4 ... -def install_flow_steer_rules(dev, threads: int, src_port, port, src_ip, dst_ip, q_start, q_num)->list: - subprocesses, rules = [], [] - - for i in range(threads): - queue = q_start + (i % q_num) - flow_steering_cmd = f"ethtool -N {dev} flow-type tcp4 src-ip {src_ip} dst-ip {dst_ip} src-port {src_port + i} dst-port {port} queue {queue}" - debug(flow_steering_cmd) - sp = subprocess.run(flow_steering_cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - subprocesses.append(sp) - - line = sp.stdout.strip() - # the expected output will be similar to: - # "Added rule with ID 19989" - if "Added rule with ID" in line: - rule = line.split()[-1] - debug(f"[{dev}] added rule {rule}: {src_ip} {dst_ip} {src_port + i} {port}") - rules.append(rule) - - return rules - - -# deletes flow-steering rules, given a list of rules and a link name -def del_flow_steer_rules(dev: str, rules: list): - for rule in rules: - del_cmd = f"ethtool -N {dev} delete {rule}" - debug(f"[{dev}] deleting rule {rule}") - subprocess.run(del_cmd.split(), stdout=subprocess.PIPE, 
stderr=subprocess.STDOUT, text=True) - # returns a 2-tuple of a Neper command and a dict of env vars def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, threads: int, flows: int, @@ -83,7 +53,7 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, tcpd_validate, tcpd_rx_cpy)->tuple: cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream" - f" -T {threads} -F {flows} --tcpdirect-phys-len {phys_len}" + f" -T {threads} -F {flows} --tcpd-phys-len {phys_len}" f" --port {port} --source-port {source_port}" f" --control-port {control_port}" f" --buffer-size {buffer_size} --tcpd-nic-pci-addr {nic_pci}" @@ -96,9 +66,9 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, if is_client: cmd += f" -c -H {dst_ip}" else: - cmd = cmd + (f" --tcpdirect-link-name {dev}" - f" --tcpdirect-src-ip {src_ip}" - f" --tcpdirect-dst-ip {dst_ip}" + cmd = cmd + (f" --tcpd-link-name {dev}" + f" --tcpd-src-ip {src_ip}" + f" --tcpd-dst-ip {dst_ip}" f" --queue-start {queue_start}" f" --queue-num {queue_num}") if tcpd_rx_cpy: @@ -194,17 +164,6 @@ def parse_subprocess_outputs(subprocesses): if not args.dry_run: run_pre_neper_cmds(dev) - # TODO flow-steering rules installed in Neper now - # control_port = args.control_port + i - # starting_port = i * args.threads + args.source_port - # dev = devices[i] - # src_ip, dst_ip = src_ips[i], hosts[i] - - # # TODO port_start q_start, q_num - # dst_port = args.port + i - # rules = install_flow_steer_rules(dev, args.threads, starting_port, dst_port, src_ip, dst_ip, args.q_start, args.q_num) - # dev_to_rule[dev] = rules - cmds = [] debug(f"running on {devices}") is_client = args.client @@ -244,10 +203,3 @@ def parse_subprocess_outputs(subprocesses): print(f"[{dev}] Throughput (Mb/s): {i['throughput']}") except KeyError: print(f"[{dev}] Throughput (Mb/s): NA") - - # TODO remove, flow-steering rules are installed via Neper now - # delete flow-steering rules - # if not args.client: - # info("deleting flow-steering rules") 
- # for dev in dev_to_rule: - # del_flow_steer_rules(dev, dev_to_rule[dev]) diff --git a/socket.c b/socket.c index b33100e..a8bd91f 100644 --- a/socket.c +++ b/socket.c @@ -19,7 +19,7 @@ #include "socket.h" #include "thread.h" #ifdef WITH_TCPDEVMEM -#include "tcpdirect.h" +#include "tcpdevmem.h" #endif #ifndef NO_LIBNUMA @@ -72,14 +72,14 @@ static void socket_init_not_established(struct thread *t, int s) } #ifdef WITH_TCPDEVMEM if (!t->f_mbuf && opts->tcpd_gpu_pci_addr) { - if (tcpdirect_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { - LOG_ERROR(t->cb, "%s: failed to setup tcpdirect CUDA socket", + if (tcpd_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { + LOG_FATAL(t->cb, "%s: failed to setup devmem CUDA socket", __func__); exit(1); } } if (opts->tcpd_nic_pci_addr) - tcpdirect_setup_socket(s); + tcpd_setup_socket(s); #endif /* WITH_TCPDEVMEM */ } @@ -255,7 +255,7 @@ void socket_listen(struct thread *t) cb); int port = atoi(opts->port); #ifdef WITH_TCPDEVMEM - /* TCPDirect: + /* TCP Devmem: * Since each thread has a CUDA buffer, and * flow-steering rules are required, threads, TCP connections, and * CUDA buffers need to be 1:1:1. @@ -288,8 +288,8 @@ void socket_listen(struct thread *t) case SOCK_STREAM: n = opts->num_ports ? 
opts->num_ports : 1; #ifdef WITH_TCPDEVMEM - /* TCPDirect: - * See TCPDirect comment above^ + /* TCP Devmem: + * See TCP Devmem comment above^ * * We are co-opting the num_ports option, so each thread/flow * listens on a port that's 1 larger than the previous thread's diff --git a/stream.c b/stream.c index eb0f831..be7c259 100644 --- a/stream.c +++ b/stream.c @@ -24,7 +24,7 @@ #include "stats.h" #include "thread.h" #ifdef WITH_TCPDEVMEM -#include "tcpdirect.h" +#include "tcpdevmem.h" #endif static void *stream_alloc(struct thread *t) @@ -33,8 +33,8 @@ static void *stream_alloc(struct thread *t) #ifdef WITH_TCPDEVMEM if (!t->f_mbuf && t->opts->tcpd_gpu_pci_addr) { - if (tcpdirect_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { - LOG_ERROR(t->cb, "%s: failed to setup tcpdirect CUDA socket", + if (tcpd_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { + LOG_FATAL(t->cb, "%s: failed to setup devmem CUDA socket", __func__); exit(1); } @@ -42,7 +42,7 @@ static void *stream_alloc(struct thread *t) if (!t->f_mbuf && t->opts->tcpd_nic_pci_addr) { if (udmabuf_setup_alloc(t->opts, &t->f_mbuf)) { - LOG_ERROR(t->cb, "%s: failed to setup tcpdirect UDMA socket", + LOG_FATAL(t->cb, "%s: failed to setup devmem UDMABUF socket", __func__); exit(1); } @@ -108,7 +108,7 @@ void stream_handler(struct flow *f, uint32_t events) do { #ifdef WITH_TCPDEVMEM if (t->opts->tcpd_nic_pci_addr) - n = tcpdirect_recv(fd, mbuf, + n = tcpd_recv(fd, mbuf, opts->buffer_size, opts->recv_flags, t); @@ -133,9 +133,9 @@ void stream_handler(struct flow *f, uint32_t events) do { #ifdef WITH_TCPDEVMEM if (t->opts->tcpd_gpu_pci_addr) { - n = tcpdirect_send(fd, mbuf, opts->buffer_size, opts->send_flags); + n = tcpd_send(fd, mbuf, opts->buffer_size, opts->send_flags); }else if (t->opts->tcpd_nic_pci_addr) { - n = tcpdirect_udma_send(fd, mbuf, + n = tcpd_udma_send(fd, mbuf, opts->buffer_size, opts->send_flags); } else #endif diff --git a/tcpdirect.cu b/tcpdevmem.cu similarity index 86% rename from tcpdirect.cu rename to 
tcpdevmem.cu index bd44618..e77e9f7 100644 --- a/tcpdirect.cu +++ b/tcpdevmem.cu @@ -20,7 +20,7 @@ #include #include -#include "tcpdirect.h" +#include "tcpdevmem.h" #include "logging.h" #include "flow.h" #include "thread.h" @@ -100,7 +100,7 @@ struct devmemtoken { __u32 token_count; }; -struct TcpDirectRxBlock { +struct TcpdRxBlock { uint64_t gpu_offset; size_t size; uint64_t paddr; @@ -166,7 +166,7 @@ __global__ void scatter_copy_kernel(long3* scatter_list, uint8_t* dst, } } -void GatherRxData(struct tcpdirect_cuda_mbuf *tmbuf) { +void gather_rx_data(struct tcpdevmem_cuda_mbuf *tmbuf) { int ret; void *gpu_scatter_list_ = tmbuf->gpu_scatter_list_; std::vector *scattered_data_ = (std::vector *)tmbuf->scattered_data_; @@ -184,13 +184,13 @@ void GatherRxData(struct tcpdirect_cuda_mbuf *tmbuf) { (long3*)gpu_scatter_list_, (uint8_t*)gpu_rx_mem_, (uint8_t*)rx_buff_); } -int tcpdirect_setup_socket(int socket) { +int tcpd_setup_socket(int socket) { const int one = 1; if (setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) || setsockopt(socket, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) || setsockopt(socket, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) ) { - perror("tcpdirect_setup_socket"); + perror("tcpd_setup_socket"); exit(EXIT_FAILURE); } @@ -242,7 +242,7 @@ err_close_dmabuf: return err; } -int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) +int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) { bool is_client = opts->client; int ret; @@ -251,13 +251,13 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct int dma_buf_fd_; int q_start = opts->queue_start; int q_num = opts->queue_num; - struct tcpdirect_cuda_mbuf *tmbuf; - const char *gpu_pci_addr = opts->tcpd_gpu_pci_addr; // "0000:04:00.0" - const char *nic_pci_addr = opts->tcpd_nic_pci_addr; // "0000:06:00.0" - size_t alloc_size = opts->tcpdirect_phys_len; + struct tcpdevmem_cuda_mbuf *tmbuf; + const 
char *gpu_pci_addr = opts->tcpd_gpu_pci_addr; + const char *nic_pci_addr = opts->tcpd_nic_pci_addr; + size_t alloc_size = opts->tcpd_phys_len; tmbuf = - (struct tcpdirect_cuda_mbuf *)calloc(1, sizeof(struct tcpdirect_cuda_mbuf)); + (struct tcpdevmem_cuda_mbuf *)calloc(1, sizeof(struct tcpdevmem_cuda_mbuf)); if (!tmbuf) { exit(EXIT_FAILURE); } @@ -266,13 +266,6 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct alloc_size += GPUMEM_ALIGNMENT - (alloc_size % GPUMEM_ALIGNMENT); } - // unnecessary if CUDA_VISIBLE_DEVICES env var is set - // ret = cudaSetDevice(opts->tcpdirect_gpu_idx); - // if (ret != 0) { - // printf("cudaSetDevice failed: index %i", opts->tcpdirect_gpu_idx); - // exit(70); - // } - cudaMalloc(&gpu_gen_mem_, alloc_size); if (is_client && opts->tcpd_validate) { fill_tx_buffer(gpu_gen_mem_, alloc_size); @@ -297,7 +290,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct printf("Bind to queue %i\n", num_queues); struct dma_buf_pages_bind_rx_queue bind_cmd; - strcpy(bind_cmd.ifname, opts->tcpdirect_link_name); + strcpy(bind_cmd.ifname, opts->tcpd_link_name); bind_cmd.rxq_idx = num_queues; ret = ioctl(gpu_mem_fd_, DMA_BUF_PAGES_BIND_RX, &bind_cmd); @@ -315,17 +308,17 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct char flow_steer_cmd[512]; sprintf(flow_steer_cmd, "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue %i", - opts->tcpdirect_link_name, opts->tcpdirect_src_ip, opts->tcpdirect_dst_ip, src_port, dst_port, num_queues); + opts->tcpd_link_name, opts->tcpd_src_ip, opts->tcpd_dst_ip, src_port, dst_port, num_queues); ret = system(flow_steer_cmd); // only running the below ethtool commands after last thread/flow is setup if (flow_idx + t->flow_limit >= opts->num_flows) { char ethtool_cmd[512]; - sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpdirect_link_name); + sprintf(ethtool_cmd, 
"ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpd_link_name); ret = ret | system(ethtool_cmd); - sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-header-split on", opts->tcpdirect_link_name); + sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-header-split on", opts->tcpd_link_name); ret = ret | system(ethtool_cmd); - sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpdirect_link_name); + sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpd_link_name); ret = ret | system(ethtool_cmd); printf("ethtool cmds returned %i, sleeping 1...\n", ret); sleep(1); @@ -344,7 +337,7 @@ int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct cudaMalloc(&tmbuf->gpu_rx_mem_, opts->buffer_size); cudaMalloc(&tmbuf->gpu_scatter_list_, opts->buffer_size); - tmbuf->rx_blks_ = new std::vector(); + tmbuf->rx_blks_ = new std::vector(); tmbuf->scattered_data_ = new std::vector(); return 0; } @@ -356,9 +349,9 @@ int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) { int buf; int buf_pages; int ret; - size_t size = opts->tcpdirect_phys_len; + size_t size = opts->tcpd_phys_len; - struct tcpdirect_udma_mbuf *tmbuf; + struct tcpdevmem_udma_mbuf *tmbuf; struct dma_buf_create_pages_info pages_create_info; struct udmabuf_create create; @@ -366,7 +359,7 @@ int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) { if (*f_mbuf) return 0; - tmbuf = (struct tcpdirect_udma_mbuf *)calloc(1, sizeof(struct tcpdirect_udma_mbuf)); + tmbuf = (struct tcpdevmem_udma_mbuf *)calloc(1, sizeof(struct tcpdevmem_udma_mbuf)); if (!tmbuf) { exit(EXIT_FAILURE); } @@ -411,8 +404,6 @@ int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) { pages_create_info.dma_buf_fd = buf; pages_create_info.create_page_pool = is_client ? 
0 : 1; - /* TODO: hardcoded NIC pci address */ - // "0000:06:00.0" ret = sscanf(opts->tcpd_nic_pci_addr, "0000:%llx:%llx.%llx", &pages_create_info.pci_bdf[0], &pages_create_info.pci_bdf[1], @@ -464,18 +455,18 @@ int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) { return 0; } -int tcpdirect_udma_send(int socket, void *f_mbuf, size_t n, int flags) { +int tcpd_udma_send(int socket, void *f_mbuf, size_t n, int flags) { int buf_pages, buf; struct iovec iov; struct msghdr *msg; struct cmsghdr *cmsg; char buf_dummy[n]; char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; - struct tcpdirect_udma_mbuf *tmbuf; + struct tcpdevmem_udma_mbuf *tmbuf; if (!f_mbuf) return -1; - tmbuf = (struct tcpdirect_udma_mbuf *)f_mbuf; + tmbuf = (struct tcpdevmem_udma_mbuf *)f_mbuf; buf_pages = tmbuf->buf_pages; buf = tmbuf->buf; msg = &tmbuf->msg; @@ -531,17 +522,17 @@ int tcpdirect_udma_send(int socket, void *f_mbuf, size_t n, int flags) { return bytes_sent; } -int tcpdirect_send(int socket, void *buf, size_t n, int flags) { +int tcpd_send(int socket, void *buf, size_t n, int flags) { int gpu_mem_fd_; struct iovec iov; struct msghdr msg; struct cmsghdr *cmsg; char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; - struct tcpdirect_cuda_mbuf *tmbuf; + struct tcpdevmem_cuda_mbuf *tmbuf; if (!buf) return -1; - tmbuf = (struct tcpdirect_cuda_mbuf *)buf; + tmbuf = (struct tcpdevmem_cuda_mbuf *)buf; gpu_mem_fd_ = tmbuf->gpu_mem_fd_; memset(&msg, 0, sizeof(msg)); @@ -581,34 +572,33 @@ int tcpdirect_send(int socket, void *buf, size_t n, int flags) { return bytes_sent; } -int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { +int tcpd_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { struct iovec iov; struct msghdr msg_local; struct msghdr *msg; - struct tcpdirect_cuda_mbuf *tmbuf; - int ret, client_fd; // buf + struct tcpdevmem_cuda_mbuf *tmbuf; + int ret, client_fd; int buffer_size = n; size_t total_received = 0; unsigned char 
*cpy_buffer; const struct options *opts = t->opts; std::vector *vectors; std::vector *tokens; - std::vector *rx_blks_; + std::vector *rx_blks_; std::vector *scattered_data_; if (!f_mbuf) return -1; - tmbuf = (struct tcpdirect_cuda_mbuf *)f_mbuf; + tmbuf = (struct tcpdevmem_cuda_mbuf *)f_mbuf; cpy_buffer = (unsigned char *)tmbuf->cpy_buffer; vectors = (std::vector *)tmbuf->vectors; tokens = (std::vector *)tmbuf->tokens; - rx_blks_ = (std::vector *)tmbuf->rx_blks_; + rx_blks_ = (std::vector *)tmbuf->rx_blks_; scattered_data_ = (std::vector *)tmbuf->scattered_data_; client_fd = socket; char buf_dummy[n]; - // char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 128)]; char offsetbuf[CMSG_SPACE(sizeof(int) * 1000)]; msg = &msg_local; @@ -659,42 +649,21 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread } struct devmemtoken token = { devmemvec->frag_token, 1 }; - struct TcpDirectRxBlock blk; + struct TcpdRxBlock blk; blk.gpu_offset = (uint64_t)devmemvec->frag_offset; blk.size = devmemvec->frag_size; rx_blks_->emplace_back(blk); - // struct dma_buf_sync sync = { 0 }; - // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; - // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); - - // buf_mem = (char *)mmap(NULL, n, PROT_READ | PROT_WRITE, - // MAP_SHARED, buf, 0); - // if (buf_mem == MAP_FAILED) { - // perror("mmap()"); - // exit(1); - // } total_received += devmemvec->frag_size; - // printf("\n\nreceived frag_page=%u, in_page_offset=%u," - // " frag_offset=%u, frag_size=%u, token=%u" - // " total_received=%lu\n", - // devmemvec->frag_offset >> PAGE_SHIFT, - // devmemvec->frag_offset % PAGE_SIZE, - // devmemvec->frag_offset, devmemvec->frag_size, - // devmemvec->frag_token, - // total_received); - - // sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; - // ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + vectors->emplace_back(*devmemvec); tokens->push_back(token); - // munmap(buf_mem, n); } size_t dst_offset = tmbuf->bytes_received; for (int i = 0; i < 
rx_blks_->size(); i++) { - struct TcpDirectRxBlock blk = rx_blks_->at(i); + struct TcpdRxBlock blk = rx_blks_->at(i); size_t off = (size_t)blk.gpu_offset; scattered_data_->emplace_back( make_long3((long)dst_offset, (long)off, (long)blk.size)); @@ -709,7 +678,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread */ if (tmbuf->bytes_received == buffer_size) { if (opts->tcpd_rx_cpy) { - GatherRxData(tmbuf); + gather_rx_data(tmbuf); cudaDeviceSynchronize(); } /* There is a performance impact when we cudaMemcpy from the CUDA buffer to @@ -768,7 +737,7 @@ int tcpdirect_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread } int cuda_flow_cleanup(void *f_mbuf) { - struct tcpdirect_cuda_mbuf *t_mbuf = (struct tcpdirect_cuda_mbuf *)f_mbuf; + struct tcpdevmem_cuda_mbuf *t_mbuf = (struct tcpdevmem_cuda_mbuf *)f_mbuf; close(t_mbuf->gpu_mem_fd_); close(t_mbuf->dma_buf_fd_); cudaFree(t_mbuf->gpu_gen_mem_); diff --git a/tcpdevmem.h b/tcpdevmem.h new file mode 100644 index 0000000..f34c5a4 --- /dev/null +++ b/tcpdevmem.h @@ -0,0 +1,52 @@ +#ifndef THIRD_PARTY_NEPER_DEVMEM_H_ +#define THIRD_PARTY_NEPER_DEVMEM_H_ + +#if __cplusplus +extern "C" { +#endif + +#include + +#include "common.h" +#include "flags.h" +#include "lib.h" + +struct tcpdevmem_udma_mbuf { + struct msghdr msg; + int dmabuf_fd; + int pages_fd; + + int devfd; + int memfd; + int buf; + int buf_pages; +}; + +struct tcpdevmem_cuda_mbuf { + int gpu_mem_fd_; + int dma_buf_fd_; + void *gpu_gen_mem_; + void *gpu_rx_mem_; + void *gpu_scatter_list_; + void *scattered_data_; + void *rx_blks_; + void *cpy_buffer; + size_t bytes_received; + size_t bytes_sent; + void *tokens; + void *vectors; +}; + +int tcpd_setup_socket(int socket); +int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t); +int cuda_flow_cleanup(void *f_mbuf); +int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf); +int tcpd_send(int socket, void *buf, size_t n, int flags); +int 
tcpd_udma_send(int fd, void *buf, size_t n, int flags); +int tcpd_recv(int fd, void *f_mbuf, size_t n, int flags, struct thread *t); + +#if __cplusplus +} +#endif + +#endif // THIRD_PARTY_NEPER_DEVMEM_H_ diff --git a/tcpdirect.h b/tcpdirect.h deleted file mode 100644 index fd0157d..0000000 --- a/tcpdirect.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef THIRD_PARTY_NEPER_TCPDIRECT_H_ -#define THIRD_PARTY_NEPER_TCPDIRECT_H_ - -#if __cplusplus -extern "C" { -#endif - -#include - -#include "common.h" -#include "flags.h" -#include "lib.h" - -int tcpdirect_setup_socket(int socket); -int tcpdirect_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t); -int cuda_flow_cleanup(void *f_mbuf); -int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf); -int tcpdirect_send(int socket, void *buf, size_t n, int flags); -int tcpdirect_udma_send(int fd, void *buf, size_t n, int flags); -int tcpdirect_recv(int fd, void *f_mbuf, size_t n, int flags, struct thread *t); - -#if __cplusplus -} -#endif - -#endif // THIRD_PARTY_NEPER_TCPDIRECT_H_ From b717eff0a761708ca556bdf66d653664c88d5883 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 25 Oct 2023 15:16:16 +0000 Subject: [PATCH 33/72] UDMAbuf support, splitting cuda/udma into diff files --- Makefile | 25 +- check_all_options.c | 24 +- define_all_flags.c | 4 +- flow.c | 18 +- lib.h | 4 +- multi_neper.py | 21 +- socket.c | 23 +- stream.c | 40 ++-- tcpdevmem.c | 75 ++++++ tcpdevmem.h | 101 ++++---- tcpdevmem.cu => tcpdevmem_cuda.cu | 371 +++++------------------------- tcpdevmem_cuda.h | 41 ++++ tcpdevmem_udma.c | 305 ++++++++++++++++++++++++ tcpdevmem_udma.h | 37 +++ 14 files changed, 681 insertions(+), 408 deletions(-) create mode 100644 tcpdevmem.c rename tcpdevmem.cu => tcpdevmem_cuda.cu (59%) create mode 100644 tcpdevmem_cuda.h create mode 100644 tcpdevmem_udma.c create mode 100644 tcpdevmem_udma.h diff --git a/Makefile b/Makefile index e64ecea..ba309f4 100644 --- a/Makefile +++ b/Makefile @@ -20,9 
+20,15 @@ all: binaries CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -ifdef WITH_TCPDEVMEM - CFLAGS += -DWITH_TCPDEVMEM +ifdef WITH_TCPDEVMEM_CUDA + CFLAGS += -DWITH_TCPDEVMEM_CUDA endif +ifdef WITH_TCPDEVMEM_UDMA + CFLAGS += -DWITH_TCPDEVMEM_UDMA +endif + +ifndef_any_of = $(filter undefined,$(foreach v,$(1),$(origin $(v)))) +ifdef_any_of = $(filter-out undefined,$(foreach v,$(1),$(origin $(v)))) lib := \ check_all_options.o \ @@ -53,10 +59,17 @@ lib := \ tcp_rr-objs := tcp_rr_main.o tcp_rr.o rr.o $(lib) tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o $(lib) -ifdef WITH_TCPDEVMEM +ifdef WITH_TCPDEVMEM_CUDA + tcp_stream-objs += tcpdevmem_cuda.o +endif +ifdef WITH_TCPDEVMEM_UDMA + tcp_stream-objs += tcpdevmem_udma.o +endif +ifneq ($(call ifdef_any_of,WITH_TCPDEVMEM_CUDA WITH_TCPDEVMEM_UDMA),) tcp_stream-objs += tcpdevmem.o endif + tcp_crr-objs := tcp_crr_main.o tcp_crr.o rr.o $(lib) udp_rr-objs := udp_rr_main.o udp_rr.o rr.o $(lib) @@ -71,14 +84,14 @@ psp_rr-objs := psp_rr_main.o psp_rr.o rr.o psp_lib.o $(lib) ext-libs := -lm -lrt -lpthread -tcpdevmem.o: tcpdevmem.cu - nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM -c -o $@ $^ +tcpdevmem_cuda.o: tcpdevmem_cuda.cu + nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM_CUDA -c -o $@ $^ tcp_rr: $(tcp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) tcp_stream: $(tcp_stream-objs) -ifdef WITH_TCPDEVMEM +ifdef WITH_TCPDEVMEM_CUDA g++ $(LDFLAGS) -o $@ $^ $(ext-libs) -lc -L/usr/local/cuda/lib64 -lcudart -lcuda else $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) diff --git a/check_all_options.c b/check_all_options.c index 0f667e7..bfca4c1 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -101,28 +101,34 @@ void check_options_tcp_rr(struct options *opts, struct callbacks *cb) void check_options_tcp_stream(struct options *opts, struct callbacks *cb) { -#ifdef WITH_TCPDEVMEM +#ifdef WITH_TCPDEVMEM_CUDA if (opts->tcpd_gpu_pci_addr) { CHECK(cb, opts->tcpd_nic_pci_addr, 
"Must provide NIC PCI address if GPU PCI address was provided."); - CHECK(cb, opts->tcpd_phys_len > 0, - "Must provide non-zero --tcpd-phys-len flag if GPU PCI address was provided."); if (opts->client) { CHECK(cb, !opts->tcpd_rx_cpy, "Copying CUDA buffer to userspace only allowed on hosts."); - } else { - CHECK(cb, opts->tcpd_src_ip, - "Must provide source IP address for devmem TCP host."); - CHECK(cb, opts->tcpd_dst_ip, - "Must provide destination IP address for devmem TCP host."); } + } +#endif /* WITH_TCPDEVMEM_CUDA */ +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) + if (opts->tcpd_nic_pci_addr) { + CHECK(cb, opts->tcpd_phys_len > 0, + "Must provide non-zero --tcpd-phys-len flag when running in devmem TCP mode."); CHECK(cb, opts->num_flows == opts->num_threads, "Thread/Flow count must be equal when running in devmem TCP mode."); CHECK(cb, opts->num_flows == opts->num_ports, "Number of ports should equal number of flows when running in devmem TCP mode."); + + if (!opts->client) { + CHECK(cb, opts->tcpd_src_ip, + "Must provide source IP address for devmem TCP host."); + CHECK(cb, opts->tcpd_dst_ip, + "Must provide destination IP address for devmem TCP host."); + } } -#endif /* WITH_TCPDEVMEM */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ } void check_options_udp_rr(struct options *opts, struct callbacks *cb) diff --git a/define_all_flags.c b/define_all_flags.c index 3bbcf15..42234da 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -141,7 +141,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, bool, enable_write, false, 'w', "Write to flows? Enabled by default for the client"); DEFINE_FLAG(fp, bool, enable_tcp_maerts, false, 'M', "Enables TCP_MAERTS test (server writes and client reads). 
It overrides enable_read, and enable_write"); DEFINE_FLAG(fp, bool, async_connect, false, 0, "use non blocking connect"); -#ifdef WITH_TCPDEVMEM +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) DEFINE_FLAG(fp, bool, tcpd_validate, false, 0, "Validates that received data is a repeating sequence of 1 to 111 inclusive"); DEFINE_FLAG(fp, bool, tcpd_rx_cpy, false, 0, "After the CUDA buffer is filled to buffer_size, calls cudaMemcpy to a userspace buffer"); DEFINE_FLAG(fp, const char *, tcpd_nic_pci_addr, 0, 0, "NIC PCI addr, e.x. 0000:06:00.0"); @@ -152,7 +152,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, const char *, tcpd_link_name, "eth1", 0, "Link name to bind DMA buffer_pages for Rx"); DEFINE_FLAG(fp, int, queue_start, 8, 0, "Queue to start flow-steering at"); DEFINE_FLAG(fp, int, queue_num, 4, 0, "Number of queues to flow-steer to"); -#endif /* WITH_TCPDEVMEM */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ /* Return the updated fp */ return (fp); diff --git a/flow.c b/flow.c index 1f1042c..5f1543e 100644 --- a/flow.c +++ b/flow.c @@ -19,9 +19,12 @@ #include "socket.h" #include "thread.h" #include "stats.h" -#ifdef WITH_TCPDEVMEM -#include "tcpdevmem.h" -#endif +#ifdef WITH_TCPDEVMEM_CUDA +#include "tcpdevmem_cuda.h" +#endif /* WITH_TCPDEVMEM_CUDA */ +#ifdef WITH_TCPDEVMEM_UDMA +#include "tcpdevmem_udma.h" +#endif /* WITH_TCPDEVMEM_UDMA */ /* * We define the flow struct locally to this file to force outside users to go @@ -253,10 +256,13 @@ void flow_delete(struct flow *f) thread_clear_flow_or_die(f->f_thread, f); } -#ifdef WITH_TCPDEVMEM +#ifdef WITH_TCPDEVMEM_CUDA if (flow_thread(f)->opts->tcpd_gpu_pci_addr) { cuda_flow_cleanup(f->f_mbuf); - } else if (flow_thread(f)->opts->tcpd_nic_pci_addr) { + } else +#endif /* WITH_TCPDEVMEM_CUDA */ +#ifdef WITH_TCPDEVMEM_UDMA + if (flow_thread(f)->opts->tcpd_nic_pci_addr) { struct tcpdevmem_udma_mbuf *t_mbuf = (struct tcpdevmem_udma_mbuf *)f->f_mbuf; 
close(t_mbuf->buf_pages); @@ -264,7 +270,7 @@ void flow_delete(struct flow *f) close(t_mbuf->memfd); close(t_mbuf->devfd); } -#endif +#endif /* WITH_TCPDEVMEM_UDMA */ /* TODO: need to free the stat struct here for crr tests */ free(f->f_opaque); diff --git a/lib.h b/lib.h index 35eef24..5e4c840 100644 --- a/lib.h +++ b/lib.h @@ -106,7 +106,7 @@ struct options { bool async_connect; /* tcp_stream */ -#ifdef WITH_TCPDEVMEM +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) bool tcpd_validate; bool tcpd_rx_cpy; const char *tcpd_nic_pci_addr; @@ -117,7 +117,7 @@ struct options { const char *tcpd_link_name; int queue_start; int queue_num; -#endif /* WITH_TCPDEVMEM */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ bool enable_read; bool enable_write; bool enable_tcp_maerts; diff --git a/multi_neper.py b/multi_neper.py index 1ee87f0..c2af306 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -53,13 +53,19 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, tcpd_validate, tcpd_rx_cpy)->tuple: cmd = (f"taskset --cpu-list {cpu_list} {neper_dir}/tcp_stream" - f" -T {threads} -F {flows} --tcpd-phys-len {phys_len}" + f" -T {threads} -F {flows}" f" --port {port} --source-port {source_port}" f" --control-port {control_port}" - f" --buffer-size {buffer_size} --tcpd-nic-pci-addr {nic_pci}" - f" --tcpd-gpu-pci-addr {gpu_pci} -l {length}" + f" --buffer-size {buffer_size} " + f" -l {length}" f" --num-ports {flows}") + if phys_len: + cmd += f" --tcpd-phys-len {phys_len}" + if nic_pci: + cmd += f" --tcpd-nic-pci-addr {nic_pci}" + if gpu_pci: + cmd += f" --tcpd-gpu-pci-addr {gpu_pci}" if tcpd_validate: cmd += " --tcpd-validate" @@ -145,6 +151,7 @@ def parse_subprocess_outputs(subprocesses): parser.add_argument("-l", "--length", default=10) parser.add_argument("--log", default="WARNING") + parser.add_argument("-m", "--mode", default="cuda", help="cuda|udma|default") args = parser.parse_args() @@ -169,8 +176,12 @@ def 
parse_subprocess_outputs(subprocesses): is_client = args.client for i, dev in enumerate(devices): - nic_pci = link_to_nic_pci_addr[dev] - gpu_pci = link_to_gpu_pci_addr[dev] + nic_pci, gpu_pci = None, None + + if args.mode.lower() in ["cuda", "udma"]: + nic_pci = link_to_nic_pci_addr[dev] + if args.mode.lower() == "cuda": + gpu_pci = link_to_gpu_pci_addr[dev] # increment control port by 1, and src/dst ports by flow_count # for each additional link we're running Neper on diff --git a/socket.c b/socket.c index a8bd91f..3bfc198 100644 --- a/socket.c +++ b/socket.c @@ -18,7 +18,10 @@ #include "flow.h" #include "socket.h" #include "thread.h" -#ifdef WITH_TCPDEVMEM +#ifdef WITH_TCPDEVMEM_CUDA +#include "tcpdevmem_cuda.h" +#endif +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) #include "tcpdevmem.h" #endif @@ -70,7 +73,7 @@ static void socket_init_not_established(struct thread *t, int s) if (err) PLOG_ERROR(t->cb, "setsockopt(SO_LINGER)"); } -#ifdef WITH_TCPDEVMEM +#ifdef WITH_TCPDEVMEM_CUDA if (!t->f_mbuf && opts->tcpd_gpu_pci_addr) { if (tcpd_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { LOG_FATAL(t->cb, "%s: failed to setup devmem CUDA socket", @@ -78,9 +81,11 @@ static void socket_init_not_established(struct thread *t, int s) exit(1); } } +#endif /* WITH_TCPDEVMEM_CUDA */ +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) if (opts->tcpd_nic_pci_addr) tcpd_setup_socket(s); -#endif /* WITH_TCPDEVMEM */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ } /* @@ -254,7 +259,7 @@ void socket_listen(struct thread *t) struct addrinfo *ai = getaddrinfo_or_die(opts->host, opts->port, &hints, cb); int port = atoi(opts->port); -#ifdef WITH_TCPDEVMEM +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) /* TCP Devmem: * Since each thread has a CUDA buffer, and * flow-steering rules are required, threads, TCP connections, and @@ -266,11 +271,11 @@ void socket_listen(struct thread *t) * thread_1/flow_1 listen on x+1 -> 
thread_1->f_mbuf * etc... */ - if (opts->tcpd_gpu_pci_addr) { + if (opts->tcpd_nic_pci_addr) { port += t->index; reset_port(ai, port, cb); } -#endif +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ int i, n, s; @@ -287,7 +292,7 @@ void socket_listen(struct thread *t) switch (ai->ai_socktype) { case SOCK_STREAM: n = opts->num_ports ? opts->num_ports : 1; -#ifdef WITH_TCPDEVMEM +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) /* TCP Devmem: * See TCP Devmem comment above^ * @@ -295,9 +300,9 @@ void socket_listen(struct thread *t) * listens on a port that's 1 larger than the previous thread's * port. */ - if (opts->tcpd_gpu_pci_addr) + if (opts->tcpd_nic_pci_addr) n = 1; -#endif /* WITH_TCPDEVMEM */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ for (i = 0; i < n; i++) { s = socket_bind_listener(t, ai); socket_init_not_established(t, s); diff --git a/stream.c b/stream.c index be7c259..4f5011f 100644 --- a/stream.c +++ b/stream.c @@ -23,15 +23,18 @@ #include "socket.h" #include "stats.h" #include "thread.h" -#ifdef WITH_TCPDEVMEM -#include "tcpdevmem.h" +#ifdef WITH_TCPDEVMEM_CUDA +#include "tcpdevmem_cuda.h" +#endif +#ifdef WITH_TCPDEVMEM_UDMA +#include "tcpdevmem_udma.h" #endif static void *stream_alloc(struct thread *t) { const struct options *opts = t->opts; -#ifdef WITH_TCPDEVMEM +#ifdef WITH_TCPDEVMEM_CUDA if (!t->f_mbuf && t->opts->tcpd_gpu_pci_addr) { if (tcpd_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { LOG_FATAL(t->cb, "%s: failed to setup devmem CUDA socket", @@ -39,15 +42,16 @@ static void *stream_alloc(struct thread *t) exit(1); } } - +#endif /* WITH_TCPDEVMEM_CUDA */ +#ifdef WITH_TCPDEVMEM_UDMA if (!t->f_mbuf && t->opts->tcpd_nic_pci_addr) { - if (udmabuf_setup_alloc(t->opts, &t->f_mbuf)) { + if (udma_setup_alloc(t->opts, &t->f_mbuf, t)) { LOG_FATAL(t->cb, "%s: failed to setup devmem UDMABUF socket", __func__); exit(1); } } -#endif +#endif /* WITH_TCPDEVMEM_UDMA */ if (!t->f_mbuf) { t->f_mbuf = 
malloc_or_die(opts->buffer_size, t->cb); @@ -106,14 +110,21 @@ void stream_handler(struct flow *f, uint32_t events) if (events & EPOLLIN) do { do { -#ifdef WITH_TCPDEVMEM - if (t->opts->tcpd_nic_pci_addr) +#ifdef WITH_TCPDEVMEM_CUDA + if (t->opts->tcpd_gpu_pci_addr) n = tcpd_recv(fd, mbuf, opts->buffer_size, opts->recv_flags, t); else -#endif +#endif /* WITH_TCPDEVMEM_CUDA */ +#ifdef WITH_TCPDEVMEM_UDMA + if (t->opts->tcpd_nic_pci_addr) + n = udma_recv(fd, mbuf, + opts->buffer_size, + t); + else +#endif /* WITH_TCPDEVMEM_UDMA */ n = recv(fd, mbuf, opts->buffer_size, opts->recv_flags); } while(n == -1 && errno == EINTR); @@ -131,14 +142,17 @@ void stream_handler(struct flow *f, uint32_t events) if (events & EPOLLOUT) do { -#ifdef WITH_TCPDEVMEM +#ifdef WITH_TCPDEVMEM_CUDA if (t->opts->tcpd_gpu_pci_addr) { n = tcpd_send(fd, mbuf, opts->buffer_size, opts->send_flags); - }else if (t->opts->tcpd_nic_pci_addr) { - n = tcpd_udma_send(fd, mbuf, + } else +#endif /* WITH_TCPDEVMEM_CUDA */ +#ifdef WITH_TCPDEVMEM_UDMA + if (t->opts->tcpd_nic_pci_addr) { + n = udma_send(fd, mbuf, opts->buffer_size, opts->send_flags); } else -#endif +#endif /* WITH_TCPDEVMEM_UDMA */ n = send(fd, mbuf, opts->buffer_size, opts->send_flags); if (n == -1) { if (errno != EAGAIN) diff --git a/tcpdevmem.c b/tcpdevmem.c new file mode 100644 index 0000000..80f49c8 --- /dev/null +++ b/tcpdevmem.c @@ -0,0 +1,75 @@ +#include "flow.h" +#include "lib.h" +#include "logging.h" +#include "tcpdevmem_cuda.h" +#include "tcpdevmem.h" +#include "thread.h" + +#define TEST_PREFIX "ncdevmem_common" + +int install_flow_steering(const struct options *opts, intptr_t buf, + struct thread *t) +{ + int q_start = opts->queue_start; + int q_num = opts->queue_num; + int ret; + + int num_queues = q_start + (t->index % q_num); + printf("Bind to queue %i\n", num_queues); + struct dma_buf_pages_bind_rx_queue bind_cmd; + + strcpy(bind_cmd.ifname, opts->tcpd_link_name); + bind_cmd.rxq_idx = num_queues; + + ret = ioctl(buf, 
DMA_BUF_PAGES_BIND_RX, &bind_cmd); + if (ret < 0) + { + printf("%s: [FAIL, bind fail queue=%d]\n", TEST_PREFIX, + num_queues); + exit(78); + } + + /* using t->index below requires 1 thread listening to 1 port + * (see relevant comments in socket.c) + */ + int src_port = t->index + opts->source_port; + int dst_port = t->index + atoi(opts->port); + + char flow_steer_cmd[512]; + sprintf(flow_steer_cmd, + "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue %i", + opts->tcpd_link_name, opts->tcpd_src_ip, opts->tcpd_dst_ip, + src_port, dst_port, num_queues); + ret = system(flow_steer_cmd); + + // only running the below ethtool commands after last thread/flow is setup + if (t->index == opts->num_flows - 1) + { + char ethtool_cmd[512]; + sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpd_link_name); + ret = ret | system(ethtool_cmd); + + sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-header-split on", opts->tcpd_link_name); + ret = ret | system(ethtool_cmd); + + sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpd_link_name); + ret = ret | system(ethtool_cmd); + + printf("ethtool cmds returned %i, sleeping 1...\n", ret); + sleep(1); + } + return ret; +} + +int tcpd_setup_socket(int socket) +{ + const int one = 1; + if (setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) || + setsockopt(socket, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) || + setsockopt(socket, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one))) + { + perror("tcpd_setup_socket"); + exit(EXIT_FAILURE); + } + return 0; +} diff --git a/tcpdevmem.h b/tcpdevmem.h index f34c5a4..a1d2b17 100644 --- a/tcpdevmem.h +++ b/tcpdevmem.h @@ -1,52 +1,71 @@ -#ifndef THIRD_PARTY_NEPER_DEVMEM_H_ -#define THIRD_PARTY_NEPER_DEVMEM_H_ +#include +#include +#include +#include -#if __cplusplus -extern "C" { +#include "thread.h" +#include "lib.h" + +#define PAGE_SHIFT (12) +#define PAGE_SIZE (1 << PAGE_SHIFT) + +#define 
MSG_SOCK_DEVMEM 0x2000000 +#define SO_DEVMEM_DONTNEED 97 +#define SO_DEVMEM_HEADER 98 +#define SCM_DEVMEM_HEADER SO_DEVMEM_HEADER +#define SO_DEVMEM_OFFSET 99 +#define SCM_DEVMEM_OFFSET SO_DEVMEM_OFFSET + +#define DMA_BUF_CREATE_PAGES \ + _IOW(DMA_BUF_BASE, 2, struct dma_buf_create_pages_info) + +/* GRTE libraries from google3 already define the following */ +#ifndef F_SEAL_SHRINK +#define F_SEAL_SHRINK 2U +#endif +#ifndef F_ADD_SEALS +#define F_ADD_SEALS 1033U +#endif +#ifndef F_GET_SEALS +#define F_GET_SEALS 1034U #endif -#include +struct dma_buf_create_pages_info +{ + __u64 pci_bdf[3]; + __s32 dma_buf_fd; + __s32 create_page_pool; +}; -#include "common.h" -#include "flags.h" -#include "lib.h" +struct dma_buf_pages_bind_rx_queue +{ + char ifname[IFNAMSIZ]; + __u32 rxq_idx; +}; -struct tcpdevmem_udma_mbuf { - struct msghdr msg; - int dmabuf_fd; - int pages_fd; +#define DMA_BUF_PAGES_BIND_RX \ + _IOW(DMA_BUF_BASE, 3, struct dma_buf_pages_bind_rx_queue) - int devfd; - int memfd; - int buf; - int buf_pages; +// devmemvec represents a fragment of payload that is received on the socket. +struct devmemvec +{ + // frag_offset is the offset in the registered memory. + __u32 frag_offset; + // frag size is the size of the payload. + __u32 frag_size; + // frag_token is an identifier for this fragment and it can be used to return + // the memory back to kernel. + __u32 frag_token; }; -struct tcpdevmem_cuda_mbuf { - int gpu_mem_fd_; - int dma_buf_fd_; - void *gpu_gen_mem_; - void *gpu_rx_mem_; - void *gpu_scatter_list_; - void *scattered_data_; - void *rx_blks_; - void *cpy_buffer; - size_t bytes_received; - size_t bytes_sent; - void *tokens; - void *vectors; +// devmemtoken represents a range of tokens. It is used to return the fragment +// memory back to the kernel. 
+struct devmemtoken +{ + __u32 token_start; + __u32 token_count; }; +int install_flow_steering(const struct options *opts, intptr_t buf, + struct thread *t); int tcpd_setup_socket(int socket); -int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t); -int cuda_flow_cleanup(void *f_mbuf); -int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf); -int tcpd_send(int socket, void *buf, size_t n, int flags); -int tcpd_udma_send(int fd, void *buf, size_t n, int flags); -int tcpd_recv(int fd, void *f_mbuf, size_t n, int flags, struct thread *t); - -#if __cplusplus -} -#endif - -#endif // THIRD_PARTY_NEPER_DEVMEM_H_ diff --git a/tcpdevmem.cu b/tcpdevmem_cuda.cu similarity index 59% rename from tcpdevmem.cu rename to tcpdevmem_cuda.cu index e77e9f7..4766b20 100644 --- a/tcpdevmem.cu +++ b/tcpdevmem_cuda.cu @@ -1,4 +1,3 @@ -#ifdef WITH_TCPDEVMEM #include #include @@ -20,13 +19,19 @@ #include #include +#if __cplusplus +extern "C" { +#endif + +#include "common.h" +#include "tcpdevmem_cuda.h" #include "tcpdevmem.h" #include "logging.h" #include "flow.h" #include "thread.h" -#ifndef MSG_ZEROCOPY -#define MSG_ZEROCOPY 0x4000000 +#if __cplusplus +} #endif #define LAST_PRIME 111 @@ -34,91 +39,18 @@ #define MIN_RX_BUFFER_TOTAL_SIZE (1 << 28) #define GPUMEM_ALIGNMENT (1UL << 21) #define GPUMEM_MINSZ 0x400000 -#define PAGE_SHIFT (12) -#define PAGE_SIZE (1 << PAGE_SHIFT) #define multiplier (1 << 16) #define TEST_PREFIX "ncdevmem" #define NUM_PAGES 16000 -/* missing definitions in mman-linux.h */ -#ifndef MFD_ALLOW_SEALING -#define MFD_ALLOW_SEALING 2U -#endif - -/* GRTE libraries from google3 already define the following */ -#ifndef F_SEAL_SHRINK -#define F_SEAL_SHRINK 2U -#endif -#ifndef F_ADD_SEALS -#define F_ADD_SEALS 1033U -#endif -#ifndef F_GET_SEALS -#define F_GET_SEALS 1034U -#endif - -#define MSG_SOCK_DEVMEM 0x2000000 -#define SO_DEVMEM_DONTNEED 97 -#define SO_DEVMEM_HEADER 98 -#define SCM_DEVMEM_HEADER SO_DEVMEM_HEADER -#define 
SO_DEVMEM_OFFSET 99 -#define SCM_DEVMEM_OFFSET SO_DEVMEM_OFFSET - -struct dma_buf_create_pages_info { - __u64 pci_bdf[3]; - __s32 dma_buf_fd; - __s32 create_page_pool; -}; - -struct dma_buf_pages_bind_rx_queue { - char ifname[IFNAMSIZ]; - __u32 rxq_idx; -}; - -#define DMA_BUF_CREATE_PAGES \ - _IOW(DMA_BUF_BASE, 2, struct dma_buf_create_pages_info) - -#define DMA_BUF_PAGES_BIND_RX \ - _IOW(DMA_BUF_BASE, 3, struct dma_buf_pages_bind_rx_queue) - -// devmemvec represents a fragment of payload that is received on the socket. -struct devmemvec { - // frag_offset is the offset in the registered memory. - __u32 frag_offset; - // frag size is the size of the payload. - __u32 frag_size; - // frag_token is an identifier for this fragment and it can be used to return - // the memory back to kernel. - __u32 frag_token; -}; - -// devmemtoken represents a range of tokens. It is used to return the fragment -// memory back to the kernel. -struct devmemtoken { - __u32 token_start; - __u32 token_count; -}; - struct TcpdRxBlock { uint64_t gpu_offset; size_t size; uint64_t paddr; }; -struct udmabuf_create { - uint32_t memfd; - uint32_t flags; - uint64_t offset; - uint64_t size; -}; -#define UDMABUF_CREATE _IOW('u', 0x42, struct udmabuf_create) - -int memfd_create(const char *name, unsigned int flags) -{ - return syscall(__NR_memfd_create, name, flags); -} - /* Fills buf of size n with a repeating sequence of 1 to 111 inclusive */ void fill_tx_buffer(void *buf, size_t n) { @@ -184,19 +116,6 @@ void gather_rx_data(struct tcpdevmem_cuda_mbuf *tmbuf) { (long3*)gpu_scatter_list_, (uint8_t*)gpu_rx_mem_, (uint8_t*)rx_buff_); } -int tcpd_setup_socket(int socket) { - const int one = 1; - if (setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) - || setsockopt(socket, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) - || setsockopt(socket, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) - ) { - perror("tcpd_setup_socket"); - exit(EXIT_FAILURE); - } - - return 0; -} - int 
get_gpumem_dmabuf_pages_fd(const std::string& gpu_pci_addr, const std::string& nic_pci_addr, void* gpu_mem, size_t gpu_mem_sz, int* dma_buf_fd, bool is_client) { @@ -249,8 +168,8 @@ int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thre void *gpu_gen_mem_; int gpu_mem_fd_; int dma_buf_fd_; - int q_start = opts->queue_start; - int q_num = opts->queue_num; + // int q_start = opts->queue_start; + // int q_num = opts->queue_num; struct tcpdevmem_cuda_mbuf *tmbuf; const char *gpu_pci_addr = opts->tcpd_gpu_pci_addr; const char *nic_pci_addr = opts->tcpd_nic_pci_addr; @@ -286,43 +205,45 @@ int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thre } if (!is_client) { - int num_queues = q_start + (t->index % q_num); - printf("Bind to queue %i\n", num_queues); - struct dma_buf_pages_bind_rx_queue bind_cmd; - - strcpy(bind_cmd.ifname, opts->tcpd_link_name); - bind_cmd.rxq_idx = num_queues; - - ret = ioctl(gpu_mem_fd_, DMA_BUF_PAGES_BIND_RX, &bind_cmd); - if (ret < 0) { - printf("%s: [FAIL, bind fail queue=%d]\n", TEST_PREFIX, - num_queues); - exit(78); - } - - // copied from socket.c#socket_connect_one() - int flow_idx = (t->flow_first + t->flow_count); - int src_port = flow_idx + opts->source_port; - int dst_port = flow_idx + atoi(opts->port); - - char flow_steer_cmd[512]; - sprintf(flow_steer_cmd, - "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue %i", - opts->tcpd_link_name, opts->tcpd_src_ip, opts->tcpd_dst_ip, src_port, dst_port, num_queues); - ret = system(flow_steer_cmd); - - // only running the below ethtool commands after last thread/flow is setup - if (flow_idx + t->flow_limit >= opts->num_flows) { - char ethtool_cmd[512]; - sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpd_link_name); - ret = ret | system(ethtool_cmd); - sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-header-split on", opts->tcpd_link_name); - ret = ret | 
system(ethtool_cmd); - sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpd_link_name); - ret = ret | system(ethtool_cmd); - printf("ethtool cmds returned %i, sleeping 1...\n", ret); - sleep(1); - } + install_flow_steering(opts, (int)gpu_mem_fd_, t); + + // int num_queues = q_start + (t->index % q_num); + // printf("Bind to queue %i\n", num_queues); + // struct dma_buf_pages_bind_rx_queue bind_cmd; + + // strcpy(bind_cmd.ifname, opts->tcpd_link_name); + // bind_cmd.rxq_idx = num_queues; + + // ret = ioctl(gpu_mem_fd_, DMA_BUF_PAGES_BIND_RX, &bind_cmd); + // if (ret < 0) { + // printf("%s: [FAIL, bind fail queue=%d]\n", TEST_PREFIX, + // num_queues); + // exit(78); + // } + + // // copied from socket.c#socket_connect_one() + // int flow_idx = (t->flow_first + t->flow_count); + // int src_port = flow_idx + opts->source_port; + // int dst_port = flow_idx + atoi(opts->port); + + // char flow_steer_cmd[512]; + // sprintf(flow_steer_cmd, + // "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue %i", + // opts->tcpd_link_name, opts->tcpd_src_ip, opts->tcpd_dst_ip, src_port, dst_port, num_queues); + // ret = system(flow_steer_cmd); + + // // only running the below ethtool commands after last thread/flow is setup + // if (flow_idx + t->flow_limit >= opts->num_flows) { + // char ethtool_cmd[512]; + // sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpd_link_name); + // ret = ret | system(ethtool_cmd); + // sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-header-split on", opts->tcpd_link_name); + // ret = ret | system(ethtool_cmd); + // sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpd_link_name); + // ret = ret | system(ethtool_cmd); + // printf("ethtool cmds returned %i, sleeping 1...\n", ret); + // sleep(1); + // } } *f_mbuf = tmbuf; @@ -342,186 +263,6 @@ int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thre return 0; } -int 
udmabuf_setup_alloc(const struct options *opts, void **f_mbuf) { - bool is_client = opts->client; - int devfd; - int memfd; - int buf; - int buf_pages; - int ret; - size_t size = opts->tcpd_phys_len; - - struct tcpdevmem_udma_mbuf *tmbuf; - struct dma_buf_create_pages_info pages_create_info; - struct udmabuf_create create; - - if (f_mbuf == NULL) return ENOMEM; - - if (*f_mbuf) return 0; - - tmbuf = (struct tcpdevmem_udma_mbuf *)calloc(1, sizeof(struct tcpdevmem_udma_mbuf)); - if (!tmbuf) { - exit(EXIT_FAILURE); - } - - devfd = open("/dev/udmabuf", O_RDWR); - if (devfd < 0) { - printf("%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", - TEST_PREFIX); - exit(70); - } - - memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); - if (memfd < 0) { - printf("%s: [skip,no-memfd]\n", TEST_PREFIX); - exit(72); - } - - ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK); - if (ret < 0) { - printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); - exit(73); - } - - ret = ftruncate(memfd, size); - if (ret == -1) { - printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); - exit(74); - } - - memset(&create, 0, sizeof(create)); - - create.memfd = memfd; - create.offset = 0; - create.size = size; - printf("size=%lu\n", size); - buf = ioctl(devfd, UDMABUF_CREATE, &create); - if (buf < 0) { - printf("%s: [FAIL, create udmabuf]\n", TEST_PREFIX); - exit(75); - } - - pages_create_info.dma_buf_fd = buf; - pages_create_info.create_page_pool = is_client ? 
0 : 1; - - ret = sscanf(opts->tcpd_nic_pci_addr, "0000:%llx:%llx.%llx", - &pages_create_info.pci_bdf[0], - &pages_create_info.pci_bdf[1], - &pages_create_info.pci_bdf[2]); - - if (ret != 3) { - printf("%s: [FAIL, parse fail]\n", TEST_PREFIX); - exit(76); - } - - buf_pages = ioctl(buf, DMA_BUF_CREATE_PAGES, &pages_create_info); - if (buf_pages < 0) { - perror("ioctl DMA_BUF_CREATE_PAGES: [FAIL, create pages fail]\n"); - exit(77); - } - - if (!is_client) { - /* TODO hardcoded num_queues */ - int num_queues = 15; - struct dma_buf_pages_bind_rx_queue bind_cmd; - - strcpy(bind_cmd.ifname, "eth1"); - bind_cmd.rxq_idx = num_queues; - - ret = ioctl(buf_pages, DMA_BUF_PAGES_BIND_RX, &bind_cmd); - if (ret < 0) { - printf("%s: [FAIL, bind fail queue=%d]\n", TEST_PREFIX, - num_queues); - exit(78); - } - - system("ethtool --set-priv-flags eth1 enable-header-split on"); - system("ethtool --set-priv-flags eth1 enable-header-split off"); - system("ethtool --set-priv-flags eth1 enable-header-split on"); - sleep(1); - printf("toggled header-split\n"); - } - - struct dma_buf_sync sync = { 0 }; - sync.flags = DMA_BUF_SYNC_WRITE | DMA_BUF_SYNC_START; - ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); - - *f_mbuf = tmbuf; - - tmbuf->devfd = devfd; - tmbuf->memfd = memfd; - tmbuf->buf = buf; - tmbuf->buf_pages = buf_pages; - return 0; -} - -int tcpd_udma_send(int socket, void *f_mbuf, size_t n, int flags) { - int buf_pages, buf; - struct iovec iov; - struct msghdr *msg; - struct cmsghdr *cmsg; - char buf_dummy[n]; - char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; - struct tcpdevmem_udma_mbuf *tmbuf; - - if (!f_mbuf) return -1; - - tmbuf = (struct tcpdevmem_udma_mbuf *)f_mbuf; - buf_pages = tmbuf->buf_pages; - buf = tmbuf->buf; - msg = &tmbuf->msg; - - struct dma_buf_sync sync = { 0 }; - sync.flags = DMA_BUF_SYNC_WRITE | DMA_BUF_SYNC_START; - ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); - - char *buf_mem = NULL; - buf_mem = (char *)mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_SHARED, buf, 0); - if 
(buf_mem == MAP_FAILED) { - perror("mmap()"); - exit(1); - } - - memcpy(buf_mem, buf_dummy, n); - - sync.flags = DMA_BUF_SYNC_WRITE | DMA_BUF_SYNC_END; - ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); - - munmap(buf_mem, n); - - memset(msg, 0, sizeof(struct msghdr)); - // memset(cmsg, 0, sizeof(struct cmsghdr)); - - iov.iov_base = buf_dummy; - iov.iov_len = n; - - msg->msg_iov = &iov; - msg->msg_iovlen = 1; - - msg->msg_control = offsetbuf; - msg->msg_controllen = sizeof(offsetbuf); - - cmsg = CMSG_FIRSTHDR(msg); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_DEVMEM_OFFSET; - cmsg->cmsg_len = CMSG_LEN(sizeof(int) * 2); - *((int*)CMSG_DATA(cmsg)) = buf_pages; - ((int*)CMSG_DATA(cmsg))[1] = 0; - - ssize_t bytes_sent = sendmsg(socket, msg, MSG_ZEROCOPY); - if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) { - perror("sendmsg() error: "); - exit(EXIT_FAILURE); - } - - if (bytes_sent == 0) { - perror("sendmsg() sent 0 bytes. Something is wrong.\n"); - exit(EXIT_FAILURE); - } - - return bytes_sent; -} - int tcpd_send(int socket, void *buf, size_t n, int flags) { int gpu_mem_fd_; struct iovec iov; @@ -612,21 +353,22 @@ int tcpd_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { msg->msg_control = offsetbuf; msg->msg_controllen = sizeof(offsetbuf); - // char *buf_mem = NULL; - - if (msg->msg_flags & MSG_CTRUNC) { - printf("fatal, cmsg truncated, current msg_controllen\n"); - } - rx_blks_->clear(); ssize_t received = recvmsg(socket, msg, MSG_SOCK_DEVMEM | MSG_DONTWAIT); if (received < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { + printf("%s %d: recvmsg returned < 0\n", __func__, __LINE__); + return -1; } else if (received < 0) { printf("%s %d\n", __func__, __LINE__); return -1; } else if (received == 0) { printf("Client exited\n"); + return -1; + } + + if (msg->msg_flags & MSG_CTRUNC) { + LOG_ERROR(t->cb, "fatal, cmsg truncated, current msg_controllen"); } struct cmsghdr *cm = NULL; @@ -751,4 +493,3 @@ int cuda_flow_cleanup(void 
*f_mbuf) { free(t_mbuf->scattered_data_); return 0; } -#endif /* #ifdef WITH_TCPDEVMEM */ diff --git a/tcpdevmem_cuda.h b/tcpdevmem_cuda.h new file mode 100644 index 0000000..9f67636 --- /dev/null +++ b/tcpdevmem_cuda.h @@ -0,0 +1,41 @@ +#ifndef THIRD_PARTY_NEPER_DEVMEM_H_ +#define THIRD_PARTY_NEPER_DEVMEM_H_ + +#if __cplusplus +extern "C" { +#endif + +#include + +#include +#include + +#include "common.h" +#include "flags.h" +#include "lib.h" + +struct tcpdevmem_cuda_mbuf { + int gpu_mem_fd_; + int dma_buf_fd_; + void *gpu_gen_mem_; + void *gpu_rx_mem_; + void *gpu_scatter_list_; + void *scattered_data_; + void *rx_blks_; + void *cpy_buffer; + size_t bytes_received; + size_t bytes_sent; + void *tokens; + void *vectors; +}; + +int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t); +int cuda_flow_cleanup(void *f_mbuf); +int tcpd_send(int socket, void *buf, size_t n, int flags); +int tcpd_recv(int fd, void *f_mbuf, size_t n, int flags, struct thread *t); + +#if __cplusplus +} +#endif + +#endif // THIRD_PARTY_NEPER_DEVMEM_H_ diff --git a/tcpdevmem_udma.c b/tcpdevmem_udma.c new file mode 100644 index 0000000..df33e11 --- /dev/null +++ b/tcpdevmem_udma.c @@ -0,0 +1,305 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "flow.h" +#include "lib.h" +#include "logging.h" +#include "tcpdevmem.h" +#include "tcpdevmem_udma.h" +#include "thread.h" + +#define TEST_PREFIX "ncdevmem_udma" + +int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) +{ + bool is_client = opts->client; + int devfd; + int memfd; + int buf; + int buf_pages; + int ret; + size_t size = opts->tcpd_phys_len; + + struct tcpdevmem_udma_mbuf *tmbuf; + struct dma_buf_create_pages_info pages_create_info; + struct udmabuf_create create; + + if (f_mbuf == NULL) + return ENOMEM; + + if (*f_mbuf) + return 0; + + tmbuf = (struct tcpdevmem_udma_mbuf *)calloc(1, sizeof(struct tcpdevmem_udma_mbuf)); + if (!tmbuf) + { + 
exit(EXIT_FAILURE); + } + + devfd = open("/dev/udmabuf", O_RDWR); + if (devfd < 0) + { + printf("%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", + TEST_PREFIX); + exit(70); + } + + memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); + if (memfd < 0) + { + printf("%s: [skip,no-memfd]\n", TEST_PREFIX); + exit(72); + } + + ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK); + if (ret < 0) + { + printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); + exit(73); + } + + ret = ftruncate(memfd, size); + if (ret == -1) + { + printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); + exit(74); + } + + memset(&create, 0, sizeof(create)); + + create.memfd = memfd; + create.offset = 0; + create.size = size; + printf("size=%lu\n", size); + buf = ioctl(devfd, UDMABUF_CREATE, &create); + if (buf < 0) + { + printf("%s: [FAIL, create udmabuf] %i\n", TEST_PREFIX, buf); + exit(75); + } + + pages_create_info.dma_buf_fd = buf; + pages_create_info.create_page_pool = is_client ? 0 : 1; + + ret = sscanf(opts->tcpd_nic_pci_addr, "0000:%llx:%llx.%llx", + &pages_create_info.pci_bdf[0], + &pages_create_info.pci_bdf[1], + &pages_create_info.pci_bdf[2]); + + if (ret != 3) + { + printf("%s: [FAIL, parse fail]\n", TEST_PREFIX); + exit(76); + } + + buf_pages = ioctl(buf, DMA_BUF_CREATE_PAGES, &pages_create_info); + if (buf_pages < 0) + { + perror("ioctl DMA_BUF_CREATE_PAGES: [FAIL, create pages fail]\n"); + exit(77); + } + + if (!is_client) + install_flow_steering(opts, buf_pages, t); + + struct dma_buf_sync sync = {0}; + sync.flags = DMA_BUF_SYNC_WRITE | DMA_BUF_SYNC_START; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + *f_mbuf = tmbuf; + + tmbuf->devfd = devfd; + tmbuf->memfd = memfd; + tmbuf->buf = buf; + tmbuf->buf_pages = buf_pages; + tmbuf->bytes_sent = 0; + return 0; +} + +int udma_send(int socket, void *f_mbuf, size_t n, int flags) +{ + int buf_pages, buf; + struct iovec iov; + struct msghdr *msg; + struct cmsghdr *cmsg; + char buf_dummy[n]; + char 
offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; + struct tcpdevmem_udma_mbuf *tmbuf; + + if (!f_mbuf) + return -1; + + tmbuf = (struct tcpdevmem_udma_mbuf *)f_mbuf; + buf_pages = tmbuf->buf_pages; + buf = tmbuf->buf; + msg = &tmbuf->msg; + + struct dma_buf_sync sync = {0}; + sync.flags = DMA_BUF_SYNC_WRITE | DMA_BUF_SYNC_START; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + char *buf_mem = NULL; + buf_mem = (char *)mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_SHARED, buf, 0); + if (buf_mem == MAP_FAILED) + { + perror("mmap()"); + exit(1); + } + + memcpy(buf_mem, buf_dummy, n); + + sync.flags = DMA_BUF_SYNC_WRITE | DMA_BUF_SYNC_END; + ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + + munmap(buf_mem, n); + + memset(msg, 0, sizeof(struct msghdr)); + // memset(cmsg, 0, sizeof(struct cmsghdr)); + + iov.iov_base = buf_dummy; + iov.iov_len = n - tmbuf->bytes_sent; + + msg->msg_iov = &iov; + msg->msg_iovlen = 1; + + msg->msg_control = offsetbuf; + msg->msg_controllen = sizeof(offsetbuf); + + cmsg = CMSG_FIRSTHDR(msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_OFFSET; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * 2); + *((int *)CMSG_DATA(cmsg)) = buf_pages; + ((int *)CMSG_DATA(cmsg))[1] = (int)tmbuf->bytes_sent; + + ssize_t bytes_sent = sendmsg(socket, msg, MSG_ZEROCOPY); + if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) + { + perror("sendmsg() error: "); + exit(EXIT_FAILURE); + } + + if (bytes_sent == 0) + { + perror("sendmsg() sent 0 bytes. 
Something is wrong.\n"); + exit(EXIT_FAILURE); + } + + tmbuf->bytes_sent += bytes_sent; + if (tmbuf->bytes_sent == n) + tmbuf->bytes_sent = 0; + + return bytes_sent; +} + +int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) +{ + struct tcpdevmem_udma_mbuf *tmbuf = (struct tcpdevmem_udma_mbuf *)f_mbuf; + bool is_devmem = false; + size_t total_received = 0; + size_t page_aligned_frags = 0; + size_t non_page_aligned_frags = 0; + unsigned long flow_steering_flakes = 0; + + char iobuf[819200]; + char ctrl_data[sizeof(int) * 20000]; + + struct msghdr msg = {0}; + struct iovec iov = {.iov_base = iobuf, + .iov_len = sizeof(iobuf)}; + + if (!f_mbuf) + return -1; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + ssize_t ret = recvmsg(socket, &msg, MSG_SOCK_DEVMEM); + printf("recvmsg ret=%lu\n", ret); + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + { + return -1; + } + if (ret < 0) + { + perror("recvmsg:"); + exit(1); + } + if (ret == 0) + { + printf("client exited\n"); + return -1; + } + printf("msg_flags=%d\n", msg.msg_flags); + + struct cmsghdr *cm = NULL; + struct devmemvec *devmemvec = NULL; + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) + { + if (cm->cmsg_level != SOL_SOCKET || + (cm->cmsg_type != SCM_DEVMEM_OFFSET && + cm->cmsg_type != SCM_DEVMEM_HEADER)) + { + fprintf(stderr, "found weird cmsg\n"); + continue; + } + is_devmem = true; + + devmemvec = (struct devmemvec *)CMSG_DATA(cm); + + if (cm->cmsg_type == SCM_DEVMEM_HEADER) + { + // TODO: process data copied from skb's linear + // buffer. + fprintf(stderr, + "SCM_DEVMEM_HEADER. 
" + "devmemvec->frag_size=%u\n", + devmemvec->frag_size); + exit(1); + + continue; + } + + struct devmemtoken token = {devmemvec->frag_token, 1}; + + total_received += devmemvec->frag_size; + + if (devmemvec->frag_size % PAGE_SIZE) + non_page_aligned_frags++; + else + page_aligned_frags++; + + struct dma_buf_sync sync = {0}; + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; + ioctl(tmbuf->buf, DMA_BUF_IOCTL_SYNC, &sync); + + sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; + ioctl(tmbuf->buf, DMA_BUF_IOCTL_SYNC, &sync); + + ret = setsockopt(socket, SOL_SOCKET, + SO_DEVMEM_DONTNEED, &token, + sizeof(token)); + if (ret) + { + perror("DONTNEED failed"); + exit(1); + } + } + + if (!is_devmem) + { + flow_steering_flakes++; + is_devmem = false; + total_received += ret; + } + printf("total_received=%lu flow_steering_flakes=%lu\n", + total_received, flow_steering_flakes); + + return total_received; +} diff --git a/tcpdevmem_udma.h b/tcpdevmem_udma.h new file mode 100644 index 0000000..f90b6fa --- /dev/null +++ b/tcpdevmem_udma.h @@ -0,0 +1,37 @@ +#ifndef THIRD_PARTY_NEPER_DEVMEM_UDMA_H_ +#define THIRD_PARTY_NEPER_DEVMEM_UDMA_H_ + +#if __cplusplus +extern "C" { +#endif + +#include + +#include "common.h" +#include "flags.h" +#include "lib.h" + +#define UDMABUF_CREATE _IOW('u', 0x42, struct udmabuf_create) + +struct tcpdevmem_udma_mbuf { + struct msghdr msg; + int dmabuf_fd; + int pages_fd; + + int devfd; + int memfd; + int buf; + int buf_pages; + size_t bytes_sent; +}; + +int udma_setup_alloc(const struct options *opts, void **f_mbuf, + struct thread *t); +int udma_send(int socket, void *f_mbuf, size_t n, int flags); +int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t); + +#if __cplusplus +} +#endif + +#endif // THIRD_PARTY_NEPER_DEVMEM_UDMA_H_ From 70923a634dec596eaa0a741790eb81343b7801c1 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 25 Oct 2023 15:32:12 +0000 Subject: [PATCH 34/72] remove comment block & extraneous macros --- tcpdevmem.h | 11 
----------- tcpdevmem_cuda.cu | 43 ++----------------------------------------- 2 files changed, 2 insertions(+), 52 deletions(-) diff --git a/tcpdevmem.h b/tcpdevmem.h index a1d2b17..141ae52 100644 --- a/tcpdevmem.h +++ b/tcpdevmem.h @@ -19,17 +19,6 @@ #define DMA_BUF_CREATE_PAGES \ _IOW(DMA_BUF_BASE, 2, struct dma_buf_create_pages_info) -/* GRTE libraries from google3 already define the following */ -#ifndef F_SEAL_SHRINK -#define F_SEAL_SHRINK 2U -#endif -#ifndef F_ADD_SEALS -#define F_ADD_SEALS 1033U -#endif -#ifndef F_GET_SEALS -#define F_GET_SEALS 1034U -#endif - struct dma_buf_create_pages_info { __u64 pci_bdf[3]; diff --git a/tcpdevmem_cuda.cu b/tcpdevmem_cuda.cu index 4766b20..cadf72c 100644 --- a/tcpdevmem_cuda.cu +++ b/tcpdevmem_cuda.cu @@ -204,47 +204,8 @@ int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thre exit(71); } - if (!is_client) { - install_flow_steering(opts, (int)gpu_mem_fd_, t); - - // int num_queues = q_start + (t->index % q_num); - // printf("Bind to queue %i\n", num_queues); - // struct dma_buf_pages_bind_rx_queue bind_cmd; - - // strcpy(bind_cmd.ifname, opts->tcpd_link_name); - // bind_cmd.rxq_idx = num_queues; - - // ret = ioctl(gpu_mem_fd_, DMA_BUF_PAGES_BIND_RX, &bind_cmd); - // if (ret < 0) { - // printf("%s: [FAIL, bind fail queue=%d]\n", TEST_PREFIX, - // num_queues); - // exit(78); - // } - - // // copied from socket.c#socket_connect_one() - // int flow_idx = (t->flow_first + t->flow_count); - // int src_port = flow_idx + opts->source_port; - // int dst_port = flow_idx + atoi(opts->port); - - // char flow_steer_cmd[512]; - // sprintf(flow_steer_cmd, - // "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue %i", - // opts->tcpd_link_name, opts->tcpd_src_ip, opts->tcpd_dst_ip, src_port, dst_port, num_queues); - // ret = system(flow_steer_cmd); - - // // only running the below ethtool commands after last thread/flow is setup - // if (flow_idx + t->flow_limit >= opts->num_flows) { 
- // char ethtool_cmd[512]; - // sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpd_link_name); - // ret = ret | system(ethtool_cmd); - // sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-header-split on", opts->tcpd_link_name); - // ret = ret | system(ethtool_cmd); - // sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpd_link_name); - // ret = ret | system(ethtool_cmd); - // printf("ethtool cmds returned %i, sleeping 1...\n", ret); - // sleep(1); - // } - } + if (!is_client) + install_flow_steering(opts, gpu_mem_fd_, t); *f_mbuf = tmbuf; tmbuf->gpu_mem_fd_ = gpu_mem_fd_; From d8c9098db3f06e2b1130a2452057e86be3fb8634 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 25 Oct 2023 15:56:49 +0000 Subject: [PATCH 35/72] removing udma-related printfs --- tcpdevmem_udma.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/tcpdevmem_udma.c b/tcpdevmem_udma.c index df33e11..6b02479 100644 --- a/tcpdevmem_udma.c +++ b/tcpdevmem_udma.c @@ -221,22 +221,20 @@ int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) msg.msg_control = ctrl_data; msg.msg_controllen = sizeof(ctrl_data); ssize_t ret = recvmsg(socket, &msg, MSG_SOCK_DEVMEM); - printf("recvmsg ret=%lu\n", ret); if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { return -1; } if (ret < 0) { - perror("recvmsg:"); + PLOG_FATAL(t->cb, "recvmsg:"); exit(1); } if (ret == 0) { - printf("client exited\n"); + LOG_ERROR(t->cb, "client exited"); return -1; } - printf("msg_flags=%d\n", msg.msg_flags); struct cmsghdr *cm = NULL; struct devmemvec *devmemvec = NULL; @@ -246,7 +244,7 @@ int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) (cm->cmsg_type != SCM_DEVMEM_OFFSET && cm->cmsg_type != SCM_DEVMEM_HEADER)) { - fprintf(stderr, "found weird cmsg\n"); + LOG_ERROR(t->cb, "found weird cmsg"); continue; } is_devmem = true; @@ -257,12 +255,10 @@ int udma_recv(int socket, void *f_mbuf, 
size_t n, struct thread *t) { // TODO: process data copied from skb's linear // buffer. - fprintf(stderr, - "SCM_DEVMEM_HEADER. " - "devmemvec->frag_size=%u\n", - devmemvec->frag_size); + LOG_FATAL(t->cb, + "SCM_DEVMEM_HEADER. devmemvec->frag_size=%u", + devmemvec->frag_size); exit(1); - continue; } @@ -287,7 +283,7 @@ int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) sizeof(token)); if (ret) { - perror("DONTNEED failed"); + PLOG_FATAL(t->cb, "DONTNEED failed"); exit(1); } } @@ -298,8 +294,10 @@ int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) is_devmem = false; total_received += ret; } - printf("total_received=%lu flow_steering_flakes=%lu\n", - total_received, flow_steering_flakes); + if (flow_steering_flakes) { + LOG_WARN(t->cb, "total_received=%lu flow_steering_flakes=%lu", + total_received, flow_steering_flakes); + } return total_received; } From 54808d70e68f649431c3598144a377209ca67263 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Fri, 27 Oct 2023 19:02:13 +0000 Subject: [PATCH 36/72] temporary workaround for malformed devmemvecs seeing frag_sizes > 4096 --- tcpdevmem_cuda.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tcpdevmem_cuda.cu b/tcpdevmem_cuda.cu index cadf72c..eb2e405 100644 --- a/tcpdevmem_cuda.cu +++ b/tcpdevmem_cuda.cu @@ -354,6 +354,9 @@ int tcpd_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { struct devmemtoken token = { devmemvec->frag_token, 1 }; struct TcpdRxBlock blk; + if (devmemvec->frag_size > PAGE_SIZE) + continue; + blk.gpu_offset = (uint64_t)devmemvec->frag_offset; blk.size = devmemvec->frag_size; rx_blks_->emplace_back(blk); From bba962b418b173560829a2517060152a49e2e0ed Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Fri, 27 Oct 2023 21:22:07 +0000 Subject: [PATCH 37/72] Initial Dockerfile --- Dockerfile | 22 ++++++++++++++++++++++ multi_neper.py | 1 + 2 files changed, 23 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file 
mode 100644 index 0000000..391cc3e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +FROM nvidia/cuda:12.0.0-devel-ubuntu20.04 + +ENV DEBIAN_FRONTEND='noninteractive' + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + git openssh-server wget iproute2 vim libopenmpi-dev build-essential cmake gdb \ + protobuf-compiler libprotobuf-dev rsync libssl-dev \ + && rm -rf /var/lib/apt/lists/* + +ARG CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90' +ARG CUDA12_PTX='-gencode=arch=compute_90,code=compute_90' + +WORKDIR /third_party + +RUN git clone -b tcpd https://github.com/google/neper.git +WORKDIR neper +RUN make tcp_stream WITH_TCPDEVMEM_CUDA=1 + +RUN chmod +777 /tmp +RUN apt-get update +RUN apt-get install -y python3 sysstat ethtool diff --git a/multi_neper.py b/multi_neper.py index c2af306..2b3a120 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -81,6 +81,7 @@ def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, cmd += " --tcpd-rx-cpy" env = {"CUDA_VISIBLE_DEVICES": link_to_gpu_index[dev]} + env.update(os.environ.copy()) return (cmd, env) From 51fe161687dc8c96c31abf99ada40eede787a3ea Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 31 Oct 2023 19:35:50 +0000 Subject: [PATCH 38/72] add run_neper script to use with Docker container From 452e9470f54d1cb36d9cb550dea98cdd8219e92d Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 23 Nov 2023 01:08:09 +0000 Subject: [PATCH 39/72] tcpd: build UDMA tcp_stream -static UDMA: allocate udma buffers in socket_init_not_established remove unnecessary imports and #defines --- Makefile | 3 ++- socket.c | 12 ++++++++++++ tcpdevmem.c | 4 ++++ tcpdevmem.h | 51 ------------------------------------------------ tcpdevmem_udma.c | 13 +++++++++++- 5 files changed, 30 insertions(+), 53 deletions(-) diff --git a/Makefile b/Makefile index ba309f4..001721d 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,8 @@ ifdef WITH_TCPDEVMEM_CUDA CFLAGS += -DWITH_TCPDEVMEM_CUDA endif ifdef WITH_TCPDEVMEM_UDMA - 
CFLAGS += -DWITH_TCPDEVMEM_UDMA + CFLAGS += -DWITH_TCPDEVMEM_UDMA -DNDEBUG=1 -static -I ~/cos-kernel/usr/include + LDFLAGS += -static endif ifndef_any_of = $(filter undefined,$(foreach v,$(1),$(origin $(v)))) diff --git a/socket.c b/socket.c index 3bfc198..287d38d 100644 --- a/socket.c +++ b/socket.c @@ -21,6 +21,9 @@ #ifdef WITH_TCPDEVMEM_CUDA #include "tcpdevmem_cuda.h" #endif +#ifdef WITH_TCPDEVMEM_UDMA +#include "tcpdevmem_udma.h" +#endif #if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) #include "tcpdevmem.h" #endif @@ -82,6 +85,15 @@ static void socket_init_not_established(struct thread *t, int s) } } #endif /* WITH_TCPDEVMEM_CUDA */ +#ifdef WITH_TCPDEVMEM_UDMA + if (!t->f_mbuf && opts->tcpd_nic_pci_addr) { + if (udma_setup_alloc(t->opts, &t->f_mbuf, t)) { + LOG_FATAL(t->cb, "%s: failed to setup devmem UDMABUF socket", + __func__); + exit(1); + } + } +#endif /* WITH_TCPDEVMEM_UDMA */ #if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) if (opts->tcpd_nic_pci_addr) tcpd_setup_socket(s); diff --git a/tcpdevmem.c b/tcpdevmem.c index 80f49c8..32d49bf 100644 --- a/tcpdevmem.c +++ b/tcpdevmem.c @@ -1,3 +1,7 @@ +#include +#include +#include + #include "flow.h" #include "lib.h" #include "logging.h" diff --git a/tcpdevmem.h b/tcpdevmem.h index 141ae52..a9268e1 100644 --- a/tcpdevmem.h +++ b/tcpdevmem.h @@ -1,60 +1,9 @@ -#include -#include -#include -#include - #include "thread.h" #include "lib.h" #define PAGE_SHIFT (12) #define PAGE_SIZE (1 << PAGE_SHIFT) -#define MSG_SOCK_DEVMEM 0x2000000 -#define SO_DEVMEM_DONTNEED 97 -#define SO_DEVMEM_HEADER 98 -#define SCM_DEVMEM_HEADER SO_DEVMEM_HEADER -#define SO_DEVMEM_OFFSET 99 -#define SCM_DEVMEM_OFFSET SO_DEVMEM_OFFSET - -#define DMA_BUF_CREATE_PAGES \ - _IOW(DMA_BUF_BASE, 2, struct dma_buf_create_pages_info) - -struct dma_buf_create_pages_info -{ - __u64 pci_bdf[3]; - __s32 dma_buf_fd; - __s32 create_page_pool; -}; - -struct dma_buf_pages_bind_rx_queue -{ - char ifname[IFNAMSIZ]; - __u32 rxq_idx; 
-}; - -#define DMA_BUF_PAGES_BIND_RX \ - _IOW(DMA_BUF_BASE, 3, struct dma_buf_pages_bind_rx_queue) - -// devmemvec represents a fragment of payload that is received on the socket. -struct devmemvec -{ - // frag_offset is the offset in the registered memory. - __u32 frag_offset; - // frag size is the size of the payload. - __u32 frag_size; - // frag_token is an identifier for this fragment and it can be used to return - // the memory back to kernel. - __u32 frag_token; -}; - -// devmemtoken represents a range of tokens. It is used to return the fragment -// memory back to the kernel. -struct devmemtoken -{ - __u32 token_start; - __u32 token_count; -}; - int install_flow_steering(const struct options *opts, intptr_t buf, struct thread *t); int tcpd_setup_socket(int socket); diff --git a/tcpdevmem_udma.c b/tcpdevmem_udma.c index 6b02479..f2b5f42 100644 --- a/tcpdevmem_udma.c +++ b/tcpdevmem_udma.c @@ -1,11 +1,17 @@ -#include +#define __iovec_defined 1 + #include #include #include +#include #include +#include #include #include +#include +#include + #include "flow.h" #include "lib.h" #include "logging.h" @@ -15,6 +21,11 @@ #define TEST_PREFIX "ncdevmem_udma" +#ifndef MSG_SOCK_DEVMEM +#define MSG_SOCK_DEVMEM 0x2000000 /* don't copy devmem pages but return + * them as cmsg instead */ +#endif + int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) { bool is_client = opts->client; From 1afbd7f4773d811e62d8c78533fb10c06d2f7ca3 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 20 Dec 2023 21:33:05 +0000 Subject: [PATCH 40/72] tcpdevmem: build CUDA-version image on workstation Assumes that kernel hdr files are in usr/ folder in your Neper working directory. Changes from git pulling tcpd branch within the container, to copying the source files from current Neper dir into the container, then building with those files. 
--- Dockerfile | 8 ++++++-- Makefile | 4 ++-- tcpdevmem.c | 2 +- tcpdevmem.h | 5 +++++ tcpdevmem_cuda.cu | 3 +++ tcpdevmem_udma.c | 5 ----- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 391cc3e..a537685 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,10 +11,14 @@ RUN apt-get update \ ARG CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90' ARG CUDA12_PTX='-gencode=arch=compute_90,code=compute_90' +# this assumes that kernel hdr files have been copied into ${neper_dir}/usr/, +# which will then be copied into the container +COPY usr/ /kernel-includes/ + WORKDIR /third_party -RUN git clone -b tcpd https://github.com/google/neper.git -WORKDIR neper +COPY ./* ./ +RUN make clean RUN make tcp_stream WITH_TCPDEVMEM_CUDA=1 RUN chmod +777 /tmp diff --git a/Makefile b/Makefile index 001721d..7285432 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ all: binaries CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA ifdef WITH_TCPDEVMEM_CUDA - CFLAGS += -DWITH_TCPDEVMEM_CUDA + CFLAGS += -DWITH_TCPDEVMEM_CUDA -I/kernel-includes/include endif ifdef WITH_TCPDEVMEM_UDMA CFLAGS += -DWITH_TCPDEVMEM_UDMA -DNDEBUG=1 -static -I ~/cos-kernel/usr/include @@ -86,7 +86,7 @@ psp_rr-objs := psp_rr_main.o psp_rr.o rr.o psp_lib.o $(lib) ext-libs := -lm -lrt -lpthread tcpdevmem_cuda.o: tcpdevmem_cuda.cu - nvcc -arch=sm_90 -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM_CUDA -c -o $@ $^ + nvcc -arch=sm_90 -O3 -g -I/kernel-includes/include -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM_CUDA -c -o $@ $^ tcp_rr: $(tcp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) diff --git a/tcpdevmem.c b/tcpdevmem.c index 32d49bf..a445b5d 100644 --- a/tcpdevmem.c +++ b/tcpdevmem.c @@ -9,7 +9,7 @@ #include "tcpdevmem.h" #include "thread.h" -#define TEST_PREFIX "ncdevmem_common" +#define TEST_PREFIX "ncdevmem" int install_flow_steering(const struct options *opts, intptr_t buf, struct thread *t) diff --git a/tcpdevmem.h b/tcpdevmem.h index a9268e1..c59097c 
100644 --- a/tcpdevmem.h +++ b/tcpdevmem.h @@ -4,6 +4,11 @@ #define PAGE_SHIFT (12) #define PAGE_SIZE (1 << PAGE_SHIFT) +#ifndef MSG_SOCK_DEVMEM +#define MSG_SOCK_DEVMEM 0x2000000 /* don't copy devmem pages but return + * them as cmsg instead */ +#endif + int install_flow_steering(const struct options *opts, intptr_t buf, struct thread *t); int tcpd_setup_socket(int socket); diff --git a/tcpdevmem_cuda.cu b/tcpdevmem_cuda.cu index eb2e405..84252c7 100644 --- a/tcpdevmem_cuda.cu +++ b/tcpdevmem_cuda.cu @@ -1,6 +1,9 @@ +#define __iovec_defined 1 + #include #include +#include #include #include #include diff --git a/tcpdevmem_udma.c b/tcpdevmem_udma.c index f2b5f42..88f10c2 100644 --- a/tcpdevmem_udma.c +++ b/tcpdevmem_udma.c @@ -21,11 +21,6 @@ #define TEST_PREFIX "ncdevmem_udma" -#ifndef MSG_SOCK_DEVMEM -#define MSG_SOCK_DEVMEM 0x2000000 /* don't copy devmem pages but return - * them as cmsg instead */ -#endif - int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) { bool is_client = opts->client; From ce1120d056ef13befe093178d87f463574dc8474 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Fri, 22 Dec 2023 18:41:09 +0000 Subject: [PATCH 41/72] tcpd: UDMA reference usr/include folder building UDMA statically can also reference header files found in usr/include remove unreferenced variable ret --- Dockerfile | 9 +++------ Makefile | 6 +++--- tcpdevmem_cuda.cu | 1 - 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index a537685..f3eb11c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,14 +11,11 @@ RUN apt-get update \ ARG CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90' ARG CUDA12_PTX='-gencode=arch=compute_90,code=compute_90' -# this assumes that kernel hdr files have been copied into ${neper_dir}/usr/, -# which will then be copied into the container -COPY usr/ /kernel-includes/ - WORKDIR /third_party -COPY ./* ./ -RUN make clean +# this assumes that kernel hdr files have been copied into 
${neper_dir}/usr/, +# which will then be copied into the container +COPY . /third_party RUN make tcp_stream WITH_TCPDEVMEM_CUDA=1 RUN chmod +777 /tmp diff --git a/Makefile b/Makefile index 7285432..a65eaf8 100644 --- a/Makefile +++ b/Makefile @@ -21,10 +21,10 @@ all: binaries CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA ifdef WITH_TCPDEVMEM_CUDA - CFLAGS += -DWITH_TCPDEVMEM_CUDA -I/kernel-includes/include + CFLAGS += -DWITH_TCPDEVMEM_CUDA -I usr/include endif ifdef WITH_TCPDEVMEM_UDMA - CFLAGS += -DWITH_TCPDEVMEM_UDMA -DNDEBUG=1 -static -I ~/cos-kernel/usr/include + CFLAGS += -DWITH_TCPDEVMEM_UDMA -DNDEBUG=1 -static -I usr/include LDFLAGS += -static endif @@ -86,7 +86,7 @@ psp_rr-objs := psp_rr_main.o psp_rr.o rr.o psp_lib.o $(lib) ext-libs := -lm -lrt -lpthread tcpdevmem_cuda.o: tcpdevmem_cuda.cu - nvcc -arch=sm_90 -O3 -g -I/kernel-includes/include -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM_CUDA -c -o $@ $^ + nvcc -arch=sm_90 -O3 -g -I usr/include -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM_CUDA -c -o $@ $^ tcp_rr: $(tcp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) diff --git a/tcpdevmem_cuda.cu b/tcpdevmem_cuda.cu index 84252c7..fae82ec 100644 --- a/tcpdevmem_cuda.cu +++ b/tcpdevmem_cuda.cu @@ -167,7 +167,6 @@ err_close_dmabuf: int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) { bool is_client = opts->client; - int ret; void *gpu_gen_mem_; int gpu_mem_fd_; int dma_buf_fd_; From 8f183730bd8cd9de6897337c248cd3e952f9e84c Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 2 Jan 2024 19:43:27 +0000 Subject: [PATCH 42/72] tcpd: add tcpdevmem README From fee98c26ffe8885dc26ac14c955bf865489359f1 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 2 Jan 2024 23:36:31 +0000 Subject: [PATCH 43/72] tcpd: refining tcpdevmem readme file addressing pull request comments. 
From fc6e0171d1d6b4972c93eef74b112c3cdc6fc9ee Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 2 Jan 2024 23:59:51 +0000 Subject: [PATCH 44/72] tcpd: readme minor changes according to comments From 262a313242d295be3f7a34f35a8088e89fcf3e15 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 3 Jan 2024 00:07:54 +0000 Subject: [PATCH 45/72] tcpd: short-lived container add note to work with long-running container From 5735fe78511a1d7bd40edf6d1c84b7099a5b641c Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Fri, 9 Feb 2024 16:21:13 +0000 Subject: [PATCH 46/72] increase msg_control buffer size fixes malformed vec when recvmsg buffer_size is too large --- tcpdevmem_cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcpdevmem_cuda.cu b/tcpdevmem_cuda.cu index fae82ec..5256bc9 100644 --- a/tcpdevmem_cuda.cu +++ b/tcpdevmem_cuda.cu @@ -303,7 +303,7 @@ int tcpd_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { client_fd = socket; char buf_dummy[n]; - char offsetbuf[CMSG_SPACE(sizeof(int) * 1000)]; + char offsetbuf[CMSG_SPACE(sizeof(iov) * 10000)]; msg = &msg_local; memset(msg, 0, sizeof(struct msghdr)); From 712f0ad0266455e66f82872f104d3f6a6ff94fcb Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Mon, 12 Feb 2024 19:45:27 +0000 Subject: [PATCH 47/72] driver reset in neper binary, not multi_neper.py --- multi_neper.py | 21 --------------------- tcpdevmem.c | 38 ++++++++++++++++++++++++++++++++++++++ tcpdevmem.h | 1 + thread.c | 10 ++++++++++ 4 files changed, 49 insertions(+), 21 deletions(-) diff --git a/multi_neper.py b/multi_neper.py index 2b3a120..4aa7c8d 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -26,23 +26,6 @@ "eth4": "6" } -def run_pre_neper_cmds(dev: str): - cmds = [ - f"ethtool --set-priv-flags {dev} enable-strict-header-split on", - f"ethtool --set-priv-flags {dev} enable-strict-header-split off", - f"ethtool --set-priv-flags {dev} enable-header-split off", - f"ethtool --set-rxfh-indir {dev} equal 16", - 
f"ethtool -K {dev} ntuple off", - f"ethtool --set-priv-flags {dev} enable-strict-header-split off", - f"ethtool --set-priv-flags {dev} enable-header-split off", - f"ethtool -K {dev} ntuple off", - f"ethtool --set-priv-flags {dev} enable-max-rx-buffer-size on", - f"ethtool -K {dev} ntuple on" - ] - - for cmd in cmds: - subprocess.run(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) - # returns a 2-tuple of a Neper command and a dict of env vars def build_neper_cmd(neper_dir: str, is_client: bool, dev: str, threads: int, flows: int, @@ -168,10 +151,6 @@ def parse_subprocess_outputs(subprocesses): info("setting up flow-steering rules") # src_ips = args.src_ips.split(",") - for i, dev in enumerate(devices): - if not args.dry_run: - run_pre_neper_cmds(dev) - cmds = [] debug(f"running on {devices}") is_client = args.client diff --git a/tcpdevmem.c b/tcpdevmem.c index a445b5d..d8f5551 100644 --- a/tcpdevmem.c +++ b/tcpdevmem.c @@ -11,6 +11,44 @@ #define TEST_PREFIX "ncdevmem" +int driver_reset(const struct options *opts) { + char driver_reset_cmd[512]; + int ret = 0; + + sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-strict-header-split off", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-header-split off", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, "ethtool --set-rxfh-indir %s equal 16", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, "ethtool -K %s ntuple off", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-strict-header-split off", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, 
"ethtool --set-priv-flags %s enable-header-split off", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, "ethtool -K %s ntuple off", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-max-rx-buffer-size on", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + sprintf(driver_reset_cmd, "ethtool -K %s ntuple on", opts->tcpd_link_name); + ret = ret | system(driver_reset_cmd); + + printf("TCPDEVMEM driver reset returning %i\n", ret); + return ret; +} + int install_flow_steering(const struct options *opts, intptr_t buf, struct thread *t) { diff --git a/tcpdevmem.h b/tcpdevmem.h index c59097c..01a985b 100644 --- a/tcpdevmem.h +++ b/tcpdevmem.h @@ -9,6 +9,7 @@ * them as cmsg instead */ #endif +int driver_reset(const struct options *opts); int install_flow_steering(const struct options *opts, intptr_t buf, struct thread *t); int tcpd_setup_socket(int socket); diff --git a/thread.c b/thread.c index ac01395..50dafdd 100644 --- a/thread.c +++ b/thread.c @@ -29,6 +29,9 @@ #include "rusage.h" #include "snaps.h" #include "stats.h" +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#include "tcpdevmem.h" +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ #include "thread.h" #ifndef NO_LIBNUMA @@ -367,6 +370,13 @@ void start_worker_threads(struct options *opts, struct callbacks *cb, int percentiles = percentiles_count(&opts->percentiles); +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) + /* perform driver reset (on host) in anticipation of TCPDEVMEM run */ + if (opts->tcpd_nic_pci_addr && !opts->client) { + driver_reset(opts); + } +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ + for (i = 0; i < opts->num_threads; i++) { t[i].index = i; t[i].fn = fn; From 2852410715977a6dbfdc0f2d616095f20ab57c1b Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Mon, 12 Feb 2024 19:54:25 +0000 Subject: [PATCH 48/72] 
catch driver_reset return value and log --- tcpdevmem.c | 24 +++++++++++++----------- thread.c | 3 ++- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/tcpdevmem.c b/tcpdevmem.c index d8f5551..a4137c2 100644 --- a/tcpdevmem.c +++ b/tcpdevmem.c @@ -10,42 +10,44 @@ #include "thread.h" #define TEST_PREFIX "ncdevmem" +#define RETURN_IF_NON_ZERO(cmd) \ + ret = (cmd); \ + if (ret) return ret; int driver_reset(const struct options *opts) { char driver_reset_cmd[512]; int ret = 0; sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-strict-header-split off", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-header-split off", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); sprintf(driver_reset_cmd, "ethtool --set-rxfh-indir %s equal 16", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); sprintf(driver_reset_cmd, "ethtool -K %s ntuple off", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-strict-header-split off", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-header-split off", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); sprintf(driver_reset_cmd, "ethtool -K %s ntuple off", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); 
sprintf(driver_reset_cmd, "ethtool --set-priv-flags %s enable-max-rx-buffer-size on", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); sprintf(driver_reset_cmd, "ethtool -K %s ntuple on", opts->tcpd_link_name); - ret = ret | system(driver_reset_cmd); + RETURN_IF_NON_ZERO(system(driver_reset_cmd)); - printf("TCPDEVMEM driver reset returning %i\n", ret); return ret; } diff --git a/thread.c b/thread.c index 50dafdd..c22e780 100644 --- a/thread.c +++ b/thread.c @@ -373,7 +373,8 @@ void start_worker_threads(struct options *opts, struct callbacks *cb, #if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) /* perform driver reset (on host) in anticipation of TCPDEVMEM run */ if (opts->tcpd_nic_pci_addr && !opts->client) { - driver_reset(opts); + if (driver_reset(opts)) + LOG_FATAL(cb, "TCPDEVMEM driver reset failed"); } #endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ From e8ae584eed830a4ac88b109109a9c06c90306e5e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 28 Feb 2024 18:48:31 +0000 Subject: [PATCH 49/72] update readme, default image under stable Tested: `./run_neper_container.sh $a_different_image bash` and `./run_neper_container.sh bash` From 9e3ee46dd86925ecd058f94fe4c6e0de323e86f4 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 14 Mar 2024 18:02:39 +0000 Subject: [PATCH 50/72] various code quality improvements Replace udma with udmabuf Clean up error handling (PLOG_FATAL <- perror) Minor details added to tcpdevmem README --- Makefile | 10 +-- check_all_options.c | 4 +- define_all_flags.c | 4 +- flow.c | 20 ++--- lib.h | 4 +- multi_neper.py | 4 +- socket.c | 38 ++++---- stream.c | 29 +++--- tcpdevmem.c | 24 ++--- tcpdevmem.h | 2 +- tcpdevmem_cuda.cu | 43 ++++----- tcpdevmem_cuda.h | 2 +- tcpdevmem_udma.h | 37 -------- tcpdevmem_udma.c => tcpdevmem_udmabuf.c | 114 ++++++++---------------- tcpdevmem_udmabuf.h | 38 ++++++++ thread.c | 8 +- 16 files changed, 160 insertions(+), 221 
deletions(-) delete mode 100644 tcpdevmem_udma.h rename tcpdevmem_udma.c => tcpdevmem_udmabuf.c (75%) create mode 100644 tcpdevmem_udmabuf.h diff --git a/Makefile b/Makefile index a65eaf8..5e12f2e 100644 --- a/Makefile +++ b/Makefile @@ -23,8 +23,8 @@ CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA ifdef WITH_TCPDEVMEM_CUDA CFLAGS += -DWITH_TCPDEVMEM_CUDA -I usr/include endif -ifdef WITH_TCPDEVMEM_UDMA - CFLAGS += -DWITH_TCPDEVMEM_UDMA -DNDEBUG=1 -static -I usr/include +ifdef WITH_TCPDEVMEM_UDMABUF + CFLAGS += -DWITH_TCPDEVMEM_UDMABUF -DNDEBUG=1 -static -I usr/include LDFLAGS += -static endif @@ -63,10 +63,10 @@ tcp_stream-objs := tcp_stream_main.o tcp_stream.o stream.o $(lib) ifdef WITH_TCPDEVMEM_CUDA tcp_stream-objs += tcpdevmem_cuda.o endif -ifdef WITH_TCPDEVMEM_UDMA - tcp_stream-objs += tcpdevmem_udma.o +ifdef WITH_TCPDEVMEM_UDMABUF + tcp_stream-objs += tcpdevmem_udmabuf.o endif -ifneq ($(call ifdef_any_of,WITH_TCPDEVMEM_CUDA WITH_TCPDEVMEM_UDMA),) +ifneq ($(call ifdef_any_of,WITH_TCPDEVMEM_CUDA WITH_TCPDEVMEM_UDMABUF),) tcp_stream-objs += tcpdevmem.o endif diff --git a/check_all_options.c b/check_all_options.c index bfca4c1..7c3c6bc 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -112,7 +112,7 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) } } #endif /* WITH_TCPDEVMEM_CUDA */ -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) if (opts->tcpd_nic_pci_addr) { CHECK(cb, opts->tcpd_phys_len > 0, "Must provide non-zero --tcpd-phys-len flag when running in devmem TCP mode."); @@ -128,7 +128,7 @@ void check_options_tcp_stream(struct options *opts, struct callbacks *cb) "Must provide destination IP address for devmem TCP host."); } } -#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ } void check_options_udp_rr(struct options *opts, struct callbacks *cb) diff --git 
a/define_all_flags.c b/define_all_flags.c index 42234da..f4a1522 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -141,7 +141,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, bool, enable_write, false, 'w', "Write to flows? Enabled by default for the client"); DEFINE_FLAG(fp, bool, enable_tcp_maerts, false, 'M', "Enables TCP_MAERTS test (server writes and client reads). It overrides enable_read, and enable_write"); DEFINE_FLAG(fp, bool, async_connect, false, 0, "use non blocking connect"); -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) DEFINE_FLAG(fp, bool, tcpd_validate, false, 0, "Validates that received data is a repeating sequence of 1 to 111 inclusive"); DEFINE_FLAG(fp, bool, tcpd_rx_cpy, false, 0, "After the CUDA buffer is filled to buffer_size, calls cudaMemcpy to a userspace buffer"); DEFINE_FLAG(fp, const char *, tcpd_nic_pci_addr, 0, 0, "NIC PCI addr, e.x. 
0000:06:00.0"); @@ -152,7 +152,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, const char *, tcpd_link_name, "eth1", 0, "Link name to bind DMA buffer_pages for Rx"); DEFINE_FLAG(fp, int, queue_start, 8, 0, "Queue to start flow-steering at"); DEFINE_FLAG(fp, int, queue_num, 4, 0, "Number of queues to flow-steer to"); -#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ /* Return the updated fp */ return (fp); diff --git a/flow.c b/flow.c index 5f1543e..222bc22 100644 --- a/flow.c +++ b/flow.c @@ -22,9 +22,9 @@ #ifdef WITH_TCPDEVMEM_CUDA #include "tcpdevmem_cuda.h" #endif /* WITH_TCPDEVMEM_CUDA */ -#ifdef WITH_TCPDEVMEM_UDMA -#include "tcpdevmem_udma.h" -#endif /* WITH_TCPDEVMEM_UDMA */ +#ifdef WITH_TCPDEVMEM_UDMABUF +#include "tcpdevmem_udmabuf.h" +#endif /* WITH_TCPDEVMEM_UDMABUF */ /* * We define the flow struct locally to this file to force outside users to go @@ -261,16 +261,10 @@ void flow_delete(struct flow *f) cuda_flow_cleanup(f->f_mbuf); } else #endif /* WITH_TCPDEVMEM_CUDA */ -#ifdef WITH_TCPDEVMEM_UDMA - if (flow_thread(f)->opts->tcpd_nic_pci_addr) { - struct tcpdevmem_udma_mbuf *t_mbuf = (struct tcpdevmem_udma_mbuf *)f->f_mbuf; - - close(t_mbuf->buf_pages); - close(t_mbuf->buf); - close(t_mbuf->memfd); - close(t_mbuf->devfd); - } -#endif /* WITH_TCPDEVMEM_UDMA */ +#ifdef WITH_TCPDEVMEM_UDMABUF + if (flow_thread(f)->opts->tcpd_nic_pci_addr) + udmabuf_flow_cleanup(f->f_mbuf); +#endif /* WITH_TCPDEVMEM_UDMABUF */ /* TODO: need to free the stat struct here for crr tests */ free(f->f_opaque); diff --git a/lib.h b/lib.h index 5e4c840..01008da 100644 --- a/lib.h +++ b/lib.h @@ -106,7 +106,7 @@ struct options { bool async_connect; /* tcp_stream */ -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) bool tcpd_validate; bool tcpd_rx_cpy; const char *tcpd_nic_pci_addr; @@ -117,7 
+117,7 @@ struct options { const char *tcpd_link_name; int queue_start; int queue_num; -#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ bool enable_read; bool enable_write; bool enable_tcp_maerts; diff --git a/multi_neper.py b/multi_neper.py index 4aa7c8d..7bc5fc4 100755 --- a/multi_neper.py +++ b/multi_neper.py @@ -135,7 +135,7 @@ def parse_subprocess_outputs(subprocesses): parser.add_argument("-l", "--length", default=10) parser.add_argument("--log", default="WARNING") - parser.add_argument("-m", "--mode", default="cuda", help="cuda|udma|default") + parser.add_argument("-m", "--mode", default="cuda", help="cuda|udmabuf|default") args = parser.parse_args() @@ -158,7 +158,7 @@ def parse_subprocess_outputs(subprocesses): for i, dev in enumerate(devices): nic_pci, gpu_pci = None, None - if args.mode.lower() in ["cuda", "udma"]: + if args.mode.lower() in ["cuda", "udmabuf"]: nic_pci = link_to_nic_pci_addr[dev] if args.mode.lower() == "cuda": gpu_pci = link_to_gpu_pci_addr[dev] diff --git a/socket.c b/socket.c index 287d38d..703cbef 100644 --- a/socket.c +++ b/socket.c @@ -21,10 +21,10 @@ #ifdef WITH_TCPDEVMEM_CUDA #include "tcpdevmem_cuda.h" #endif -#ifdef WITH_TCPDEVMEM_UDMA -#include "tcpdevmem_udma.h" +#ifdef WITH_TCPDEVMEM_UDMABUF +#include "tcpdevmem_udmabuf.h" #endif -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) #include "tcpdevmem.h" #endif @@ -78,26 +78,22 @@ static void socket_init_not_established(struct thread *t, int s) } #ifdef WITH_TCPDEVMEM_CUDA if (!t->f_mbuf && opts->tcpd_gpu_pci_addr) { - if (tcpd_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { + if (tcpd_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) LOG_FATAL(t->cb, "%s: failed to setup devmem CUDA socket", __func__); - exit(1); - } } #endif /* WITH_TCPDEVMEM_CUDA */ -#ifdef WITH_TCPDEVMEM_UDMA +#ifdef WITH_TCPDEVMEM_UDMABUF if (!t->f_mbuf && 
opts->tcpd_nic_pci_addr) { - if (udma_setup_alloc(t->opts, &t->f_mbuf, t)) { + if (udmabuf_setup_alloc(t->opts, &t->f_mbuf, t)) LOG_FATAL(t->cb, "%s: failed to setup devmem UDMABUF socket", __func__); - exit(1); - } } -#endif /* WITH_TCPDEVMEM_UDMA */ -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#endif /* WITH_TCPDEVMEM_UDMABUF */ +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) if (opts->tcpd_nic_pci_addr) - tcpd_setup_socket(s); -#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ + tcpd_setup_socket(t, s); +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ } /* @@ -271,11 +267,11 @@ void socket_listen(struct thread *t) struct addrinfo *ai = getaddrinfo_or_die(opts->host, opts->port, &hints, cb); int port = atoi(opts->port); -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) - /* TCP Devmem: - * Since each thread has a CUDA buffer, and +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) + /* TCPDevmem: + * Since each thread has a dma buffer, and * flow-steering rules are required, threads, TCP connections, and - * CUDA buffers need to be 1:1:1. + * dma buffers need to be 1:1:1. * * We enforce that by co-opting the num_ports option. * @@ -287,7 +283,7 @@ void socket_listen(struct thread *t) port += t->index; reset_port(ai, port, cb); } -#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ int i, n, s; @@ -304,7 +300,7 @@ void socket_listen(struct thread *t) switch (ai->ai_socktype) { case SOCK_STREAM: n = opts->num_ports ? 
opts->num_ports : 1; -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) /* TCP Devmem: * See TCP Devmem comment above^ * @@ -314,7 +310,7 @@ void socket_listen(struct thread *t) */ if (opts->tcpd_nic_pci_addr) n = 1; -#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ for (i = 0; i < n; i++) { s = socket_bind_listener(t, ai); socket_init_not_established(t, s); diff --git a/stream.c b/stream.c index 4f5011f..6dfdc37 100644 --- a/stream.c +++ b/stream.c @@ -26,8 +26,8 @@ #ifdef WITH_TCPDEVMEM_CUDA #include "tcpdevmem_cuda.h" #endif -#ifdef WITH_TCPDEVMEM_UDMA -#include "tcpdevmem_udma.h" +#ifdef WITH_TCPDEVMEM_UDMABUF +#include "tcpdevmem_udmabuf.h" #endif static void *stream_alloc(struct thread *t) @@ -43,15 +43,15 @@ static void *stream_alloc(struct thread *t) } } #endif /* WITH_TCPDEVMEM_CUDA */ -#ifdef WITH_TCPDEVMEM_UDMA +#ifdef WITH_TCPDEVMEM_UDMABUF if (!t->f_mbuf && t->opts->tcpd_nic_pci_addr) { - if (udma_setup_alloc(t->opts, &t->f_mbuf, t)) { + if (udmabuf_setup_alloc(t->opts, &t->f_mbuf, t)) { LOG_FATAL(t->cb, "%s: failed to setup devmem UDMABUF socket", __func__); exit(1); } } -#endif /* WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_UDMABUF */ if (!t->f_mbuf) { t->f_mbuf = malloc_or_die(opts->buffer_size, t->cb); @@ -118,13 +118,13 @@ void stream_handler(struct flow *f, uint32_t events) t); else #endif /* WITH_TCPDEVMEM_CUDA */ -#ifdef WITH_TCPDEVMEM_UDMA +#ifdef WITH_TCPDEVMEM_UDMABUF if (t->opts->tcpd_nic_pci_addr) - n = udma_recv(fd, mbuf, + n = udmabuf_recv(fd, mbuf, opts->buffer_size, t); else -#endif /* WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_UDMABUF */ n = recv(fd, mbuf, opts->buffer_size, opts->recv_flags); } while(n == -1 && errno == EINTR); @@ -144,15 +144,18 @@ void stream_handler(struct flow *f, uint32_t events) do { #ifdef WITH_TCPDEVMEM_CUDA if (t->opts->tcpd_gpu_pci_addr) { - n = 
tcpd_send(fd, mbuf, opts->buffer_size, opts->send_flags); + n = tcpd_send(fd, mbuf, opts->buffer_size, opts->send_flags, t); } else #endif /* WITH_TCPDEVMEM_CUDA */ -#ifdef WITH_TCPDEVMEM_UDMA +#ifdef WITH_TCPDEVMEM_UDMABUF if (t->opts->tcpd_nic_pci_addr) { - n = udma_send(fd, mbuf, - opts->buffer_size, opts->send_flags); + n = udmabuf_send(fd, + mbuf, + opts->buffer_size, + opts->send_flags, + t); } else -#endif /* WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_UDMABUF */ n = send(fd, mbuf, opts->buffer_size, opts->send_flags); if (n == -1) { if (errno != EAGAIN) diff --git a/tcpdevmem.c b/tcpdevmem.c index a4137c2..c16d0b9 100644 --- a/tcpdevmem.c +++ b/tcpdevmem.c @@ -12,7 +12,7 @@ #define TEST_PREFIX "ncdevmem" #define RETURN_IF_NON_ZERO(cmd) \ ret = (cmd); \ - if (ret) return ret; + if (ret) return ret int driver_reset(const struct options *opts) { char driver_reset_cmd[512]; @@ -67,11 +67,7 @@ int install_flow_steering(const struct options *opts, intptr_t buf, ret = ioctl(buf, DMA_BUF_PAGES_BIND_RX, &bind_cmd); if (ret < 0) - { - printf("%s: [FAIL, bind fail queue=%d]\n", TEST_PREFIX, - num_queues); - exit(78); - } + LOG_FATAL(t->cb, "FAIL, bind fail queue=%d", num_queues); /* using t->index below requires 1 thread listening to 1 port * (see relevant comments in socket.c) @@ -84,20 +80,20 @@ int install_flow_steering(const struct options *opts, intptr_t buf, "ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %i dst-port %i queue %i", opts->tcpd_link_name, opts->tcpd_src_ip, opts->tcpd_dst_ip, src_port, dst_port, num_queues); - ret = system(flow_steer_cmd); + RETURN_IF_NON_ZERO(system(flow_steer_cmd)); // only running the below ethtool commands after last thread/flow is setup if (t->index == opts->num_flows - 1) { char ethtool_cmd[512]; sprintf(ethtool_cmd, "ethtool --set-priv-flags %s enable-strict-header-split on", opts->tcpd_link_name); - ret = ret | system(ethtool_cmd); + RETURN_IF_NON_ZERO(system(ethtool_cmd)); sprintf(ethtool_cmd, "ethtool 
--set-priv-flags %s enable-header-split on", opts->tcpd_link_name); - ret = ret | system(ethtool_cmd); + RETURN_IF_NON_ZERO(system(ethtool_cmd)); sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpd_link_name); - ret = ret | system(ethtool_cmd); + RETURN_IF_NON_ZERO(system(ethtool_cmd)); printf("ethtool cmds returned %i, sleeping 1...\n", ret); sleep(1); @@ -105,15 +101,13 @@ int install_flow_steering(const struct options *opts, intptr_t buf, return ret; } -int tcpd_setup_socket(int socket) +int tcpd_setup_socket(struct thread *t, int socket) { const int one = 1; if (setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) || setsockopt(socket, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) || setsockopt(socket, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one))) - { - perror("tcpd_setup_socket"); - exit(EXIT_FAILURE); - } + PLOG_FATAL(t->cb, "tcpd_setup_socket"); + return 0; } diff --git a/tcpdevmem.h b/tcpdevmem.h index 01a985b..937aa26 100644 --- a/tcpdevmem.h +++ b/tcpdevmem.h @@ -12,4 +12,4 @@ int driver_reset(const struct options *opts); int install_flow_steering(const struct options *opts, intptr_t buf, struct thread *t); -int tcpd_setup_socket(int socket); +int tcpd_setup_socket(struct thread *t, int socket); diff --git a/tcpdevmem_cuda.cu b/tcpdevmem_cuda.cu index 5256bc9..a0cf8f0 100644 --- a/tcpdevmem_cuda.cu +++ b/tcpdevmem_cuda.cu @@ -121,20 +121,19 @@ void gather_rx_data(struct tcpdevmem_cuda_mbuf *tmbuf) { int get_gpumem_dmabuf_pages_fd(const std::string& gpu_pci_addr, const std::string& nic_pci_addr, void* gpu_mem, - size_t gpu_mem_sz, int* dma_buf_fd, bool is_client) { + size_t gpu_mem_sz, int* dma_buf_fd, bool is_client, + struct thread *t) { int err, ret; cuMemGetHandleForAddressRange((void*)dma_buf_fd, (CUdeviceptr)gpu_mem, gpu_mem_sz, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0); - if (*dma_buf_fd < 0) { - perror("cuMemGetHandleForAddressRange() failed!: "); - exit(EXIT_FAILURE); - } + if (*dma_buf_fd < 0) + PLOG_FATAL(t->cb, 
"cuMemGetHandleForAddressRange"); printf("Registered dmabuf region 0x%p of %lu Bytes\n", - gpu_mem, gpu_mem_sz); + gpu_mem, gpu_mem_sz); struct dma_buf_create_pages_info frags_create_info; frags_create_info.dma_buf_fd = *dma_buf_fd; @@ -153,7 +152,7 @@ int get_gpumem_dmabuf_pages_fd(const std::string& gpu_pci_addr, ret = ioctl(*dma_buf_fd, DMA_BUF_CREATE_PAGES, &frags_create_info); if (ret < 0) { - perror("Error getting dma_buf frags: "); + PLOG_ERROR(t->cb, "get dma_buf frags"); err = -EIO; goto err_close_dmabuf; } @@ -199,12 +198,10 @@ int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thre gpu_mem_fd_ = get_gpumem_dmabuf_pages_fd(gpu_pci_addr, nic_pci_addr, gpu_gen_mem_, alloc_size, - &dma_buf_fd_, is_client); + &dma_buf_fd_, is_client, t); - if (gpu_mem_fd_ < 0) { - printf("get_gpumem_dmabuf_pages_fd() failed!: "); - exit(71); - } + if (gpu_mem_fd_ < 0) + LOG_FATAL(t->cb, "get_gpumem_dmabuf_pages_fd"); if (!is_client) install_flow_steering(opts, gpu_mem_fd_, t); @@ -226,7 +223,7 @@ int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thre return 0; } -int tcpd_send(int socket, void *buf, size_t n, int flags) { +int tcpd_send(int socket, void *buf, size_t n, int flags, struct thread *t) { int gpu_mem_fd_; struct iovec iov; struct msghdr msg; @@ -240,7 +237,6 @@ int tcpd_send(int socket, void *buf, size_t n, int flags) { gpu_mem_fd_ = tmbuf->gpu_mem_fd_; memset(&msg, 0, sizeof(msg)); - // memset(cmsg, 0, sizeof(struct cmsghdr)); iov.iov_base = NULL; iov.iov_len = n - tmbuf->bytes_sent; @@ -259,15 +255,11 @@ int tcpd_send(int socket, void *buf, size_t n, int flags) { ((int *)CMSG_DATA(cmsg))[1] = (int)tmbuf->bytes_sent; ssize_t bytes_sent = sendmsg(socket, &msg, MSG_ZEROCOPY | MSG_DONTWAIT); - if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) { - perror("sendmsg() error: "); - exit(EXIT_FAILURE); - } + if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) + PLOG_FATAL(t->cb, "sendmsg"); - if 
(bytes_sent == 0) { - perror("sendmsg() sent 0 bytes. Something is wrong.\n"); - exit(EXIT_FAILURE); - } + if (bytes_sent == 0) + PLOG_FATAL(t->cb, "sendmsg sent 0 bytes"); tmbuf->bytes_sent += bytes_sent; if (tmbuf->bytes_sent == n) @@ -431,10 +423,9 @@ int tcpd_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { ret = setsockopt(client_fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, tokens->data(), tokens->size() * sizeof(devmemtoken)); - if (ret) { - perror("DONTNEED failed"); - exit(1); - } + if (ret) + PLOG_FATAL(t->cb, "setsockopt DONTNEED failed"); + vectors->clear(); tokens->clear(); rx_blks_->clear(); diff --git a/tcpdevmem_cuda.h b/tcpdevmem_cuda.h index 9f67636..bdf12b3 100644 --- a/tcpdevmem_cuda.h +++ b/tcpdevmem_cuda.h @@ -31,7 +31,7 @@ struct tcpdevmem_cuda_mbuf { int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t); int cuda_flow_cleanup(void *f_mbuf); -int tcpd_send(int socket, void *buf, size_t n, int flags); +int tcpd_send(int socket, void *buf, size_t n, int flags, struct thread *t); int tcpd_recv(int fd, void *f_mbuf, size_t n, int flags, struct thread *t); #if __cplusplus diff --git a/tcpdevmem_udma.h b/tcpdevmem_udma.h deleted file mode 100644 index f90b6fa..0000000 --- a/tcpdevmem_udma.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef THIRD_PARTY_NEPER_DEVMEM_UDMA_H_ -#define THIRD_PARTY_NEPER_DEVMEM_UDMA_H_ - -#if __cplusplus -extern "C" { -#endif - -#include - -#include "common.h" -#include "flags.h" -#include "lib.h" - -#define UDMABUF_CREATE _IOW('u', 0x42, struct udmabuf_create) - -struct tcpdevmem_udma_mbuf { - struct msghdr msg; - int dmabuf_fd; - int pages_fd; - - int devfd; - int memfd; - int buf; - int buf_pages; - size_t bytes_sent; -}; - -int udma_setup_alloc(const struct options *opts, void **f_mbuf, - struct thread *t); -int udma_send(int socket, void *f_mbuf, size_t n, int flags); -int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t); - -#if __cplusplus -} -#endif - -#endif // 
THIRD_PARTY_NEPER_DEVMEM_UDMA_H_ diff --git a/tcpdevmem_udma.c b/tcpdevmem_udmabuf.c similarity index 75% rename from tcpdevmem_udma.c rename to tcpdevmem_udmabuf.c index 88f10c2..4c5db38 100644 --- a/tcpdevmem_udma.c +++ b/tcpdevmem_udmabuf.c @@ -16,12 +16,10 @@ #include "lib.h" #include "logging.h" #include "tcpdevmem.h" -#include "tcpdevmem_udma.h" +#include "tcpdevmem_udmabuf.h" #include "thread.h" -#define TEST_PREFIX "ncdevmem_udma" - -int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) +int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t) { bool is_client = opts->client; int devfd; @@ -31,7 +29,7 @@ int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t int ret; size_t size = opts->tcpd_phys_len; - struct tcpdevmem_udma_mbuf *tmbuf; + struct tcpdevmem_udmabuf_mbuf *tmbuf; struct dma_buf_create_pages_info pages_create_info; struct udmabuf_create create; @@ -41,40 +39,26 @@ int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t if (*f_mbuf) return 0; - tmbuf = (struct tcpdevmem_udma_mbuf *)calloc(1, sizeof(struct tcpdevmem_udma_mbuf)); + tmbuf = (struct tcpdevmem_udmabuf_mbuf *)calloc(1, sizeof(struct tcpdevmem_udmabuf_mbuf)); if (!tmbuf) - { - exit(EXIT_FAILURE); - } + LOG_FATAL(t->cb, "calloc udmabuf"); devfd = open("/dev/udmabuf", O_RDWR); if (devfd < 0) - { - printf("%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", - TEST_PREFIX); - exit(70); - } + LOG_FATAL(t->cb, "[skip,no-udmabuf: Unable to access DMA buffer device file]"); memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); if (memfd < 0) - { - printf("%s: [skip,no-memfd]\n", TEST_PREFIX); - exit(72); - } + LOG_FATAL(t->cb, "[skip,no-memfd]"); + ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK); if (ret < 0) - { - printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); - exit(73); - } + LOG_FATAL(t->cb, "[skip,fcntl-add-seals]"); ret = ftruncate(memfd, size); if (ret == -1) 
- { - printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); - exit(74); - } + LOG_FATAL(t->cb, "[FAIL,memfd-truncate]\n"); memset(&create, 0, sizeof(create)); @@ -84,10 +68,7 @@ int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t printf("size=%lu\n", size); buf = ioctl(devfd, UDMABUF_CREATE, &create); if (buf < 0) - { - printf("%s: [FAIL, create udmabuf] %i\n", TEST_PREFIX, buf); - exit(75); - } + LOG_FATAL(t->cb, "[FAIL, create udmabuf]"); pages_create_info.dma_buf_fd = buf; pages_create_info.create_page_pool = is_client ? 0 : 1; @@ -98,17 +79,11 @@ int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t &pages_create_info.pci_bdf[2]); if (ret != 3) - { - printf("%s: [FAIL, parse fail]\n", TEST_PREFIX); - exit(76); - } + LOG_FATAL(t->cb, "[FAIL, parse fail]"); buf_pages = ioctl(buf, DMA_BUF_CREATE_PAGES, &pages_create_info); if (buf_pages < 0) - { - perror("ioctl DMA_BUF_CREATE_PAGES: [FAIL, create pages fail]\n"); - exit(77); - } + PLOG_FATAL(t->cb, "ioctl DMA_BUF_CREATE_PAGES: [FAIL, create pages fail]"); if (!is_client) install_flow_steering(opts, buf_pages, t); @@ -127,7 +102,7 @@ int udma_setup_alloc(const struct options *opts, void **f_mbuf, struct thread *t return 0; } -int udma_send(int socket, void *f_mbuf, size_t n, int flags) +int udmabuf_send(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { int buf_pages, buf; struct iovec iov; @@ -135,12 +110,12 @@ int udma_send(int socket, void *f_mbuf, size_t n, int flags) struct cmsghdr *cmsg; char buf_dummy[n]; char offsetbuf[CMSG_SPACE(sizeof(uint32_t) * 2)]; - struct tcpdevmem_udma_mbuf *tmbuf; + struct tcpdevmem_udmabuf_mbuf *tmbuf; if (!f_mbuf) return -1; - tmbuf = (struct tcpdevmem_udma_mbuf *)f_mbuf; + tmbuf = (struct tcpdevmem_udmabuf_mbuf *)f_mbuf; buf_pages = tmbuf->buf_pages; buf = tmbuf->buf; msg = &tmbuf->msg; @@ -152,10 +127,7 @@ int udma_send(int socket, void *f_mbuf, size_t n, int flags) char *buf_mem = NULL; buf_mem = (char 
*)mmap(NULL, n, PROT_READ | PROT_WRITE, MAP_SHARED, buf, 0); if (buf_mem == MAP_FAILED) - { - perror("mmap()"); - exit(1); - } + PLOG_FATAL(t->cb, "mmap()"); memcpy(buf_mem, buf_dummy, n); @@ -185,16 +157,10 @@ int udma_send(int socket, void *f_mbuf, size_t n, int flags) ssize_t bytes_sent = sendmsg(socket, msg, MSG_ZEROCOPY); if (bytes_sent < 0 && errno != EWOULDBLOCK && errno != EAGAIN) - { - perror("sendmsg() error: "); - exit(EXIT_FAILURE); - } + PLOG_FATAL(t->cb, "sendmsg"); if (bytes_sent == 0) - { - perror("sendmsg() sent 0 bytes. Something is wrong.\n"); - exit(EXIT_FAILURE); - } + PLOG_FATAL(t->cb, "sendmsg sent 0 bytes"); tmbuf->bytes_sent += bytes_sent; if (tmbuf->bytes_sent == n) @@ -203,9 +169,9 @@ int udma_send(int socket, void *f_mbuf, size_t n, int flags) return bytes_sent; } -int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) +int udmabuf_recv(int socket, void *f_mbuf, size_t n, struct thread *t) { - struct tcpdevmem_udma_mbuf *tmbuf = (struct tcpdevmem_udma_mbuf *)f_mbuf; + struct tcpdevmem_udmabuf_mbuf *tmbuf = (struct tcpdevmem_udmabuf_mbuf *)f_mbuf; bool is_devmem = false; size_t total_received = 0; size_t page_aligned_frags = 0; @@ -227,29 +193,23 @@ int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) msg.msg_control = ctrl_data; msg.msg_controllen = sizeof(ctrl_data); ssize_t ret = recvmsg(socket, &msg, MSG_SOCK_DEVMEM); - if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) - { + if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { return -1; } if (ret < 0) - { PLOG_FATAL(t->cb, "recvmsg:"); - exit(1); - } - if (ret == 0) - { + + if (ret == 0) { LOG_ERROR(t->cb, "client exited"); return -1; } struct cmsghdr *cm = NULL; struct devmemvec *devmemvec = NULL; - for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) - { + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { if (cm->cmsg_level != SOL_SOCKET || (cm->cmsg_type != SCM_DEVMEM_OFFSET && - cm->cmsg_type != 
SCM_DEVMEM_HEADER)) - { + cm->cmsg_type != SCM_DEVMEM_HEADER)) { LOG_ERROR(t->cb, "found weird cmsg"); continue; } @@ -258,15 +218,11 @@ int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) devmemvec = (struct devmemvec *)CMSG_DATA(cm); if (cm->cmsg_type == SCM_DEVMEM_HEADER) - { // TODO: process data copied from skb's linear // buffer. LOG_FATAL(t->cb, "SCM_DEVMEM_HEADER. devmemvec->frag_size=%u", devmemvec->frag_size); - exit(1); - continue; - } struct devmemtoken token = {devmemvec->frag_token, 1}; @@ -288,22 +244,26 @@ int udma_recv(int socket, void *f_mbuf, size_t n, struct thread *t) SO_DEVMEM_DONTNEED, &token, sizeof(token)); if (ret) - { PLOG_FATAL(t->cb, "DONTNEED failed"); - exit(1); - } } - if (!is_devmem) - { + if (!is_devmem) { flow_steering_flakes++; is_devmem = false; total_received += ret; } - if (flow_steering_flakes) { + if (flow_steering_flakes) LOG_WARN(t->cb, "total_received=%lu flow_steering_flakes=%lu", total_received, flow_steering_flakes); - } return total_received; } + +void udmabuf_flow_cleanup(void *f_mbuf) { + struct tcpdevmem_udmabuf_mbuf *t_mbuf = (struct tcpdevmem_udmabuf_mbuf *)f_mbuf; + + close(t_mbuf->buf_pages); + close(t_mbuf->buf); + close(t_mbuf->memfd); + close(t_mbuf->devfd); +} diff --git a/tcpdevmem_udmabuf.h b/tcpdevmem_udmabuf.h new file mode 100644 index 0000000..391caf1 --- /dev/null +++ b/tcpdevmem_udmabuf.h @@ -0,0 +1,38 @@ +#ifndef THIRD_PARTY_NEPER_DEVMEM_UDMABUF_H_ +#define THIRD_PARTY_NEPER_DEVMEM_UDMABUF_H_ + +#if __cplusplus +extern "C" { +#endif + +#include + +#include "common.h" +#include "flags.h" +#include "lib.h" + +#define UDMABUF_CREATE _IOW('u', 0x42, struct udmabuf_create) + +struct tcpdevmem_udmabuf_mbuf { + struct msghdr msg; + int dmabuf_fd; + int pages_fd; + + int devfd; + int memfd; + int buf; + int buf_pages; + size_t bytes_sent; +}; + +int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf, + struct thread *t); +int udmabuf_send(int socket, void *f_mbuf, size_t n, int 
flags, struct thread *t); +int udmabuf_recv(int socket, void *f_mbuf, size_t n, struct thread *t); +void udmabuf_flow_cleanup(void *f_mbuf); + +#if __cplusplus +} +#endif + +#endif // THIRD_PARTY_NEPER_DEVMEM_UDMABUF_H_ diff --git a/thread.c b/thread.c index c22e780..0a37a7b 100644 --- a/thread.c +++ b/thread.c @@ -29,9 +29,9 @@ #include "rusage.h" #include "snaps.h" #include "stats.h" -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) #include "tcpdevmem.h" -#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ #include "thread.h" #ifndef NO_LIBNUMA @@ -370,13 +370,13 @@ void start_worker_threads(struct options *opts, struct callbacks *cb, int percentiles = percentiles_count(&opts->percentiles); -#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMA) +#if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) /* perform driver reset (on host) in anticipation of TCPDEVMEM run */ if (opts->tcpd_nic_pci_addr && !opts->client) { if (driver_reset(opts)) LOG_FATAL(cb, "TCPDEVMEM driver reset failed"); } -#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMA */ +#endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ for (i = 0; i < opts->num_threads; i++) { t[i].index = i; From 1c1ef8d71a14b068df5e6efad4dde50dc11890d3 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 9 Apr 2024 20:04:32 +0000 Subject: [PATCH 51/72] manually override kernel hdrs dir when making --- Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 5e12f2e..855611d 100644 --- a/Makefile +++ b/Makefile @@ -20,11 +20,13 @@ all: binaries CFLAGS := -std=c99 -Wall -O3 -g -D_GNU_SOURCE -DNO_LIBNUMA +HEADERS_DIR := usr/include + ifdef WITH_TCPDEVMEM_CUDA - CFLAGS += -DWITH_TCPDEVMEM_CUDA -I usr/include + CFLAGS += -DWITH_TCPDEVMEM_CUDA -I $(HEADERS_DIR) endif ifdef WITH_TCPDEVMEM_UDMABUF 
- CFLAGS += -DWITH_TCPDEVMEM_UDMABUF -DNDEBUG=1 -static -I usr/include + CFLAGS += -DWITH_TCPDEVMEM_UDMABUF -DNDEBUG=1 -static -I $(HEADERS_DIR) LDFLAGS += -static endif @@ -86,7 +88,7 @@ psp_rr-objs := psp_rr_main.o psp_rr.o rr.o psp_lib.o $(lib) ext-libs := -lm -lrt -lpthread tcpdevmem_cuda.o: tcpdevmem_cuda.cu - nvcc -arch=sm_90 -O3 -g -I usr/include -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM_CUDA -c -o $@ $^ + nvcc -arch=sm_90 -O3 -g -I $(HEADERS_DIR) -D_GNU_SOURCE -DNO_LIBNUMA -DWITH_TCPDEVMEM_CUDA -c -o $@ $^ tcp_rr: $(tcp_rr-objs) $(CC) $(LDFLAGS) -o $@ $^ $(ext-libs) From 8d119b900edeb669f7cf294d59db335a80fb6bc2 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 9 Apr 2024 20:18:10 +0000 Subject: [PATCH 52/72] avoid collision with existing Dockerfile in main branch --- Dockerfile | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index f3eb11c..0000000 --- a/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM nvidia/cuda:12.0.0-devel-ubuntu20.04 - -ENV DEBIAN_FRONTEND='noninteractive' - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - git openssh-server wget iproute2 vim libopenmpi-dev build-essential cmake gdb \ - protobuf-compiler libprotobuf-dev rsync libssl-dev \ - && rm -rf /var/lib/apt/lists/* - -ARG CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90' -ARG CUDA12_PTX='-gencode=arch=compute_90,code=compute_90' - -WORKDIR /third_party - -# this assumes that kernel hdr files have been copied into ${neper_dir}/usr/, -# which will then be copied into the container -COPY . 
/third_party -RUN make tcp_stream WITH_TCPDEVMEM_CUDA=1 - -RUN chmod +777 /tmp -RUN apt-get update -RUN apt-get install -y python3 sysstat ethtool From 9793ada3e92d0630ad708c4ec363fc379755f91e Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 9 Apr 2024 20:21:41 +0000 Subject: [PATCH 53/72] update tcpdevmem README to specify correct Dockerfile From 38c6c2ff4128a284202e8566ac6d7abd419921d7 Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Wed, 29 Nov 2023 17:39:53 +0100 Subject: [PATCH 54/72] neper: support 64 bit for pacing, also allowed on server Linux 5.10 extended SO_MAX_PACING_RATE to 64 bit, allowing more than 4GB/s per socket. Adjust variable sizes to pass the full 64 bits to the kernel. Earlier kernels will silently truncate to 32 bits. On passing, also move the setsockopt to after socket is established so it can be used on the server side as well. Tested: run experiment with argument bigger than 32 bits. (It takes a fast link to verify that a single flow can achieve the requested rate). 
--- check_all_options.c | 4 ++-- define_all_flags.c | 2 +- socket.c | 10 ++++++---- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/check_all_options.c b/check_all_options.c index 7c3c6bc..90b3564 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -52,8 +52,8 @@ void check_options_common(struct options *opts, struct callbacks *cb) "Number of epoll events must be positive."); CHECK(cb, opts->max_pacing_rate >= 0, "Max pacing rate must be non-negative."); - CHECK(cb, opts->max_pacing_rate <= UINT32_MAX, - "Max pacing rate cannot exceed 32 bits."); + CHECK(cb, opts->max_pacing_rate <= UINT64_MAX, + "Max pacing rate cannot exceed 64 bits."); } void check_options_tcp(struct options *opts, struct callbacks *cb) diff --git a/define_all_flags.c b/define_all_flags.c index f4a1522..3f55672 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -44,7 +44,7 @@ struct flags_parser *add_flags_common(struct flags_parser *fp) DEFINE_FLAG(fp, bool, nonblocking, false, 0, "Make sure syscalls are all nonblocking"); DEFINE_FLAG(fp, bool, freebind, false, 0, "Set FREEBIND socket option"); DEFINE_FLAG(fp, double, interval, 1.0, 'I', "For how many seconds that a sample is generated"); - DEFINE_FLAG(fp, long long, max_pacing_rate, 0, 'm', "SO_MAX_PACING_RATE value; use as 32-bit unsigned"); + DEFINE_FLAG(fp, long long, max_pacing_rate, 0, 'm', "SO_MAX_PACING_RATE value; use as 64-bit unsigned"); DEFINE_FLAG_PARSER(fp, max_pacing_rate, parse_max_pacing_rate); DEFINE_FLAG(fp, int, mark, 0, 'M', "SO_MARK value; use as 32-bit unsigned"); DEFINE_FLAG(fp, const char *, local_hosts, NULL, 'L', "Local hostnames or IP addresses"); diff --git a/socket.c b/socket.c index 703cbef..f5e3688 100644 --- a/socket.c +++ b/socket.c @@ -54,10 +54,6 @@ static void socket_init_not_established(struct thread *t, int s) if (opts->debug) set_debug(s, 1, cb); - if (opts->max_pacing_rate) { - uint32_t m = opts->max_pacing_rate; - setsockopt(s, SOL_SOCKET, SO_MAX_PACING_RATE, &m, 
sizeof(m)); - } if (opts->mark) set_mark(s, opts->mark, cb); if (opts->reuseaddr) @@ -103,8 +99,14 @@ static void socket_init_not_established(struct thread *t, int s) static void socket_init_established(struct thread *t, int s) { + const struct options *opts = t->opts; struct callbacks *cb = t->cb; + if (opts->max_pacing_rate) { + /* kernels before 5.10 will silently truncate to 32 bits */ + uint64_t m = opts->max_pacing_rate; + setsockopt(s, SOL_SOCKET, SO_MAX_PACING_RATE, &m, sizeof(m)); + } set_nonblocking(s, cb); } From c4ba1ff39fe7f6f5b82133ff2f6c3ee7fd54ea98 Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 30 Nov 2023 11:52:20 +0100 Subject: [PATCH 55/72] neper: new option --iostat-ms N prints io statistics periodically This option on the client side periodically prints io statistics (tx and rx operations, bytes and Mbps) so it is possible to monitor throughput variations in real time. netperf has a similar option. Tested: run with --iostat-ms 1000 --- control_plane.c | 57 ++++++++++++++++++++++++++++++++++++++++++++-- control_plane.h | 3 ++- define_all_flags.c | 1 + lib.h | 1 + rr.c | 26 ++++++++++++++++++--- stream.c | 4 ++++ thread.c | 2 +- thread.h | 9 ++++++++ 8 files changed, 96 insertions(+), 7 deletions(-) diff --git a/control_plane.c b/control_plane.c index 03c0cb7..3fc2f77 100644 --- a/control_plane.c +++ b/control_plane.c @@ -25,6 +25,7 @@ #include "hexdump.h" #include "lib.h" #include "logging.h" +#include "thread.h" /* * Client and server exchange typed (struct hs_msg) on the control @@ -376,15 +377,67 @@ static void sig_alarm_handler(int sig) termination_requested = 1; } -void control_plane_wait_until_done(struct control_plane *cp) +static inline uint64_t clock_now(void) { + struct timespec t; + + common_gettime(&t); + return t.tv_nsec + t.tv_sec * 1000000000ul; +} + +struct print_io_stats_info { + struct callbacks *cb; + struct thread *t; + int num_threads; + uint64_t start_ns; + uint64_t prev_ns; + struct io_stats prev; +}; + +static void 
print_io_stats(struct print_io_stats_info *s) +{ + const uint64_t now = clock_now(); + const double dt = 1e-9 * (now - s->prev_ns); + struct io_stats cur = {}, prev = s->prev; + + /* Accumulate per-thread stats */ + for (int i = 0; i < s->num_threads; i++) { + cur.tx_ops += s->t[i].io_stats.tx_ops; + cur.tx_bytes += s->t[i].io_stats.tx_bytes; + cur.rx_ops += s->t[i].io_stats.rx_ops; + cur.rx_bytes += s->t[i].io_stats.rx_bytes; + } + /* save totals for next round */ + s->prev = cur; + s->prev_ns = now; + + /* compute deltas for this interval */ + cur.tx_ops -= prev.tx_ops; + cur.tx_bytes -= prev.tx_bytes; + cur.rx_ops -= prev.rx_ops; + cur.rx_bytes -= prev.rx_bytes; + PRINT(s->cb, "t", + "%-10.3lf TX: %6ld ops, %10ld bytes, %8.1lf Mbps; RX: %6ld ops, %10ld bytes, %8.1f Mbps;", + (double)(now - s->start_ns) * 1e-9, + cur.tx_ops, cur.tx_bytes, cur.tx_bytes * 8 * 1e-6 /dt, + cur.rx_ops, cur.rx_bytes, cur.rx_bytes * 8 * 1e-6 / dt); +} + +void control_plane_wait_until_done(struct control_plane *cp, struct thread *t) +{ + struct print_io_stats_info s = { + .cb = cp->cb, .t = t, .num_threads = cp->opts->num_threads, + .start_ns = clock_now(), .prev_ns = clock_now()}; if (cp->opts->client) { if (cp->opts->test_length > 0) { signal(SIGALRM, sig_alarm_handler); signal(SIGTERM, sig_alarm_handler); alarm(cp->opts->test_length); + const int sleep_ms = cp->opts->iostat_ms ? 
: 1000; while (!termination_requested) { - sleep(1); + usleep(sleep_ms * 1000); + if (cp->opts->iostat_ms) + print_io_stats(&s); } LOG_INFO(cp->cb, "finished sleep"); } else if (cp->opts->test_length < 0) { diff --git a/control_plane.h b/control_plane.h index b59dae1..8e05289 100644 --- a/control_plane.h +++ b/control_plane.h @@ -23,13 +23,14 @@ struct control_plane; struct options; struct countdown_cond; struct neper_fn; +struct thread; struct control_plane* control_plane_create(struct options *opts, struct callbacks *cb, struct countdown_cond *data_pending, const struct neper_fn *fn); void control_plane_start(struct control_plane *cp, struct addrinfo **ai); -void control_plane_wait_until_done(struct control_plane *cp); +void control_plane_wait_until_done(struct control_plane *cp, struct thread *t); void control_plane_stop(struct control_plane *cp); int control_plane_incidents(struct control_plane *cp); void control_plane_destroy(struct control_plane *cp); diff --git a/define_all_flags.c b/define_all_flags.c index 3f55672..e39325a 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -55,6 +55,7 @@ struct flags_parser *add_flags_common(struct flags_parser *fp) DEFINE_FLAG_HAS_OPTIONAL_ARGUMENT(fp, all_samples); DEFINE_FLAG_PARSER(fp, all_samples, parse_all_samples); DEFINE_FLAG(fp, bool, time_wait, false, 0, "Do not set SO_LINGER 0. Close gracefully. 
Active peer will enter TIME_WAIT state"); + DEFINE_FLAG(fp, unsigned long, iostat_ms, 0, 0, "Print io stats snapshot every this many ms"); /* Return the updated fp */ return (fp); diff --git a/lib.h b/lib.h index 01008da..e769e58 100644 --- a/lib.h +++ b/lib.h @@ -101,6 +101,7 @@ struct options { const char *control_port; const char *port; int source_port; + unsigned long iostat_ms; const char *all_samples; const char secret[32]; /* includes test name */ bool async_connect; diff --git a/rr.c b/rr.c index 00d47c9..3a7483c 100644 --- a/rr.c +++ b/rr.c @@ -79,6 +79,10 @@ static void crr_client_state_0(struct flow *, uint32_t); static ssize_t rr_fn_send(struct flow *f, const char *buf, size_t len, int flags) { + struct thread *t = flow_thread(f); + + t->io_stats.tx_ops++; + t->io_stats.tx_bytes += len; return send(flow_fd(f), buf, len, flags); } @@ -86,21 +90,37 @@ static ssize_t rr_fn_sendto(struct flow *f, const char *buf, size_t len, int flags) { const struct rr_state *rr = flow_opaque(f); + struct thread *t = flow_thread(f); + + t->io_stats.tx_ops++; + t->io_stats.tx_bytes += len; return sendto(flow_fd(f), buf, len, flags, (void *)&rr->rr_peer, rr->rr_peerlen); } static ssize_t rr_fn_recv(struct flow *f, char *buf, size_t len) { - return recv(flow_fd(f), buf, len, 0); + struct thread *t = flow_thread(f); + ssize_t ret; + + ret = recv(flow_fd(f), buf, len, 0); + t->io_stats.rx_ops++; + t->io_stats.rx_bytes += ret > 0 ? ret : 0; + return ret; } static ssize_t rr_fn_recvfrom(struct flow *f, char *buf, size_t len) { struct rr_state *rr = flow_opaque(f); + struct thread *t = flow_thread(f); + ssize_t ret; + rr->rr_peerlen = sizeof(struct sockaddr_storage); - return recvfrom(flow_fd(f), buf, len, 0, (void *)&rr->rr_peer, - &rr->rr_peerlen); + ret = recvfrom(flow_fd(f), buf, len, 0, (void *)&rr->rr_peer, + &rr->rr_peerlen); + t->io_stats.rx_ops++; + t->io_stats.rx_bytes += ret > 0 ? ret : 0; + return ret; } /* Allocate a message buffer for a rr flow. 
*/ diff --git a/stream.c b/stream.c index 6dfdc37..8d42f07 100644 --- a/stream.c +++ b/stream.c @@ -128,6 +128,8 @@ void stream_handler(struct flow *f, uint32_t events) n = recv(fd, mbuf, opts->buffer_size, opts->recv_flags); } while(n == -1 && errno == EINTR); + t->io_stats.rx_ops++; + t->io_stats.rx_bytes += n > 0 ? n : 0; if (n == -1) { if (errno != EAGAIN) PLOG_ERROR(t->cb, "read"); @@ -157,6 +159,8 @@ void stream_handler(struct flow *f, uint32_t events) } else #endif /* WITH_TCPDEVMEM_UDMABUF */ n = send(fd, mbuf, opts->buffer_size, opts->send_flags); + t->io_stats.tx_ops++; + t->io_stats.tx_bytes += n > 0 ? n : 0; if (n == -1) { if (errno != EAGAIN) PLOG_ERROR(t->cb, "send"); diff --git a/thread.c b/thread.c index 0a37a7b..163d514 100644 --- a/thread.c +++ b/thread.c @@ -581,7 +581,7 @@ int run_main_thread(struct options *opts, struct callbacks *cb, pthread_mutex_lock(&time_start_mutex); getrusage_enhanced(RUSAGE_SELF, &rusage_start); /* rusage start! */ pthread_mutex_unlock(&time_start_mutex); - control_plane_wait_until_done(cp); + control_plane_wait_until_done(cp, ts); getrusage_enhanced(RUSAGE_SELF, &rusage_end); /* rusage end! 
*/ stop_worker_threads(cb, opts->num_threads, ts, &ready_barrier, diff --git a/thread.h b/thread.h index 65e40ae..c3f1af4 100644 --- a/thread.h +++ b/thread.h @@ -69,6 +69,14 @@ struct rate_limit { struct flow **pending_flows; /* size is flow_count */ }; +/* Store per-thread io stats */ +struct io_stats { + uint64_t tx_ops; + uint64_t tx_bytes; + uint64_t rx_ops; + uint64_t rx_bytes; +}; + struct thread { int index; pthread_t id; @@ -98,6 +106,7 @@ struct thread { struct neper_histo_factory *histo_factory; struct neper_stats *stats; struct neper_rusage *rusage; + struct io_stats io_stats; struct countdown_cond *data_pending; struct rate_limit rl; struct flow **flows; /* indexed by flow_id(flow) */ From a27d552541a4e2a91a19ca535434b8df6def657d Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 30 Nov 2023 17:49:54 +0100 Subject: [PATCH 56/72] neper: use one socket per flow in bidirectional stream mode In bidirectional mode, acks are piggybacked behind data and this creates unwanted dependencies between forward and reverse flows. To solve the problem, IN BIDIRECTIONAL STREAM MODE ONLY we use one tcp socket per direction (the user-specified number of flows is doubled after option parsing), used as follows: - client and server always read from all sockets - client sends only on half of the sockets (those with even f_id). This is done by disabling EPOLLOUT on alternate sockets. - server starts sending on all sockets, but will stop sending and disable EPOLLOUT on sockets on which data is received. This is done in stream_handler() The above allows to have half of the sockets in tx, and half in rx, without control plane modifications. For backward compatibility, this is controlled by the --split-bidir command line option which implies -rw on both sides.
Tested: manual test with --split-bidir and different '-m' values on client and server --- define_all_flags.c | 1 + flow.c | 23 ++++++++++++++++++++++- lib.h | 1 + psp_stream_main.c | 10 +++++++++- stream.c | 12 ++++++++++++ tcp_stream_main.c | 5 +++++ 6 files changed, 50 insertions(+), 2 deletions(-) diff --git a/define_all_flags.c b/define_all_flags.c index e39325a..14fb096 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -140,6 +140,7 @@ struct flags_parser *add_flags_tcp_stream(struct flags_parser *fp) DEFINE_FLAG(fp, bool, skip_rx_copy, false, 0, "Skip kernel->user payload copy on receives"); DEFINE_FLAG(fp, bool, enable_read, false, 'r', "Read from flows? enabled by default for the server"); DEFINE_FLAG(fp, bool, enable_write, false, 'w', "Write to flows? Enabled by default for the client"); + DEFINE_FLAG(fp, bool, split_bidir , false, 0, "Bidirectional using separate tx/rx sockets"); DEFINE_FLAG(fp, bool, enable_tcp_maerts, false, 'M', "Enables TCP_MAERTS test (server writes and client reads). It overrides enable_read, and enable_write"); DEFINE_FLAG(fp, bool, async_connect, false, 0, "use non blocking connect"); #if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) diff --git a/flow.c b/flow.c index 222bc22..0bda105 100644 --- a/flow.c +++ b/flow.c @@ -127,6 +127,7 @@ void flow_create(const struct flow_create_args *args) { struct thread *t = args->thread; struct flow *f = calloc_or_die(1, sizeof(struct flow), t->cb); + int events = args->events; /* must be overriden in some cases */ f->f_thread = t; f->f_opaque = args->opaque; @@ -145,7 +146,27 @@ void flow_create(const struct flow_create_args *args) thread_store_flow_or_die(t, f); } } - flow_ctl(f, EPOLL_CTL_ADD, args->handler, args->events, true); + /* In bidirectional mode, acks are piggybacked behind data and this + * creates unwanted dependencies between forward and reverse flows. 
+ * + * To solve the problem, IN BIDIRECTIONAL STREAM MODE ONLY we use + * one tcp socket per direction (the user-specified number of flows + * is doubled after option parsing), used as follows: + * - client and server always read from all sockets + * - client sends only on half of the sockets (those with even f_id). + * This is done by disabling EPOLLOUT on alternate sockets, below. + * - server starts sending on all sockets, but will stop sending and + * disable EPOLLOUT on sockets on which data is received. + * This is done in stream_handler. + * The above allows to have half of the sockets in tx, and half in rx, + * without control plane modifications. + * For backward compatibility reasons, this is controlled by a + * command-line option, --split-bidir + */ + if (t->opts->split_bidir && t->opts->client) + events &= (f->f_id & 1) ? EPOLLOUT : EPOLLIN; + + flow_ctl(f, EPOLL_CTL_ADD, args->handler, events, true); } /* Returns true if the deadline for the flow has expired. diff --git a/lib.h b/lib.h index e769e58..b0d0c61 100644 --- a/lib.h +++ b/lib.h @@ -119,6 +119,7 @@ struct options { int queue_start; int queue_num; #endif /* WITH_TCPDEVMEM_CUDA || WITH_TCPDEVMEM_UDMABUF */ + bool split_bidir; /* implies enable_read, enable_write, split rx/tx */ bool enable_read; bool enable_write; bool enable_tcp_maerts; diff --git a/psp_stream_main.c b/psp_stream_main.c index 9ef8df9..0423853 100644 --- a/psp_stream_main.c +++ b/psp_stream_main.c @@ -50,6 +50,14 @@ int main(int argc, char **argv) else opts.enable_read = true; } + if (opts.split_bidir) { + opts.enable_read = true; + opts.enable_write = true; + opts.num_flows *= 2; + } + + if (opts.enable_read && opts.enable_write) + opts.num_flows *= 2; if (opts.skip_rx_copy) opts.recv_flags = MSG_TRUNC; @@ -79,4 +87,4 @@ int main(int argc, char **argv) exit: logging_exit(&cb); return exit_code; -} \ No newline at end of file +} diff --git a/stream.c b/stream.c index 8d42f07..6cb9a69 100644 --- a/stream.c +++ b/stream.c @@ 
-192,6 +192,18 @@ void stream_handler(struct flow *f, uint32_t events) * e.g. Linux kernel tools/testing/selftests/net/msg_zerocopy.c */ } + if (opts->split_bidir && !opts->client && + events & EPOLLIN && events & EPOLLOUT) { + /* See comments in flow.c on bidirectional traffic: + * we use one socket per direction, incoming data means + * this socket is used for client writes and the server should + * stop writing there. This is meant to be called only once; + * leaving only EPOLLIN prevents this from being called again + * without having to store extra state. + */ + flow_mod(f, stream_handler, EPOLLIN, true); + } + } int stream_report(struct thread *ts) diff --git a/tcp_stream_main.c b/tcp_stream_main.c index 7dc5a1e..8764a10 100644 --- a/tcp_stream_main.c +++ b/tcp_stream_main.c @@ -50,6 +50,11 @@ int main(int argc, char **argv) else opts.enable_read = true; } + if (opts.split_bidir) { + opts.enable_read = true; + opts.enable_write = true; + opts.num_flows *= 2; + } if (opts.skip_rx_copy) opts.recv_flags = MSG_TRUNC; From 8c5f1cb0343d628b7576c7f9e9fe113fc851b3f4 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 12 Dec 2023 16:39:38 +0000 Subject: [PATCH 57/72] explicit cast from sockaddr_{in,in6} to sockaddr fix error when -Wincompatible-pointer-types is included --- psp_lib.c | 8 ++++---- socket.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/psp_lib.c b/psp_lib.c index 2945a1e..49c7e5e 100644 --- a/psp_lib.c +++ b/psp_lib.c @@ -44,7 +44,7 @@ void psp_ctrl_client(int ctrl_conn, struct callbacks *cb) { struct sockaddr_in6 kmaddr; socklen_t kmaddrlen = sizeof(kmaddr); - if (getpeername(ctrl_conn, &kmaddr, &kmaddrlen) < 0) { + if (getpeername(ctrl_conn, (struct sockaddr *)&kmaddr, &kmaddrlen) < 0) { LOG_FATAL(cb, "Can't get peer address: %s", strerror(errno)); } kmaddr.sin6_port = htons(port); @@ -54,7 +54,7 @@ void psp_ctrl_client(int ctrl_conn, struct callbacks *cb) { if (kmfd < 0) { LOG_FATAL(cb, "Can't create km client socket: %s",
strerror(errno)); } - if (connect(kmfd, &kmaddr, sizeof(kmaddr)) < 0) { + if (connect(kmfd, (const struct sockaddr *)&kmaddr, sizeof(kmaddr)) < 0) { LOG_FATAL(cb, "Can't connect km client socket: %s", strerror(errno)); } LOG_INFO(cb, "Connected to km socket"); @@ -100,14 +100,14 @@ static void *psp_key_server(void *arg) if (setsockopt(kmlfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) < 0) { LOG_FATAL(cb, "Can't set SO_REUSEADDR on listen socket: %s", strerror(errno)); } - if (bind(kmlfd, &kmaddr, sizeof(kmaddr)) < 0) { + if (bind(kmlfd, (const struct sockaddr *)&kmaddr, sizeof(kmaddr)) < 0) { LOG_FATAL(cb, "Can't bind listen socket: %s", strerror(errno)); } if (listen(kmlfd, 5) < 0) { LOG_FATAL(cb, "Can't listen on listen socket: %s", strerror(errno)); } LOG_INFO(cb, "Waiting for connection on listen socket"); - kmfd = accept(kmlfd, &acceptaddr, &acceptaddrlen); + kmfd = accept(kmlfd, (struct sockaddr *)&acceptaddr, &acceptaddrlen); if (kmfd < 0) { LOG_FATAL(cb, "Can't accept on listen socket: %s", strerror(errno)); } diff --git a/socket.c b/socket.c index f5e3688..7732c52 100644 --- a/socket.c +++ b/socket.c @@ -353,7 +353,7 @@ int socket_connect_one(struct thread *t, int flags) source.sin_family = AF_INET; source.sin_addr.s_addr = INADDR_ANY; source.sin_port = htons(port); - if (bind(s, &source, sizeof(source))) { + if (bind(s, (const struct sockaddr *)&source, sizeof(source))) { PLOG_FATAL(t->cb, "bind for source port"); } } else { @@ -362,7 +362,7 @@ int socket_connect_one(struct thread *t, int flags) source.sin6_family = AF_INET6; source.sin6_addr = in6addr_any; source.sin6_port = htons(port); - if (bind(s, &source, sizeof(source))) { + if (bind(s, (const struct sockaddr *)&source, sizeof(source))) { PLOG_FATAL(t->cb, "bind for source port"); } } From fbd2fb56b393e98e21b5c63f57e2132bf5fda24d Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Sat, 6 Jan 2024 16:59:07 +0000 Subject: [PATCH 58/72] use neper in a container image Signed-off-by: Antonio Ojea --- 
Dockerfile | 20 ++++++++++++++++++++ Makefile | 6 ++++++ 2 files changed, 26 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9ad64fe --- /dev/null +++ b/Dockerfile @@ -0,0 +1,20 @@ +FROM debian:12 +WORKDIR /src +RUN apt-get update && apt-get install build-essential -y +COPY . . +RUN make clean && make all + +# debug images contain a busybox shell +FROM gcr.io/distroless/cc-debian12:debug +WORKDIR /home +COPY --from=0 /src/tcp_rr /bin/tcp_rr +COPY --from=0 /src/tcp_stream /bin/tcp_stream +COPY --from=0 /src/tcp_crr /bin/tcp_crr +COPY --from=0 /src/udp_rr /bin/udp_rr +COPY --from=0 /src/udp_stream /bin/udp_stream +COPY --from=0 /src/psp_stream /bin/psp_stream +COPY --from=0 /src/psp_crr /bin/psp_crr +COPY --from=0 /src/psp_rr /bin/psp_rr +# useful to deploy the image as a Kubernetes Pod +# as it keeps the image running forever +ENTRYPOINT [ "/busybox/sleep", "infinity" ] diff --git a/Makefile b/Makefile index 855611d..c6e03b4 100644 --- a/Makefile +++ b/Makefile @@ -122,3 +122,9 @@ binaries: tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr clean: rm -f *.o tcp_rr tcp_stream tcp_crr udp_rr udp_stream psp_stream psp_crr psp_rr + +IMAGE ?= neper +TAG ?= $(shell git describe --tags --always --dirty) + +image: + docker build --tag ${IMAGE}:${TAG} . 
From 3606589acac22e002b577185c21d0d269449084d Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Fri, 5 Jan 2024 15:17:42 +0000 Subject: [PATCH 59/72] numlist: remove unused component There seems to be no use for the numlist object --- Makefile | 1 - numlist.c | 189 ------------------------------------------------------ numlist.h | 40 ------------ 3 files changed, 230 deletions(-) delete mode 100644 numlist.c delete mode 100644 numlist.h diff --git a/Makefile b/Makefile index c6e03b4..5ce9611 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,6 @@ lib := \ histo.o \ logging.o \ loop.o \ - numlist.o \ or_die.o \ parse.o \ percentiles.o \ diff --git a/numlist.c b/numlist.c deleted file mode 100644 index 194a87c..0000000 --- a/numlist.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "numlist.h" -#include -#include -#include -#include -#include "lib.h" -#include "logging.h" - -#define MEMBLOCK_SIZE 500 - -struct memblock { - size_t size; - struct memblock *next; - double data[MEMBLOCK_SIZE]; -}; - -struct numlist { - struct callbacks *cb; - struct memblock *head; -}; - -static void prepend_memblock(struct numlist *lst) -{ - struct memblock *blk; - - blk = malloc(sizeof(struct memblock)); - if (!blk) - PLOG_FATAL(lst->cb, "unable to allocate memblock"); - blk->size = 0; - blk->next = lst->head; - lst->head = blk; -} - -struct numlist *numlist_create(struct callbacks *cb) -{ - struct numlist *lst; - - lst = malloc(sizeof(struct numlist)); - if (!lst) - PLOG_FATAL(cb, "unable to allocate numlist"); - lst->cb = cb; - lst->head = NULL; - prepend_memblock(lst); - return lst; -} - -void numlist_destroy(struct numlist *lst) -{ - struct memblock *block, *next; - - block = lst->head; - while (block) { - next = block->next; - free(block); - block = next; - } - free(lst); -} - -void numlist_add(struct numlist *lst, double val) -{ - if (lst->head->size == MEMBLOCK_SIZE) - prepend_memblock(lst); - lst->head->data[lst->head->size++] = val; -} - -void numlist_concat(struct numlist *lst, struct numlist *tail) -{ - struct memblock *blk = lst->head; - - while (blk->next) - blk = blk->next; - blk->next = tail->head; - tail->head = NULL; -} - -#define for_each_memblock(blk, lst) \ - for (blk = (lst)->head; blk; blk = blk->next) - -#define for_each_number(n, blk) \ - for (n = (blk)->data; n < (blk)->data + blk->size; n++) - -#define for_each(n, blk, lst) \ - for_each_memblock(blk, lst) for_each_number(n, blk) - -size_t numlist_size(struct numlist *lst) -{ - struct memblock *blk; - size_t size = 0; - double *n; - - for_each(n, blk, lst) - size++; - return size; -} - -double numlist_min(struct numlist *lst) -{ - double min = INFINITY, *n; - struct memblock *blk; - - for_each(n, blk, lst) { - if (*n < min) - min = *n; - } - return min; -} - -double 
numlist_max(struct numlist *lst) -{ - double max = -INFINITY, *n; - struct memblock *blk; - - for_each(n, blk, lst) { - if (*n > max) - max = *n; - } - return max; -} - -double numlist_mean(struct numlist *lst) -{ - double sum = 0, cnt = 0, *n; - struct memblock *blk; - - for_each(n, blk, lst) { - sum += *n; - cnt++; - } - return sum / cnt; -} - -double numlist_stddev(struct numlist *lst) -{ - double sum = 0, cnt = 0, mean, *n; - struct memblock *blk; - - mean = numlist_mean(lst); - for_each(n, blk, lst) { - sum += (*n - mean) * (*n - mean); - cnt++; - } - return sqrt(sum / cnt); -} - -static int compare_doubles(const void *a, const void *b) -{ - const double x = *(const double *)a, y = *(const double *)b; - - if (x < y) - return -1; - if (x > y) - return 1; - return 0; -} - -double numlist_percentile(struct numlist *lst, int percentile) -{ - double *values, *n, result; - struct memblock *blk; - size_t size, i = 0; - - size = numlist_size(lst); - if (size == 0) - return NAN; - values = malloc(sizeof(double) * size); - for_each(n, blk, lst) - values[i++] = *n; - qsort(values, size, sizeof(double), compare_doubles); - result = values[(size - 1) * percentile / 100]; - free(values); - return result; -} diff --git a/numlist.h b/numlist.h deleted file mode 100644 index ed47b26..0000000 --- a/numlist.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright 2016 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef NEPER_NUMLIST_H -#define NEPER_NUMLIST_H - -#include - -struct callbacks; -struct numlist; - -struct numlist *numlist_create(struct callbacks *cb); -void numlist_destroy(struct numlist *lst); -void numlist_add(struct numlist *lst, double val); -/** - * The numbers in @tail are all moved to @lst. - * @tail will become empty after this operation. - */ -void numlist_concat(struct numlist *lst, struct numlist *tail); -size_t numlist_size(struct numlist *lst); -double numlist_min(struct numlist *lst); -double numlist_max(struct numlist *lst); -double numlist_mean(struct numlist *lst); -double numlist_stddev(struct numlist *lst); -double numlist_percentile(struct numlist *lst, int percentile); - -#endif From 60b0af76c23842d8cac0f7004b9e0bf568fdc716 Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 4 Jan 2024 07:41:25 +0000 Subject: [PATCH 60/72] rr: remove incorrect division by MILLION csv printing percentiles in CSV file were incorrectly divided by MILLION, resulting in mostly 0 values. Remove the divisor. Probably the feature was never used, otherwise it would have been noticed. 
--- rr.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/rr.c b/rr.c index 3a7483c..4b35722 100644 --- a/rr.c +++ b/rr.c @@ -39,8 +39,6 @@ #define NEPER_EPOLL_MASK (EPOLLHUP | EPOLLRDHUP | EPOLLERR) -static const int MILLION = 1000000; - typedef ssize_t (*rr_send_t)(struct flow *, const char *, size_t, int); typedef ssize_t (*rr_recv_t)(struct flow *, char *, size_t); @@ -480,8 +478,7 @@ static void rr_print_snap(struct thread *t, int flow_index, const struct rr_snap_opaque *rso = (void *)&snap->opaque; fprintf(csv, ",%f,%f,%f,%f", - rso->min / MILLION, rso->mean / MILLION, - rso->max / MILLION, rso->stddev / MILLION); + rso->min, rso->mean, rso->max, rso->stddev); if (t->percentiles) { const struct options *opts = t->opts; @@ -490,7 +487,7 @@ static void rr_print_snap(struct thread *t, int flow_index, for (i = 0; i < PER_INDEX_COUNT; i++) if (percentiles_chosen(&opts->percentiles, i)) fprintf(csv, ",%f", - rso->percentile[j++] / MILLION); + rso->percentile[j++]); } fprintf(csv, "\n"); From b3007ef891806ffe28b6782d0247e9a3e776b229 Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 4 Jan 2024 08:28:09 +0000 Subject: [PATCH 61/72] histo: de-virtualize histogram methods. No functional change. histogram methods were implemented as virtual functions, but since there is only one possible implementation this was overkill. Simplify the code by exposing the actual methods. The implementation still remains opaque. No functional changes. Tested with ./tcp_rr -c -H 127.0.0.1 -p1,2,10,50,90,999.9999,100 -A/tmp/x.csv -l 4 and verified that the csv file has the correct data. 
(histograms are only exercised in rr tests) --- histo.c | 147 +++++++++++++------------------------------------------ histo.h | 71 +++++++++++++++------------ rr.c | 29 ++++++----- stats.c | 2 +- thread.c | 4 +- 5 files changed, 91 insertions(+), 162 deletions(-) diff --git a/histo.c b/histo.c index 0ca302f..d88fd83 100644 --- a/histo.c +++ b/histo.c @@ -23,9 +23,7 @@ // use 0.01 us time resolution static const int TIME_RESOLUTION = 100 * 1000000; -struct histo_impl { - struct neper_histo histo; - +struct neper_histo { const struct thread *thread; int num_buckets; /* # of buckets allocated */ @@ -60,55 +58,26 @@ struct histo_impl { bool first_all; /* Is this the first call to all_percent() */ }; -struct histo_factory_impl { - struct neper_histo_factory factory; - +struct neper_histo_factory { const struct thread *thread; int num_buckets; /* # of buckets allocated */ int *ceil; /* Max value that can be hashed into each bucket */ }; -static double histo_all_min(const struct neper_histo *histo) -{ - const struct histo_impl *impl = (void *)histo; - - return impl->all_min; -} - -static double histo_one_min(const struct neper_histo *histo) -{ - const struct histo_impl *impl = (void *)histo; - - return impl->one_min; -} - -static double histo_all_max(const struct neper_histo *histo) -{ - const struct histo_impl *impl = (void *)histo; - - return impl->all_max; -} - -static double histo_one_max(const struct neper_histo *histo) +double neper_histo_min(const struct neper_histo *histo) { - const struct histo_impl *impl = (void *)histo; - - return impl->one_max; + return histo->one_min; } -static double histo_all_mean(const struct neper_histo *histo) +double neper_histo_max(const struct neper_histo *histo) { - const struct histo_impl *impl = (void *)histo; - - return impl->all_sum / impl->all_count; + return histo->one_max; } -static double histo_one_mean(const struct neper_histo *histo) +double neper_histo_mean(const struct neper_histo *histo) { - const struct histo_impl *impl 
= (void *)histo; - - return impl->one_sum / impl->one_count; + return histo->one_sum / histo->one_count; } static double histo_stddev(long double N, long double S, long double Q) @@ -116,21 +85,12 @@ static double histo_stddev(long double N, long double S, long double Q) return sqrt(N*Q - S*S) / N; } -static double histo_all_stddev(const struct neper_histo *histo) -{ - struct histo_impl *impl = (void *)histo; - - return histo_stddev(impl->all_count, impl->all_sum, impl->all_sum2); -} - -static double histo_one_stddev(const struct neper_histo *histo) +double neper_histo_stddev(const struct neper_histo *histo) { - struct histo_impl *impl = (void *)histo; - - return histo_stddev(impl->one_count, impl->one_sum, impl->one_sum2); + return histo_stddev(histo->one_count, histo->one_sum, histo->one_sum2); } -static void histo_all_finalize(struct histo_impl *impl) +static void histo_all_finalize(struct neper_histo *impl) { double cent = impl->all_count / 100.0; double nnn = (impl->all_count * 99.9) / 100.0; @@ -164,7 +124,7 @@ static void histo_all_finalize(struct histo_impl *impl) } } -static void histo_one_finalize(struct histo_impl *impl) +static void histo_one_finalize(struct neper_histo *impl) { double cent = impl->one_count / 100.0; double nnn = (impl->one_count * 99.9) / 100.0; @@ -197,10 +157,8 @@ static void histo_one_finalize(struct histo_impl *impl) } } -static double histo_all_percent(struct neper_histo *histo, int percentage) +static double histo_all_percent(struct neper_histo *impl, int percentage) { - struct histo_impl *impl = (void *)histo; - histo_all_finalize(impl); switch (percentage) { @@ -220,10 +178,8 @@ static double histo_all_percent(struct neper_histo *histo, int percentage) } } -static double histo_one_percent(const struct neper_histo *histo, int percentage) +double neper_histo_percent(const struct neper_histo *impl, int percentage) { - struct histo_impl *impl = (void *)histo; - switch (percentage) { case 0: return impl->one_min; @@ -241,18 +197,13 
@@ static double histo_one_percent(const struct neper_histo *histo, int percentage) } } -static uint64_t histo_events(const struct neper_histo *histo) +uint64_t neper_histo_samples(const struct neper_histo *histo) { - struct histo_impl *impl = (void *)histo; - - return impl->all_count; + return histo->all_count; } -static void histo_add(struct neper_histo *des, const struct neper_histo *src) +void neper_histo_add(struct neper_histo *desi, const struct neper_histo *srci) { - struct histo_impl *desi = (void *)des; - const struct histo_impl *srci = (void *)src; - desi->cur_count += srci->all_count; desi->cur_sum += srci->all_sum; desi->cur_sum2 += srci->all_sum2; @@ -266,7 +217,7 @@ static void histo_add(struct neper_histo *des, const struct neper_histo *src) } // binary search for the correct bucket index -static int histo_find_bucket_idx(struct histo_impl *impl, int ticks) +static int histo_find_bucket_idx(struct neper_histo *impl, int ticks) { int l_idx = 0; int r_idx = impl->num_buckets - 1; @@ -291,9 +242,8 @@ static int histo_find_bucket_idx(struct histo_impl *impl, int ticks) return -1; } -static void histo_event(struct neper_histo *histo, double delta_s) +void neper_histo_event(struct neper_histo *impl, double delta_s) { - struct histo_impl *impl = (void *)histo; int ticks = delta_s * TIME_RESOLUTION; int i; @@ -314,10 +264,8 @@ static void histo_event(struct neper_histo *histo, double delta_s) impl->cur_buckets[i]++; } -static void histo_epoch(struct neper_histo *histo) +void neper_histo_epoch(struct neper_histo *impl) { - struct histo_impl *impl = (void *)histo; - impl->all_count += impl->cur_count; impl->one_count = impl->cur_count; impl->cur_count = 0; @@ -370,16 +318,16 @@ static void histo_hash(int num_buckets, double growth, int *ceils) } } -static void histo_print(struct neper_histo *histo) +void neper_histo_print(struct neper_histo *histo) { - struct histo_impl *impl = (void *)histo; - const struct thread *t = impl->thread; + const struct thread *t = 
histo->thread; const struct options *opts = t->opts; - PRINT(t->cb, "latency_min", "%.9f", histo_all_min(histo)); - PRINT(t->cb, "latency_max", "%.9f", histo_all_max(histo)); - PRINT(t->cb, "latency_mean", "%.9f", histo_all_mean(histo)); - PRINT(t->cb, "latency_stddev", "%.9f", histo_all_stddev(histo)); + PRINT(t->cb, "latency_min", "%.9f", histo->all_min); + PRINT(t->cb, "latency_max", "%.9f", histo->all_max); + PRINT(t->cb, "latency_mean", "%.9f", histo->all_sum / histo->all_count); + PRINT(t->cb, "latency_stddev", "%.9f", + histo_stddev(histo->all_count, histo->all_sum, histo->all_sum2)); int i; for (i = 0; i < 100; i++) @@ -396,10 +344,8 @@ static void histo_print(struct neper_histo *histo) histo_all_percent(histo, PER_INDEX_99_99)); } -static void histo_fini(struct neper_histo *histo) +void neper_histo_delete(struct neper_histo *impl) { - struct histo_impl *impl = (void *)histo; - if (impl) { free(impl->all_buckets); free(impl->cur_buckets); @@ -408,26 +354,10 @@ static void histo_fini(struct neper_histo *histo) } } -static struct neper_histo *neper_histo_factory_create( - const struct neper_histo_factory *factory) +struct neper_histo *neper_histo_new( const struct neper_histo_factory *fimpl) { - const struct histo_factory_impl *fimpl = (void *)factory; - struct histo_impl *impl = calloc(1, sizeof(struct histo_impl)); - struct neper_histo *histo = &impl->histo; - - histo->min = histo_one_min; - histo->max = histo_one_max; - histo->mean = histo_one_mean; - histo->stddev = histo_one_stddev; - histo->percent = histo_one_percent; - histo->events = histo_events; - - histo->add = histo_add; - histo->event = histo_event; - histo->epoch = histo_epoch; - histo->print = histo_print; - histo->fini = histo_fini; + struct neper_histo *impl = calloc(1, sizeof(*impl)); impl->thread = fimpl->thread; impl->num_buckets = fimpl->num_buckets; @@ -441,28 +371,21 @@ static struct neper_histo *neper_histo_factory_create( impl->first_all = true; - return histo; + return impl; } -void 
neper_histo_factory_fini(struct neper_histo_factory *factory) +void neper_histo_factory_delete(struct neper_histo_factory *impl) { - struct histo_factory_impl *impl = (void *)factory; - if (impl) { free(impl->ceil); free(impl); } } -struct neper_histo_factory *neper_histo_factory(const struct thread *t, +struct neper_histo_factory *neper_histo_factory_new(const struct thread *t, int num_buckets, double growth) { - struct histo_factory_impl *impl = - calloc(1, sizeof(struct histo_factory_impl)); - struct neper_histo_factory *factory = &impl->factory; - - factory->create = neper_histo_factory_create; - factory->fini = neper_histo_factory_fini; + struct neper_histo_factory *impl = calloc(1, sizeof(*impl)); impl->thread = t; impl->num_buckets = num_buckets; @@ -470,5 +393,5 @@ struct neper_histo_factory *neper_histo_factory(const struct thread *t, histo_hash(num_buckets, growth, impl->ceil); - return factory; + return impl; } diff --git a/histo.h b/histo.h index 571d80d..2b78d5e 100644 --- a/histo.h +++ b/histo.h @@ -23,54 +23,61 @@ struct thread; /* * A simple histogram API for tracking a series of latency measurements: - * - * min() Returns the min of the previous sampling epoch. - * max() Returns the max of the previous sampling epoch. - * mean() Returns the mean of the previous sampling epoch. - * stddev() Returns the stddev of the previous sampling epoch. - * percent() Returns the percent of the previous sampling epoch. - * add() Adds one histogram to the current epoch of another. - * event() Adds a new event to the current sampling epoch. - * events() Returns the event total across all sampling epochs. - * epoch() Commits the current sample set and begins a new one. - * print() Prints the results. - * fini() Deallocates the object. * * An 'event' is a single measurement. * An 'epoch' is all events collected within some time interval. * - * So, typical usage is to call histo->event() many times and histo->epoch() - * perhaps every second or so. 
+ * Typical usage is to call neper_histo_event() many times and + * neper_histo_epoch() perhaps every second or so. */ -struct neper_histo { - uint64_t (*events)(const struct neper_histo *); +struct neper_histo_factory; +struct neper_histo; - double (*min)(const struct neper_histo *); - double (*max)(const struct neper_histo *); - double (*mean)(const struct neper_histo *); - double (*stddev)(const struct neper_histo *); - double (*percent)(const struct neper_histo *, int percentage); +/* Create a new collector */ +struct neper_histo *neper_histo_new(const struct neper_histo_factory *); - void (*add)(struct neper_histo *des, const struct neper_histo *src); +/* Returns the min of the previous sampling epoch. */ +double neper_histo_min(const struct neper_histo *); - void (*event)(struct neper_histo *, double delta_s); - void (*epoch)(struct neper_histo *); - void (*print)(struct neper_histo *); - void (*fini)(struct neper_histo *); -}; +/* Returns the max of the previous sampling epoch. */ +double neper_histo_max(const struct neper_histo *); + +/* Returns the mean of the previous sampling epoch. */ +double neper_histo_mean(const struct neper_histo *); + +/* Returns the stddev of the previous sampling epoch. */ +double neper_histo_stddev(const struct neper_histo *); + +/* Returns the percent of the previous sampling epoch. */ +double neper_histo_percent(const struct neper_histo *, int percentage); + +/* Adds one histogram to the current epoch of another. */ +void neper_histo_add(struct neper_histo *des, const struct neper_histo *src); + +/* Adds a new event to the current sampling epoch. */ +void neper_histo_event(struct neper_histo *, double delta_s); + +/* Returns the event total across all sampling epochs. */ +uint64_t neper_histo_samples(const struct neper_histo *); + +/* Commits the current sample set and begins a new one. 
*/ +void neper_histo_epoch(struct neper_histo *); + +/* Prints the results */ +void neper_histo_print(struct neper_histo *); + +/* Destroy the object */ +void neper_histo_delete(struct neper_histo *); /* * We use a factory to create histo objects so they can all share one set of * common lookup tables, saving a great deal of memory. */ -struct neper_histo_factory { - struct neper_histo *(*create)(const struct neper_histo_factory *); - void (*fini)(struct neper_histo_factory *); -}; +void neper_histo_factory_delete(struct neper_histo_factory *); -struct neper_histo_factory *neper_histo_factory(const struct thread *, +struct neper_histo_factory *neper_histo_factory_new(const struct thread *, int size, double growth); diff --git a/rr.c b/rr.c index 4b35722..00261e8 100644 --- a/rr.c +++ b/rr.c @@ -144,7 +144,7 @@ static struct neper_stat *rr_latency_init(struct flow *f) if (t->opts->nostats) return NULL; - struct neper_histo *histo = t->histo_factory->create(t->histo_factory); + struct neper_histo *histo = neper_histo_new(t->histo_factory); size = sizeof(struct rr_snap_opaque) + t->percentiles * sizeof(double); @@ -298,21 +298,21 @@ static void rr_snapshot(struct thread *t, struct neper_stat *stat, { struct neper_histo *histo = stat->histo(stat); - histo->epoch(histo); + neper_histo_epoch(histo); struct rr_snap_opaque *opaque = (void *)&snap->opaque; - opaque->min = histo->min(histo); - opaque->max = histo->max(histo); - opaque->mean = histo->mean(histo); - opaque->stddev = histo->stddev(histo); + opaque->min = neper_histo_min(histo); + opaque->max = neper_histo_max(histo); + opaque->mean = neper_histo_mean(histo); + opaque->stddev = neper_histo_stddev(histo); if (t->percentiles) { int i, j = 0; for (i = 0; i < PER_INDEX_COUNT; i++) if (percentiles_chosen(&t->opts->percentiles, i)) opaque->percentile[j++] = - histo->percent(histo, i); + neper_histo_percent(histo, i); } } @@ -326,7 +326,7 @@ static bool rr_do_compl(struct flow *f, struct neper_stat *stat = 
flow_stat(f); struct neper_histo *histo = stat->histo(stat); - histo->event(histo, elapsed); + neper_histo_event(histo, elapsed); if (t->data_pending) { /* data vs time mode, last rr? */ @@ -411,7 +411,7 @@ static void rr_server_state_2(struct flow *f, uint32_t events) if (rr_do_send(f, events, rr->rr_send)) { if (stat) { /* rr server has no meaningful latency to measure. */ - histo->event(histo, 0.0); + neper_histo_event(histo, 0.0); stat->event(t, stat, 1, false, rr_snapshot); } flow_mod(f, rr_server_state_0, EPOLLIN, false); @@ -499,7 +499,7 @@ fn_add(struct neper_stat *stat, void *ptr) { struct neper_histo *src = stat->histo(stat); struct neper_histo *des = ptr; - des->add(des, src); + neper_histo_add(des, src); return 0; } @@ -517,13 +517,12 @@ int rr_report_stats(struct thread *tinfo) int num_events = thread_stats_events(tinfo); PRINT(cb, "num_transactions", "%d", num_events); - struct neper_histo *sum = - tinfo[0].histo_factory->create(tinfo[0].histo_factory); + struct neper_histo *sum = neper_histo_new(tinfo[0].histo_factory); for (i = 0; i < opts->num_threads; i++) tinfo[i].stats->sumforeach(tinfo[i].stats, fn_add, sum); - sum->epoch(sum); - sum->print(sum); - sum->fini(sum); + neper_histo_epoch(sum); + neper_histo_print(sum); + neper_histo_delete(sum); if (path) { csv = print_header(path, "transactions,transactions/s", diff --git a/stats.c b/stats.c index 173e2dd..26e18d9 100644 --- a/stats.c +++ b/stats.c @@ -201,7 +201,7 @@ static void stat_delete(struct neper_stat *stat) if (impl) { if (impl->histo) - impl->histo->fini(impl->histo); + neper_histo_delete(impl->histo); free(impl->snaps); /* TODO: Add a destructor */ free(impl); } diff --git a/thread.c b/thread.c index 163d514..fe8328e 100644 --- a/thread.c +++ b/thread.c @@ -50,7 +50,7 @@ static int fn_count_events(struct neper_stat *stat, void *ptr) { const struct neper_histo *histo = stat->histo(stat); - return histo->events(histo); + return neper_histo_samples(histo); } static int @@ -403,7 +403,7 @@ 
void start_worker_threads(struct options *opts, struct callbacks *cb, t[i].stats = neper_stats_init(cb); t[i].rusage = neper_rusage(opts->interval); t[i].data_pending = data_pending; - t[i].histo_factory = neper_histo_factory(&t[i], + t[i].histo_factory = neper_histo_factory_new(&t[i], NEPER_HISTO_SIZE, NEPER_HISTO_GROWTH); t[i].loop_inited = loop_inited; From f48bb72f9848076cb1d9d4297adb8a7352a3774c Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 4 Jan 2024 09:27:54 +0000 Subject: [PATCH 62/72] histo: replace threshold table with faster bit-based logarithms histograms store samples in buckets with pseudo logarithmic size. The previous implementation used a table of thresholds, and binary search to locate the correct bucket. This patch replaces the thresholds with the fast pseudo-logarithm algorithm used in lr-cstats and bpftrace so we can locate the bucket in a handful of instructions. This gives memory savings, reduced cache trashing, and better performance. Tests show that with a hot cache a lookup now takes less than 2us compared to 20-25us with the previous approach. Also, we can remove the now-useless neper_histo_factory. The actual resolution of the buckets is approximately the same as in the previous implementation (about 1.5%). On passing, correct a few bugs in the previous implementation: - resolution was supposed to be 0.25% but due to an implementation bug it was around 1% or even bigger at low values, and cause the thresholds to become negative - conversion from double to int for the sample could have unchecked overflows. Tested with tcp_rr and verifying that the distribution and csv files contain correct values. 
--- histo.c | 174 ++++++++++++++++++++++--------------------------------- histo.h | 22 +++---- rr.c | 4 +- thread.c | 8 --- thread.h | 2 - 5 files changed, 78 insertions(+), 132 deletions(-) diff --git a/histo.c b/histo.c index d88fd83..1e3c326 100644 --- a/histo.c +++ b/histo.c @@ -26,8 +26,9 @@ static const int TIME_RESOLUTION = 100 * 1000000; struct neper_histo { const struct thread *thread; - int num_buckets; /* # of buckets allocated */ - int *ceil; /* Max value that can be hashed into each bucket */ + uint32_t num_buckets; /* # of buckets allocated */ + uint8_t k_bits; /* resolution */ + uint64_t sample_max_value; uint64_t *all_buckets; uint64_t *cur_buckets; @@ -58,12 +59,46 @@ struct neper_histo { bool first_all; /* Is this the first call to all_percent() */ }; -struct neper_histo_factory { - const struct thread *thread; +/* Conversion of a 64-bit value to an approximately logarithmic index + * with k bits of resolution. + * lr_bucket(n, k) computes the log2, followed by the k next significant bits. + * + * lr_bucket_lo(b, k) returns the lower bound of bucket b. + * Translate the index into the starting value for the corresponding interval. + * Each power of 2 is mapped into N = 2**k intervals, each of size + * S = 1 << ((index >> k) - 1), and starting at S * N. + * The last k bits of index indicate which interval we want. + * + * For example, if k = 2 and index = 0b11011 (27) we have: + * - N = 2**2 = 4; + * - interval size S is 1 << ((0b11011 >> 2) - 1) = 1 << (6 - 1) = 32 + * - starting value is S * N = 128 + * - the last 2 bits 11 indicate the third interval so the + * starting value is 128 + 32*3 = 224 + */ - int num_buckets; /* # of buckets allocated */ - int *ceil; /* Max value that can be hashed into each bucket */ -}; +#define fls64(x) ((x) == 0? 0 : (64 - __builtin_clzl(x))) +static int lr_bucket(uint64_t val, int k) +{ + const uint64_t mask = (1ul << k) - 1; + const int bucket = fls64(val >> k); + int slot = bucket == 0 ? 
val : ((bucket << k) | ((val >> (bucket - 1)) & mask) ); + return slot; +} + +static uint64_t lr_bucket_lo(int index, int k) +{ + const uint32_t n = (1 << k), interval = index & (n - 1); + if (index < n) + return index; + const uint32_t power_of_2 = (index >> k) - 1; + return (1ul << power_of_2) * (n + interval); +} + +static uint64_t lr_bucket_hi(int index, int k) +{ + return lr_bucket_lo(index + 1, k) - 1; +} double neper_histo_min(const struct neper_histo *histo) { @@ -106,17 +141,17 @@ static void histo_all_finalize(struct neper_histo *impl) for (i = 0; i < impl->num_buckets; i++) { sub += impl->all_buckets[i]; while (p < 100 && p * cent <= sub) - impl->all_percent[p++] = impl->ceil[i]; + impl->all_percent[p++] = lr_bucket_hi(i, impl->k_bits); if (p == 100) { if (nnn <= sub) { - int c = impl->ceil[i]; + int c = lr_bucket_hi(i, impl->k_bits); impl->all_percent[PER_INDEX_99_9] = c; p++; } } if (p == 101) { if (nnnn <= sub) { - int c = impl->ceil[i]; + int c = lr_bucket_hi(i, impl->k_bits); impl->all_percent[PER_INDEX_99_99] = c; p++; } @@ -137,17 +172,17 @@ static void histo_one_finalize(struct neper_histo *impl) int n = impl->cur_buckets[i]; sub += n; while (p < 100 && p * cent <= sub) - impl->one_percent[p++] = impl->ceil[i]; + impl->one_percent[p++] = lr_bucket_hi(i, impl->k_bits); if (p == 100) { if (nnn <= sub) { - int c = impl->ceil[i]; + int c = lr_bucket_hi(i, impl->k_bits); impl->one_percent[PER_INDEX_99_9] = c; p++; } } if (p == 101) { if (nnnn <= sub) { - int c = impl->ceil[i]; + int c = lr_bucket_hi(i, impl->k_bits); impl->one_percent[PER_INDEX_99_99] = c; p++; } @@ -216,35 +251,8 @@ void neper_histo_add(struct neper_histo *desi, const struct neper_histo *srci) desi->cur_buckets[i] += srci->all_buckets[i]; } -// binary search for the correct bucket index -static int histo_find_bucket_idx(struct neper_histo *impl, int ticks) -{ - int l_idx = 0; - int r_idx = impl->num_buckets - 1; - - if (ticks > impl->ceil[r_idx]) - return r_idx; - - while (l_idx <= 
r_idx) { - int idx = (l_idx + r_idx) / 2; - if (impl->ceil[idx] < ticks) { - l_idx = idx + 1; - } else { - if (idx == 0) - return idx; - else if (impl->ceil[idx -1] < ticks) - return idx; - else - r_idx = idx - 1; - } - } - - return -1; -} - void neper_histo_event(struct neper_histo *impl, double delta_s) { - int ticks = delta_s * TIME_RESOLUTION; int i; impl->cur_count++; @@ -254,13 +262,18 @@ void neper_histo_event(struct neper_histo *impl, double delta_s) impl->cur_min = MIN(impl->cur_min, delta_s); impl->cur_max = MAX(impl->cur_max, delta_s); - i = histo_find_bucket_idx(impl, ticks); - if (i == -1) { + delta_s *= TIME_RESOLUTION; /* convert to ticks, potential overflow */ + if (delta_s < 0 || delta_s > impl->sample_max_value) { LOG_ERROR(impl->thread->cb, - "%s(): not able to find bucket for ticks %d", - __func__, ticks); + "%s(): not able to find bucket for delta_s %g", + __func__, delta_s / TIME_RESOLUTION); + /* TODO: This will also cause an error in reporting 100% and + * high percentiles, because the sum of buckets will never + * reach the total count. + */ return; } + i = lr_bucket((uint64_t)delta_s, impl->k_bits); impl->cur_buckets[i]++; } @@ -280,7 +293,7 @@ void neper_histo_epoch(struct neper_histo *impl) impl->all_min = MIN(impl->all_min, impl->cur_min); impl->one_min = impl->cur_min; - impl->cur_min = DBL_MAX; + impl->cur_min = impl->sample_max_value;; impl->all_max = MAX(impl->all_max, impl->cur_max); impl->one_max = impl->cur_max; @@ -289,35 +302,6 @@ void neper_histo_epoch(struct neper_histo *impl) histo_one_finalize(impl); } -/* - * Returns the size of the hash table needed for the given parameters. - * If 'table' and 'ceil' are non-null then populate them as well. - * - * 'table' maps an incoming value to a bucket so we can do an O(1) lookup. - * 'ceils' tracks the maximum value stored in each bucket. - * - * The delta between each bucket increases exponentially and is stored as a - * double. 
However, it is rounded down to the nearest integer when used. So - * for example, with a growth rate of 1.02, the delta between the first and - * second buckets will be 1.02, rounded down to 1. The delta between the - * second and third buckets will be 1.02^2 ~= 1.04, which also rounds down to 1. - * Eventually the delta will climb above 2 and that will become the new value. - */ - -static void histo_hash(int num_buckets, double growth, int *ceils) -{ - double delta = 1.0; - int ceil = 1; - int hash = 0; - - while (hash < num_buckets) { - ceils[hash] = ceil; - delta *= growth; - ceil += (int)delta; - hash++; - } -} - void neper_histo_print(struct neper_histo *histo) { const struct thread *t = histo->thread; @@ -349,49 +333,29 @@ void neper_histo_delete(struct neper_histo *impl) if (impl) { free(impl->all_buckets); free(impl->cur_buckets); - free(impl->ceil); free(impl); } } -struct neper_histo *neper_histo_new( const struct neper_histo_factory *fimpl) +struct neper_histo *neper_histo_new(const struct thread *t, uint8_t k_bits) { struct neper_histo *impl = calloc(1, sizeof(*impl)); - impl->thread = fimpl->thread; - impl->num_buckets = fimpl->num_buckets; - impl->ceil = fimpl->ceil; + if (k_bits > 10) + k_bits = 10; + impl->thread = t; + impl->k_bits = k_bits; + impl->num_buckets = 65 * (1 << k_bits); + impl->sample_max_value = ~0ul; - impl->all_buckets = calloc(fimpl->num_buckets, sizeof(uint64_t)); - impl->cur_buckets = calloc(fimpl->num_buckets, sizeof(uint64_t)); + impl->all_buckets = calloc(impl->num_buckets, sizeof(uint64_t)); + impl->cur_buckets = calloc(impl->num_buckets, sizeof(uint64_t)); - impl->all_min = DBL_MAX; - impl->cur_min = DBL_MAX; + impl->all_min = impl->sample_max_value; + impl->cur_min = impl->sample_max_value; impl->first_all = true; return impl; } - -void neper_histo_factory_delete(struct neper_histo_factory *impl) -{ - if (impl) { - free(impl->ceil); - free(impl); - } -} - -struct neper_histo_factory *neper_histo_factory_new(const struct 
thread *t, - int num_buckets, double growth) -{ - struct neper_histo_factory *impl = calloc(1, sizeof(*impl)); - - impl->thread = t; - impl->num_buckets = num_buckets; - impl->ceil = calloc(impl->num_buckets, sizeof(int)); - - histo_hash(num_buckets, growth, impl->ceil); - - return impl; -} diff --git a/histo.h b/histo.h index 2b78d5e..d847bab 100644 --- a/histo.h +++ b/histo.h @@ -19,8 +19,6 @@ #include -struct thread; - /* * A simple histogram API for tracking a series of latency measurements: * @@ -31,11 +29,16 @@ struct thread; * neper_histo_epoch() perhaps every second or so. */ -struct neper_histo_factory; +/* Internally the collector allows 64-bit values in buckets with k_bits + * significant bits. 6 gives 1.5% error and about 4K buckets. + */ +#define DEFAULT_K_BITS 4 + +struct thread; struct neper_histo; /* Create a new collector */ -struct neper_histo *neper_histo_new(const struct neper_histo_factory *); +struct neper_histo *neper_histo_new(const struct thread *t, uint8_t k_bits); /* Returns the min of the previous sampling epoch. */ double neper_histo_min(const struct neper_histo *); @@ -70,15 +73,4 @@ void neper_histo_print(struct neper_histo *); /* Destroy the object */ void neper_histo_delete(struct neper_histo *); -/* - * We use a factory to create histo objects so they can all share one set of - * common lookup tables, saving a great deal of memory. 
- */ - -void neper_histo_factory_delete(struct neper_histo_factory *); - -struct neper_histo_factory *neper_histo_factory_new(const struct thread *, - int size, - double growth); - #endif diff --git a/rr.c b/rr.c index 00261e8..e163afa 100644 --- a/rr.c +++ b/rr.c @@ -144,7 +144,7 @@ static struct neper_stat *rr_latency_init(struct flow *f) if (t->opts->nostats) return NULL; - struct neper_histo *histo = neper_histo_new(t->histo_factory); + struct neper_histo *histo = neper_histo_new(t, DEFAULT_K_BITS); size = sizeof(struct rr_snap_opaque) + t->percentiles * sizeof(double); @@ -517,7 +517,7 @@ int rr_report_stats(struct thread *tinfo) int num_events = thread_stats_events(tinfo); PRINT(cb, "num_transactions", "%d", num_events); - struct neper_histo *sum = neper_histo_new(tinfo[0].histo_factory); + struct neper_histo *sum = neper_histo_new(tinfo, DEFAULT_K_BITS); for (i = 0; i < opts->num_threads; i++) tinfo[i].stats->sumforeach(tinfo[i].stats, fn_add, sum); neper_histo_epoch(sum); diff --git a/thread.c b/thread.c index fe8328e..7497e8d 100644 --- a/thread.c +++ b/thread.c @@ -39,11 +39,6 @@ #include #endif -// max value = 1.0025^8192 = 764278329 -// If TIME_RESOLUTION is 0.01 us, max latency in histogram = 7.642783298s -#define NEPER_HISTO_SIZE 8192 /* # of buckets in the histogram */ -#define NEPER_HISTO_GROWTH 1.0025 /* bucket growth rate */ - /* Callbacks for the neper_stats sumforeach() function. 
*/ static int @@ -403,9 +398,6 @@ void start_worker_threads(struct options *opts, struct callbacks *cb, t[i].stats = neper_stats_init(cb); t[i].rusage = neper_rusage(opts->interval); t[i].data_pending = data_pending; - t[i].histo_factory = neper_histo_factory_new(&t[i], - NEPER_HISTO_SIZE, - NEPER_HISTO_GROWTH); t[i].loop_inited = loop_inited; t[i].loop_init_c = loop_init_c; t[i].loop_init_m = loop_init_m; diff --git a/thread.h b/thread.h index c3f1af4..2a4a893 100644 --- a/thread.h +++ b/thread.h @@ -24,7 +24,6 @@ struct addrinfo; struct neper_fn; -struct neper_histo_factory; struct neper_pq; struct neper_stats; @@ -103,7 +102,6 @@ struct thread { struct timespec *time_start; pthread_mutex_t *time_start_mutex; struct rusage *rusage_start; - struct neper_histo_factory *histo_factory; struct neper_stats *stats; struct neper_rusage *rusage; struct io_stats io_stats; From da65b51278db6879b703800aa1665123e97cfcdb Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 4 Jan 2024 19:20:06 +0000 Subject: [PATCH 63/72] histograms: allow arbitrary percentiles Allow arbitrary percentiles to be specificed, instead of just integer and p99.9 and p99.99 This also makes the code faster because we can just compute the values requested instead of all 103 entries. Any floating point number between 0 and 100 is now accepted, with 999 and 9999 mapped to 99.9 and 99.99 for backward compatibility. Tested as usual with ./tcp_rr -c -H 127.0.0.1 -p1,2,10,50,90,999,9999,100 -A/tmp/x.csv and verifying the correct content of the csv file. 
--- define_all_flags.c | 2 +- histo.c | 162 +++++++++++++-------------------------------- histo.h | 4 +- percentiles.c | 105 ++++++++++++++--------------- percentiles.h | 12 +--- print.c | 17 +---- rr.c | 23 ++----- thread.c | 3 - thread.h | 1 - 9 files changed, 109 insertions(+), 220 deletions(-) diff --git a/define_all_flags.c b/define_all_flags.c index 14fb096..b6ef422 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -88,7 +88,7 @@ struct flags_parser *add_flags_rr(struct flags_parser *fp) /* Define flags common to all RR and CRR main programs */ DEFINE_FLAG(fp, int, request_size, 1, 'Q', "Number of bytes in a request from client to server"); DEFINE_FLAG(fp, int, response_size, 1, 'R', "Number of bytes in a response from server to client"); - DEFINE_FLAG(fp, struct percentiles, percentiles, { .chosen = { false } }, 'p', "Set reported latency percentiles (list)"); + DEFINE_FLAG(fp, struct percentiles, percentiles, {}, 'p', "Set reported latency percentiles (list)"); DEFINE_FLAG_PARSER(fp, percentiles, percentiles_parse); DEFINE_FLAG_PRINTER(fp, percentiles, percentiles_print); DEFINE_FLAG(fp, int, test_length, 10, 'l', "Test length, >0 seconds, <0 transactions"); diff --git a/histo.c b/histo.c index 1e3c326..cc697b0 100644 --- a/histo.c +++ b/histo.c @@ -22,6 +22,7 @@ // use 0.01 us time resolution static const int TIME_RESOLUTION = 100 * 1000000; +static const double TICKS_TO_SEC = 1.0 / TIME_RESOLUTION; struct neper_histo { const struct thread *thread; @@ -53,8 +54,8 @@ struct neper_histo { double one_max; double cur_max; - int all_percent[PER_INDEX_COUNT]; /* % across all completed epochs */ - int one_percent[PER_INDEX_COUNT]; /* % of the last completed epoch */ + double *all_p_values; /* % across all completed epochs */ + double *one_p_values; /* % of the last completed epoch */ bool first_all; /* Is this the first call to all_percent() */ }; @@ -127,12 +128,9 @@ double neper_histo_stddev(const struct neper_histo *histo) static void 
histo_all_finalize(struct neper_histo *impl) { + const struct percentiles *pc = &impl->thread->opts->percentiles; double cent = impl->all_count / 100.0; - double nnn = (impl->all_count * 99.9) / 100.0; - double nnnn = (impl->all_count * 99.99) / 100.0; - int sub = 0; - int p = 1; - int i; + int sub = 0, v = 0, i; if (!impl->first_all) return; @@ -140,96 +138,30 @@ static void histo_all_finalize(struct neper_histo *impl) for (i = 0; i < impl->num_buckets; i++) { sub += impl->all_buckets[i]; - while (p < 100 && p * cent <= sub) - impl->all_percent[p++] = lr_bucket_hi(i, impl->k_bits); - if (p == 100) { - if (nnn <= sub) { - int c = lr_bucket_hi(i, impl->k_bits); - impl->all_percent[PER_INDEX_99_9] = c; - p++; - } - } - if (p == 101) { - if (nnnn <= sub) { - int c = lr_bucket_hi(i, impl->k_bits); - impl->all_percent[PER_INDEX_99_99] = c; - p++; - } - } + while (v < pc->p_count && sub >= pc->p_th[v] * cent) + impl->all_p_values[v++] = lr_bucket_hi(i, impl->k_bits) * TICKS_TO_SEC; } } static void histo_one_finalize(struct neper_histo *impl) { + const struct percentiles *pc = &impl->thread->opts->percentiles; double cent = impl->one_count / 100.0; - double nnn = (impl->one_count * 99.9) / 100.0; - double nnnn = (impl->one_count * 99.99) / 100.0; - int sub = 0; - int p = 1; - int i; + int sub = 0, v = 0, i; for (i = 0; i < impl->num_buckets; i++) { int n = impl->cur_buckets[i]; sub += n; - while (p < 100 && p * cent <= sub) - impl->one_percent[p++] = lr_bucket_hi(i, impl->k_bits); - if (p == 100) { - if (nnn <= sub) { - int c = lr_bucket_hi(i, impl->k_bits); - impl->one_percent[PER_INDEX_99_9] = c; - p++; - } - } - if (p == 101) { - if (nnnn <= sub) { - int c = lr_bucket_hi(i, impl->k_bits); - impl->one_percent[PER_INDEX_99_99] = c; - p++; - } - } impl->all_buckets[i] += n; impl->cur_buckets[i] = 0; + while (v < pc->p_count && sub >= pc->p_th[v] * cent) + impl->one_p_values[v++] = lr_bucket_hi(i, impl->k_bits) * TICKS_TO_SEC; } } -static double histo_all_percent(struct 
neper_histo *impl, int percentage) +double neper_histo_percent(const struct neper_histo *impl, int index) { - histo_all_finalize(impl); - - switch (percentage) { - case 0: - return impl->all_min; - case 100: - return impl->all_max; - case 999: - return (double)impl->all_percent[PER_INDEX_99_9] / - TIME_RESOLUTION; - case 9999: - return (double)impl->all_percent[PER_INDEX_99_99] / - TIME_RESOLUTION; - default: - return (double)impl->all_percent[percentage] / - TIME_RESOLUTION; - } -} - -double neper_histo_percent(const struct neper_histo *impl, int percentage) -{ - switch (percentage) { - case 0: - return impl->one_min; - case 100: - return impl->one_max; - case 999: - return (double)impl->one_percent[PER_INDEX_99_9] / - TIME_RESOLUTION; - case 9999: - return (double)impl->one_percent[PER_INDEX_99_99] / - TIME_RESOLUTION; - default: - return (double)impl->one_percent[percentage] / - TIME_RESOLUTION; - } + return impl->one_p_values[index]; } uint64_t neper_histo_samples(const struct neper_histo *histo) @@ -293,7 +225,7 @@ void neper_histo_epoch(struct neper_histo *impl) impl->all_min = MIN(impl->all_min, impl->cur_min); impl->one_min = impl->cur_min; - impl->cur_min = impl->sample_max_value;; + impl->cur_min = impl->sample_max_value; impl->all_max = MAX(impl->all_max, impl->cur_max); impl->one_max = impl->cur_max; @@ -305,57 +237,55 @@ void neper_histo_epoch(struct neper_histo *impl) void neper_histo_print(struct neper_histo *histo) { const struct thread *t = histo->thread; - const struct options *opts = t->opts; + const struct percentiles *pc = &t->opts->percentiles; + histo_all_finalize(histo); PRINT(t->cb, "latency_min", "%.9f", histo->all_min); PRINT(t->cb, "latency_max", "%.9f", histo->all_max); PRINT(t->cb, "latency_mean", "%.9f", histo->all_sum / histo->all_count); PRINT(t->cb, "latency_stddev", "%.9f", histo_stddev(histo->all_count, histo->all_sum, histo->all_sum2)); - int i; - for (i = 0; i < 100; i++) - if (percentiles_chosen(&opts->percentiles, i)) { - 
char key[13]; - sprintf(key, "latency_p%d", i); - PRINT(t->cb, key, "%.9f", histo_all_percent(histo, i)); - } - if (percentiles_chosen(&opts->percentiles, PER_INDEX_99_9)) - PRINT(t->cb, "latency_p99.9", "%.9f", - histo_all_percent(histo, PER_INDEX_99_9)); - if (percentiles_chosen(&opts->percentiles, PER_INDEX_99_99)) - PRINT(t->cb, "latency_p99.99", "%.9f", - histo_all_percent(histo, PER_INDEX_99_99)); + for (int i = 0; i < pc->p_count; i++) { + char key[32]; + sprintf(key, "latency_p%g", pc->p_th[i]); + PRINT(t->cb, key, "%.9f", histo->all_p_values[i]); + } } void neper_histo_delete(struct neper_histo *impl) { - if (impl) { - free(impl->all_buckets); - free(impl->cur_buckets); + if (impl) free(impl); - } } struct neper_histo *neper_histo_new(const struct thread *t, uint8_t k_bits) { - - struct neper_histo *impl = calloc(1, sizeof(*impl)); + const uint16_t p_count = t->opts->percentiles.p_count; + struct neper_histo *ret, histo = {}; + size_t memsize = sizeof(histo); if (k_bits > 10) k_bits = 10; - impl->thread = t; - impl->k_bits = k_bits; - impl->num_buckets = 65 * (1 << k_bits); - impl->sample_max_value = ~0ul; - - impl->all_buckets = calloc(impl->num_buckets, sizeof(uint64_t)); - impl->cur_buckets = calloc(impl->num_buckets, sizeof(uint64_t)); - - impl->all_min = impl->sample_max_value; - impl->cur_min = impl->sample_max_value; - - impl->first_all = true; - - return impl; + histo.thread = t; + histo.k_bits = k_bits; + histo.num_buckets = 65 * (1 << k_bits); + histo.sample_max_value = ~0ul; + histo.first_all = true; + + /* Allocate memory in one chunk */ + memsize += histo.num_buckets * 2 * sizeof(histo.all_buckets[0]); + memsize += p_count * 2 * sizeof(histo.all_p_values[0]); + + ret = calloc(1, memsize); + *ret = histo; + ret->all_buckets = (void *)(ret + 1); + ret->cur_buckets = ret->all_buckets + ret->num_buckets; + ret->all_p_values = (void *)(ret->cur_buckets + ret->num_buckets); + ret->one_p_values = ret->all_p_values + p_count; + + ret->all_min = 
ret->sample_max_value; + ret->cur_min = ret->sample_max_value; + + return ret; } diff --git a/histo.h b/histo.h index d847bab..cb88a1c 100644 --- a/histo.h +++ b/histo.h @@ -52,8 +52,8 @@ double neper_histo_mean(const struct neper_histo *); /* Returns the stddev of the previous sampling epoch. */ double neper_histo_stddev(const struct neper_histo *); -/* Returns the percent of the previous sampling epoch. */ -double neper_histo_percent(const struct neper_histo *, int percentage); +/* Returns the index-th percent of the previous sampling epoch. */ +double neper_histo_percent(const struct neper_histo *, int index); /* Adds one histogram to the current epoch of another. */ void neper_histo_add(struct neper_histo *des, const struct neper_histo *src); diff --git a/percentiles.c b/percentiles.c index 89b757e..9f70746 100644 --- a/percentiles.c +++ b/percentiles.c @@ -23,77 +23,72 @@ #include "lib.h" #include "logging.h" +static int my_dsort(const void *p1, const void *p2) +{ + const double a = *(double *)p1, b = *(double *)p2; + + return a < b ? -1 : (a > b ? 
1 : 0); +} + void percentiles_parse(const char *arg, void *out, struct callbacks *cb) { struct percentiles *p = out; char *endptr; - long val; + int sz = 0; + double d; - while (true) { + while (arg) { errno = 0; - val = strtol(arg, &endptr, 10); - if ((errno == ERANGE && (val == LONG_MAX || val == LONG_MIN)) || - (errno != 0 && val == 0)) - PLOG_FATAL(cb, "strtol"); - if (endptr == arg) - break; - if ((val < 0 || val > 100) && (val != 999) && (val != 9999)) - LOG_FATAL(cb, "%ld percentile doesn't exist", val); - switch (val) { - case 999: - p->chosen[PER_INDEX_99_9] = true; - break; - case 9999: - p->chosen[PER_INDEX_99_99] = true; - break; - default: - p->chosen[val] = true; - break; + d = strtod(arg, &endptr); + /* backward compatibility */ + if (d == 999) + d = 99.9; + else if (d == 9999) + d = 99.99; + if (errno || d < 0 || d > 100 || endptr == arg) + LOG_FATAL(cb, "invalid -p argument %s", arg); + + if (p->p_count >= sz) { + sz = 2 * sz + 2; + p->p_th = realloc(p->p_th, sz * sizeof(double)); + if (!p->p_th) + LOG_FATAL(cb, "cannot allocate %d entries", sz); } - LOG_INFO(cb, "%ld percentile is chosen", val); + p->p_th[p->p_count++] = d; + LOG_INFO(cb, "%g percentile is chosen", d); if (*endptr == '\0') break; arg = endptr + 1; } + if (!p->p_count) + return; + qsort(p->p_th, p->p_count, sizeof(double), my_dsort); + /* remove duplicates */ + int i, cur = 0; + for (i = 1; i < p->p_count; i++) { + if (p->p_th[cur] == p->p_th[i]) + LOG_INFO(cb, "remove duplicate percentile %g", p->p_th[i]); + else + p->p_th[++cur] = p->p_th[i]; + } + p->p_count = cur; } void percentiles_print(const char *name, const void *var, struct callbacks *cb) { const struct percentiles *p = var; - char buf[10], s[400] = ""; - int i; - - for (i = 0; i <= 100; i++) { - if (p->chosen[i]) { - sprintf(buf, "%d,", i); - strcat(s, buf); - } - } - if (p->chosen[PER_INDEX_99_9]) - strcat(s, "99.9,"); - if (p->chosen[PER_INDEX_99_99]) - strcat(s, "99.99,"); - if (strlen(s) > 0) - s[strlen(s) - 1] = 
'\0'; /* remove trailing comma */ - PRINT(cb, name, "%s", s); -} + char buf, *s; + int i, len = 0; -bool percentiles_chosen(const struct percentiles *p, int percent) -{ - if (p) - return p->chosen[percent]; + /* first pass, compute length */ + for (i = 0; i < p->p_count; i++) + len += snprintf(&buf, 0, "%g,", p->p_th[i]); - return false; -} - -int percentiles_count(const struct percentiles *p) -{ - if (p) { - int i, sum = 0; - for (i = 0; i < PER_INDEX_COUNT; i++) - sum += p->chosen[i] ? 1 : 0; - return sum; - } - - return 0; + /* second pass, create string */ + s = calloc(1, len + 1); + len = 0; + for (i = 0; i < p->p_count; i++) + len += sprintf(s + len, "%g,", p->p_th[i]); + PRINT(cb, name, "%s", s); + free(s); } diff --git a/percentiles.h b/percentiles.h index bfb0440..8fb725a 100644 --- a/percentiles.h +++ b/percentiles.h @@ -17,22 +17,14 @@ #ifndef THIRD_PARTY_NEPER_PERCENTILES_H #define THIRD_PARTY_NEPER_PERCENTILES_H -#include - -#define PER_INDEX_99_9 101 -#define PER_INDEX_99_99 102 -#define PER_INDEX_COUNT 103 - struct callbacks; struct percentiles { - bool chosen[PER_INDEX_COUNT]; + int p_count; + double *p_th; /* indexes */ }; void percentiles_parse(const char *arg, void *out, struct callbacks *); void percentiles_print(const char *name, const void *var, struct callbacks *); -bool percentiles_chosen(const struct percentiles *, int percent); -int percentiles_count(const struct percentiles *); - #endif diff --git a/print.c b/print.c index ed858ca..fe9b213 100644 --- a/print.c +++ b/print.c @@ -43,23 +43,12 @@ FILE *print_header(const char *path, const char *things, const char *nl, return csv; } -void print_latency_header(FILE *csv, const struct percentiles *percentiles) +void print_latency_header(FILE *csv, const struct percentiles *pc) { fprintf(csv, ",latency_min,latency_mean,latency_max,latency_stddev"); - if (percentiles) { - int i; - for (i = 0; i < 100; i++) - if (percentiles_chosen(percentiles, i)) - fprintf(csv, ",latency_p%d", i); - if 
(percentiles_chosen(percentiles, PER_INDEX_99_9)) - fprintf(csv, ",latency_p99_9"); - if (percentiles_chosen(percentiles, PER_INDEX_99_99)) - fprintf(csv, ",latency_p99_99"); - if (percentiles_chosen(percentiles, 100)) - fprintf(csv, ",latency_p100"); - } - + for (int i = 0; i < pc->p_count; i++) + fprintf(csv, ",latency_p%g", pc->p_th[i]); fprintf(csv, "\n"); } diff --git a/rr.c b/rr.c index e163afa..4e08336 100644 --- a/rr.c +++ b/rr.c @@ -146,7 +146,7 @@ static struct neper_stat *rr_latency_init(struct flow *f) struct neper_histo *histo = neper_histo_new(t, DEFAULT_K_BITS); - size = sizeof(struct rr_snap_opaque) + t->percentiles * sizeof(double); + size = sizeof(struct rr_snap_opaque) + t->opts->percentiles.p_count * sizeof(double); return neper_stat_init(f, histo, size); } @@ -307,13 +307,8 @@ static void rr_snapshot(struct thread *t, struct neper_stat *stat, opaque->mean = neper_histo_mean(histo); opaque->stddev = neper_histo_stddev(histo); - if (t->percentiles) { - int i, j = 0; - for (i = 0; i < PER_INDEX_COUNT; i++) - if (percentiles_chosen(&t->opts->percentiles, i)) - opaque->percentile[j++] = - neper_histo_percent(histo, i); - } + for (int i = 0; i < t->opts->percentiles.p_count; i++) + opaque->percentile[i] = neper_histo_percent(histo, i); } static bool rr_do_compl(struct flow *f, @@ -480,16 +475,8 @@ static void rr_print_snap(struct thread *t, int flow_index, fprintf(csv, ",%f,%f,%f,%f", rso->min, rso->mean, rso->max, rso->stddev); - if (t->percentiles) { - const struct options *opts = t->opts; - int i, j = 0; - - for (i = 0; i < PER_INDEX_COUNT; i++) - if (percentiles_chosen(&opts->percentiles, i)) - fprintf(csv, ",%f", - rso->percentile[j++]); - } - + for (int i = 0; i < t->opts->percentiles.p_count; i++) + fprintf(csv, ",%f", rso->percentile[i]); fprintf(csv, "\n"); } } diff --git a/thread.c b/thread.c index 7497e8d..c9a6f1e 100644 --- a/thread.c +++ b/thread.c @@ -363,8 +363,6 @@ void start_worker_threads(struct options *opts, struct callbacks *cb, 
allowed_cores = get_cpuset(cpuset, cb); LOG_INFO(cb, "Number of allowed_cores = %d", allowed_cores); - int percentiles = percentiles_count(&opts->percentiles); - #if defined(WITH_TCPDEVMEM_CUDA) || defined(WITH_TCPDEVMEM_UDMABUF) /* perform driver reset (on host) in anticipation of TCPDEVMEM run */ if (opts->tcpd_nic_pci_addr && !opts->client) { @@ -388,7 +386,6 @@ void start_worker_threads(struct options *opts, struct callbacks *cb, t[i].flow_first = first_flow_in_thread(&t[i]); t[i].flow_limit = flows_in_thread(&t[i]); t[i].flow_count = 0; - t[i].percentiles = percentiles; t[i].local_hosts = parse_local_hosts(opts, t[i].num_local_hosts, cb); t[i].ready = ready; diff --git a/thread.h b/thread.h index 2a4a893..e6d66b5 100644 --- a/thread.h +++ b/thread.h @@ -92,7 +92,6 @@ struct thread { int flow_first; /* global index of thread's first flow */ int flow_limit; /* number of flows to create on thread */ int flow_count; /* number of flows created on thread */ - int percentiles; /* number of requested percentiles */ int stop; void *f_mbuf; /* replaces per-flow buffers */ pthread_barrier_t *ready; From 9f72daac179000a729398ca0a9120e58760f0428 Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Thu, 4 Jan 2024 22:38:21 +0000 Subject: [PATCH 64/72] histo: only scan necessary buckets when computing percentiles Computing percentiles is expensive, as it requires scanning all the 4k-8k buckets used to store samples, and is done for each flow. Benchmarks show the original code took an average of 20us per flow, with frequent peaks in the 60-80us range. This patch eliminates the cost by not storing samples in buckets if no percentiles are requested, and otherwise achieves a ~5x reduction by tracking the range of buckets that contain values in each epoch. Also change the precision to 6 bits, which halves the cost without much impact on the results. This value may become a command line flag. 
Tested, as usual, by running tcp_rr and verifying the logs and csv --- histo.c | 38 +++++++++++++++++++++++++++++++------- histo.h | 2 +- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/histo.c b/histo.c index cc697b0..144e72d 100644 --- a/histo.c +++ b/histo.c @@ -29,6 +29,7 @@ struct neper_histo { uint32_t num_buckets; /* # of buckets allocated */ uint8_t k_bits; /* resolution */ + uint16_t p_count; /* percentiles, cached for convenience */ uint64_t sample_max_value; uint64_t *all_buckets; @@ -54,6 +55,14 @@ struct neper_histo { double one_max; double cur_max; + uint32_t all_max_index; + uint32_t one_max_index; + uint32_t cur_max_index; + + uint32_t all_min_index; + uint32_t one_min_index; + uint32_t cur_min_index; + double *all_p_values; /* % across all completed epochs */ double *one_p_values; /* % of the last completed epoch */ @@ -63,7 +72,6 @@ struct neper_histo { /* Conversion of a 64-bit value to an approximately logarithmic index * with k bits of resolution. * lr_bucket(n, k) computes the log2, followed by the k next significant bits. - * * lr_bucket_lo(b, k) returns the lower bound of bucket b. * Translate the index into the starting value for the corresponding interval. 
* Each power of 2 is mapped into N = 2**k intervals, each of size @@ -136,7 +144,7 @@ static void histo_all_finalize(struct neper_histo *impl) return; impl->first_all = false; - for (i = 0; i < impl->num_buckets; i++) { + for (i = impl->all_min_index; i <= impl->all_max_index; i++) { sub += impl->all_buckets[i]; while (v < pc->p_count && sub >= pc->p_th[v] * cent) impl->all_p_values[v++] = lr_bucket_hi(i, impl->k_bits) * TICKS_TO_SEC; @@ -149,7 +157,7 @@ static void histo_one_finalize(struct neper_histo *impl) double cent = impl->one_count / 100.0; int sub = 0, v = 0, i; - for (i = 0; i < impl->num_buckets; i++) { + for (i = impl->one_min_index; i <= impl->one_max_index; i++) { int n = impl->cur_buckets[i]; sub += n; impl->all_buckets[i] += n; @@ -177,9 +185,11 @@ void neper_histo_add(struct neper_histo *desi, const struct neper_histo *srci) desi->cur_min = MIN(desi->cur_min, srci->all_min); desi->cur_max = MAX(desi->cur_max, srci->all_max); + desi->cur_min_index = MIN(desi->cur_min_index, srci->all_min_index); + desi->cur_max_index = MAX(desi->cur_max_index, srci->all_max_index); int i; - for (i = 0; i < desi->num_buckets; i++) + for (i = srci->all_min_index; i <= srci->all_max_index; i++) desi->cur_buckets[i] += srci->all_buckets[i]; } @@ -205,7 +215,11 @@ void neper_histo_event(struct neper_histo *impl, double delta_s) */ return; } + if (!impl->p_count) + return; i = lr_bucket((uint64_t)delta_s, impl->k_bits); + impl->cur_min_index = MIN(impl->cur_min_index, i); + impl->cur_max_index = MAX(impl->cur_max_index, i); impl->cur_buckets[i]++; } @@ -231,6 +245,14 @@ void neper_histo_epoch(struct neper_histo *impl) impl->one_max = impl->cur_max; impl->cur_max = 0; + impl->all_min_index = MIN(impl->all_min_index, impl->cur_min_index); + impl->one_min_index = impl->cur_min_index; + impl->cur_min_index = impl->num_buckets - 1; + + impl->all_max_index = MAX(impl->all_max_index, impl->cur_max_index); + impl->one_max_index = impl->cur_max_index; + impl->cur_max_index = 0; + 
histo_one_finalize(impl); } @@ -261,7 +283,6 @@ void neper_histo_delete(struct neper_histo *impl) struct neper_histo *neper_histo_new(const struct thread *t, uint8_t k_bits) { - const uint16_t p_count = t->opts->percentiles.p_count; struct neper_histo *ret, histo = {}; size_t memsize = sizeof(histo); @@ -272,20 +293,23 @@ struct neper_histo *neper_histo_new(const struct thread *t, uint8_t k_bits) histo.num_buckets = 65 * (1 << k_bits); histo.sample_max_value = ~0ul; histo.first_all = true; + histo.p_count = t->opts->percentiles.p_count; /* Allocate memory in one chunk */ memsize += histo.num_buckets * 2 * sizeof(histo.all_buckets[0]); - memsize += p_count * 2 * sizeof(histo.all_p_values[0]); + memsize += histo.p_count * 2 * sizeof(histo.all_p_values[0]); ret = calloc(1, memsize); *ret = histo; ret->all_buckets = (void *)(ret + 1); ret->cur_buckets = ret->all_buckets + ret->num_buckets; ret->all_p_values = (void *)(ret->cur_buckets + ret->num_buckets); - ret->one_p_values = ret->all_p_values + p_count; + ret->one_p_values = ret->all_p_values + ret->p_count; ret->all_min = ret->sample_max_value; ret->cur_min = ret->sample_max_value; + ret->all_min_index = ret->num_buckets - 1; + ret->cur_min_index = ret->num_buckets - 1; return ret; } diff --git a/histo.h b/histo.h index cb88a1c..95ba9f3 100644 --- a/histo.h +++ b/histo.h @@ -32,7 +32,7 @@ /* Internally the collector allows 64-bit values in buckets with k_bits * significant bits. 6 gives 1.5% error and about 4K buckets. */ -#define DEFAULT_K_BITS 4 +#define DEFAULT_K_BITS 6 struct thread; struct neper_histo; From 521cd24a0a521d8895d547459eb4e6e349105ccc Mon Sep 17 00:00:00 2001 From: Luigi Rizzo Date: Fri, 5 Jan 2024 15:10:42 +0000 Subject: [PATCH 65/72] snaps: de-virtualize methods. No functional change. neper_snaps methods were implemented as virtual functions, but since there is only one possible implementation this was overkill. Simplify the code by exposing the actual methods. 
The implementation still remains opaque. No functional changes. Tested with ./tcp_rr -c -H 127.0.0.1 -p1,2,10,50,90,999.9999,100 -A/tmp/x.csv -l 4 and verified that the csv file has the correct data. (histograms are only exercised in rr tests) --- snaps.c | 80 +++++++++++++++++++++----------------------------------- snaps.h | 13 +++++---- stats.c | 10 +++---- thread.c | 2 +- 4 files changed, 42 insertions(+), 63 deletions(-) diff --git a/snaps.c b/snaps.c index 6168d66..81286a3 100644 --- a/snaps.c +++ b/snaps.c @@ -19,15 +19,13 @@ #include "print.h" #include "rusage.h" -struct snaps_impl { - struct neper_snaps snaps; - +struct neper_snaps { struct neper_rusage *rusage; int total; /* # of snap structs allocated */ int count; /* # of populated snap structs */ int iter; /* iterator bookmark */ int extent; /* extended size of the snap struct */ - void *ptr; /* storage */ + char *ptr; /* storage */ }; /* @@ -50,11 +48,8 @@ void neper_snap_print(const struct neper_snap *snap, FILE *csv, print_rusage(csv, snap->rusage, nl); } -static struct neper_snap *snaps_get(const struct neper_snaps *snaps, int i) +static struct neper_snap *snaps_get(const struct neper_snaps *impl, int i) { - struct snaps_impl *impl = (void *)snaps; - char *ptr = impl->ptr; - if (i < 0) return NULL; if (i >= impl->total) { @@ -66,74 +61,57 @@ static struct neper_snap *snaps_get(const struct neper_snaps *snaps, int i) if (!reported) { fprintf(stderr, "Test longer than expected (%d), " - "use -l to extend\n", + "use -l to extend\n", i + reported++); - } - return (void *)(ptr + impl->total * impl->extent); + } + i = impl->total; /* point to spare element */ } - return (void *)(ptr + i * impl->extent); + return (void *)(impl->ptr + i * impl->extent); } /* Compare two containers by comparing the current iterator objects for each. 
*/ -double neper_snaps_cmp(const struct neper_snaps *aptr, - const struct neper_snaps *bptr) +double neper_snaps_cmp(const struct neper_snaps *a, + const struct neper_snaps *b) { - const struct snaps_impl *a = (void *)aptr; - const struct snaps_impl *b = (void *)bptr; - const struct neper_snap *sa = snaps_get(&a->snaps, a->iter); - const struct neper_snap *sb = snaps_get(&b->snaps, b->iter); + const struct neper_snap *sa = snaps_get(a, a->iter); + const struct neper_snap *sb = snaps_get(b, b->iter); return neper_snap_cmp(sa, sb); } -static struct neper_snap *snaps_add(struct neper_snaps *snaps, - const struct timespec *now, uint64_t things) +struct neper_snap *neper_snaps_add(struct neper_snaps *snaps, + const struct timespec *now, uint64_t things) { - struct snaps_impl *impl = (void *)snaps; - struct neper_snap *snap = snaps_get(snaps, impl->count++); + struct neper_snap *snap = snaps_get(snaps, snaps->count++); snap->timespec = *now; - snap->rusage = impl->rusage->get(impl->rusage, now); + snap->rusage = snaps->rusage->get(snaps->rusage, now); snap->things = things; return snap; } -static int snaps_count(const struct neper_snaps *snaps) +int neper_snaps_count(const struct neper_snaps *snaps) { - const struct snaps_impl *impl = (void *)snaps; - - return impl->count; + return snaps->count; } -static const struct neper_snap *snaps_iter_next(struct neper_snaps *snaps) +const struct neper_snap *neper_snaps_iter_next(struct neper_snaps *snaps) { - struct snaps_impl *impl = (void *)snaps; - - int i = impl->iter++; - return (i < impl->count) ? snaps_get(snaps, i) : NULL; + int i = snaps->iter++; + return (i < snaps->count) ? 
snaps_get(snaps, i) : NULL; } -static bool snaps_iter_done(const struct neper_snaps *snaps) +bool neper_snaps_iter_done(const struct neper_snaps *snaps) { - const struct snaps_impl *impl = (void *)snaps; - - return (impl->iter >= impl->count); + return snaps->iter >= snaps->count; } struct neper_snaps *neper_snaps_init(struct neper_rusage *rusage, int total, int extra) { - struct snaps_impl *impl = calloc(1, sizeof(struct snaps_impl)); - struct neper_snaps *snaps = &impl->snaps; - - impl->rusage = rusage; - impl->total = total; - impl->count = 0; - impl->iter = 0; - impl->extent = sizeof(struct neper_snap) + extra; /* * Allocate memory for all the samples at startup, based on the * known values for total_length and interval. The actual test @@ -144,12 +122,14 @@ struct neper_snaps *neper_snaps_init(struct neper_rusage *rusage, * See snaps_get(). * TODO(lrizzo) Design a proper mechanism for dynamic allocation. */ - impl->ptr = calloc(total + 1, impl->extent); - - snaps->add = snaps_add; - snaps->count = snaps_count; - snaps->iter_next = snaps_iter_next; - snaps->iter_done = snaps_iter_done; + const int extent = sizeof(struct neper_snap) + extra; + const int size = sizeof(struct neper_snaps) + extent * (total + 1); + struct neper_snaps *snaps = calloc(1, size); + + snaps->rusage = rusage; + snaps->total = total; + snaps->extent = extent; + snaps->ptr = (char *)(snaps + 1); return snaps; } diff --git a/snaps.h b/snaps.h index 63a1b81..e8d26c1 100644 --- a/snaps.h +++ b/snaps.h @@ -52,13 +52,12 @@ void neper_snap_print(const struct neper_snap *, FILE *, double raw_thruput, * iter_done() Returns true once the end of the iterator has been reached. 
*/ -struct neper_snaps { - struct neper_snap *(*add)(struct neper_snaps *, const struct timespec *, - uint64_t); - int (*count)(const struct neper_snaps *); - const struct neper_snap *(*iter_next)(struct neper_snaps *); - bool (*iter_done)(const struct neper_snaps *); -}; +struct neper_snaps; + +struct neper_snap *neper_snaps_add(struct neper_snaps *, const struct timespec *, uint64_t); +int neper_snaps_count(const struct neper_snaps *); +const struct neper_snap *neper_snaps_iter_next(struct neper_snaps *); +bool neper_snaps_iter_done(const struct neper_snaps *); double neper_snaps_cmp(const struct neper_snaps *, const struct neper_snaps *); diff --git a/stats.c b/stats.c index 26e18d9..96cc600 100644 --- a/stats.c +++ b/stats.c @@ -93,13 +93,13 @@ static void stat_event(struct thread *t, struct neper_stat *stat, int things, impl->things += things; - int i = snaps->count(snaps); + int i = neper_snaps_count(snaps); double threshold = t->opts->interval * (i + 1); /* Always record the first event, to capture the start time. 
*/ if (elapsed >= threshold || !i || force) { - struct neper_snap *snap = snaps->add(snaps, &now, impl->things); + struct neper_snap *snap = neper_snaps_add(snaps, &now, impl->things); if (fn) fn(t, stat, snap); @@ -125,7 +125,7 @@ struct neper_coef *neper_stat_print(struct thread *ts, FILE *csv, while ((stat = pq->deq(pq))) { struct stat_impl *impl = (void *)stat; struct neper_snaps *snaps = impl->snaps; - const struct neper_snap *snap = snaps->iter_next(snaps); + const struct neper_snap *snap = neper_snaps_iter_next(snaps); current_total += snap->things - impl->scratch; impl->scratch = snap->things; @@ -157,7 +157,7 @@ struct neper_coef *neper_stat_print(struct thread *ts, FILE *csv, coef->event(coef, &snap->timespec, current_total); - if (!snaps->iter_done(snaps)) + if (!neper_snaps_iter_done(snaps)) pq->enq(pq, stat); } @@ -221,7 +221,7 @@ static int fn_snaps(struct neper_stat *stat, void *unused) struct stat_impl *impl = (void *)stat; const struct neper_snaps *snaps = impl->snaps; - return snaps->count(snaps); + return neper_snaps_count(snaps); } static void stats_container_insert(struct neper_stats *stats, diff --git a/thread.c b/thread.c index c9a6f1e..bcc1c72 100644 --- a/thread.c +++ b/thread.c @@ -52,7 +52,7 @@ static int fn_count_snaps(struct neper_stat *stat, void *ptr) { const struct neper_snaps *snaps = stat->snaps(stat); - return snaps->count(snaps); + return neper_snaps_count(snaps); } static int From 06cf1753916a3b5959c2ecaef1d40a7bb7f76184 Mon Sep 17 00:00:00 2001 From: Kaitlin Sem Date: Thu, 8 Feb 2024 15:49:32 +0000 Subject: [PATCH 66/72] neper: new option --wait-start to delay starting client data flows This option on the client side will delay the client from creating any threads (and thus flows) after the control connection has been established. 
It can be useful if multiple neper server-client pairs are created over the same link, and the link gets too congested from earlier pairs for the later ones to successfully establish the control connection. The option can also be used to set up simulated packet dropping rules between making the control connection and sending traffic. Tested: ./tcp_stream -c -H 127.0.0.1 --wait-start 5 --logtostderr Verified that the client waited for 5 seconds before starting to send traffic. --- check_all_options.c | 2 ++ define_all_flags.c | 1 + lib.h | 1 + thread.c | 4 ++++ 4 files changed, 8 insertions(+) diff --git a/check_all_options.c b/check_all_options.c index 90b3564..0b6eb15 100644 --- a/check_all_options.c +++ b/check_all_options.c @@ -54,6 +54,8 @@ void check_options_common(struct options *opts, struct callbacks *cb) "Max pacing rate must be non-negative."); CHECK(cb, opts->max_pacing_rate <= UINT64_MAX, "Max pacing rate cannot exceed 64 bits."); + CHECK(cb, opts->client || (opts->wait_start == 0), + "The wait-start option is only valid for clients."); } void check_options_tcp(struct options *opts, struct callbacks *cb) diff --git a/define_all_flags.c b/define_all_flags.c index b6ef422..9941e22 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -56,6 +56,7 @@ struct flags_parser *add_flags_common(struct flags_parser *fp) DEFINE_FLAG_PARSER(fp, all_samples, parse_all_samples); DEFINE_FLAG(fp, bool, time_wait, false, 0, "Do not set SO_LINGER 0. Close gracefully. 
Active peer will enter TIME_WAIT state"); DEFINE_FLAG(fp, unsigned long, iostat_ms, 0, 0, "Print io stats snapshot every this many ms"); + DEFINE_FLAG(fp, unsigned long, wait_start, 0, 0, "Wait this many seconds before starting any data flows."); /* Return the updated fp */ return (fp); diff --git a/lib.h b/lib.h index b0d0c61..ea8c29b 100644 --- a/lib.h +++ b/lib.h @@ -102,6 +102,7 @@ struct options { const char *port; int source_port; unsigned long iostat_ms; + unsigned long wait_start; const char *all_samples; const char secret[32]; /* includes test name */ bool async_connect; diff --git a/thread.c b/thread.c index bcc1c72..9420b6a 100644 --- a/thread.c +++ b/thread.c @@ -556,6 +556,10 @@ int run_main_thread(struct options *opts, struct callbacks *cb, cp = control_plane_create(opts, cb, data_pending, fn); control_plane_start(cp, &ai); + /* if nonzero, make the client wait before the threads are started. */ + if (opts->client) + sleep(opts->wait_start); + /* start threads *after* control plane is up, to reuse addrinfo. 
*/ reset_port(ai, atoi(opts->port), cb); ts = calloc(opts->num_threads, sizeof(struct thread)); From 8b2f1bafa29b58dabc57a4e9efb1dd29db4d1a4a Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 21 Feb 2024 15:58:24 +0000 Subject: [PATCH 67/72] fixing minor lint complaints regarding imports --- define_all_flags.c | 1 + histo.c | 6 ++++++ snaps.c | 4 ++++ thread.h | 2 +- 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/define_all_flags.c b/define_all_flags.c index 9941e22..bfdce05 100644 --- a/define_all_flags.c +++ b/define_all_flags.c @@ -18,6 +18,7 @@ #include "flags.h" #include "lib.h" #include "parse.h" +#include "percentiles.h" #include "define_all_flags.h" struct flags_parser *add_flags_common(struct flags_parser *fp) diff --git a/histo.c b/histo.c index 144e72d..9378cdf 100644 --- a/histo.c +++ b/histo.c @@ -16,9 +16,15 @@ #include "histo.h" #include "common.h" +#include "logging.h" +#include "percentiles.h" #include "thread.h" #include +#include +#include +#include +#include // use 0.01 us time resolution static const int TIME_RESOLUTION = 100 * 1000000; diff --git a/snaps.c b/snaps.c index 81286a3..939c3ef 100644 --- a/snaps.c +++ b/snaps.c @@ -19,6 +19,10 @@ #include "print.h" #include "rusage.h" +#include +#include +#include + struct neper_snaps { struct neper_rusage *rusage; int total; /* # of snap structs allocated */ diff --git a/thread.h b/thread.h index e6d66b5..d0c3c28 100644 --- a/thread.h +++ b/thread.h @@ -103,7 +103,7 @@ struct thread { struct rusage *rusage_start; struct neper_stats *stats; struct neper_rusage *rusage; - struct io_stats io_stats; + struct io_stats io_stats; struct countdown_cond *data_pending; struct rate_limit rl; struct flow **flows; /* indexed by flow_id(flow) */ From fc6793765777a231c772cf3d298c34bb79dec222 Mon Sep 17 00:00:00 2001 From: Bhaskar Pardeshi Date: Sun, 17 Mar 2024 04:10:06 -0400 Subject: [PATCH 68/72] [Fix] Prevent skipping the last latency percentile list argument The current code takes a list of 
comma separated 'double' values representing the latency percentile data points that the user is interested in. The user might repeat a particular percentile value. To prevent printing the same percentile data point twice, the code (percentile.c) sorts the list of percentiles requested by the user, in the order of their percentile values, i.e. higher percentiles will be sorted at the end. After sorting and removing the duplicates, the code incorrectly sets the number of the percentiles to one less than the actual count. This results in the code not printing the last percentile in the provided list of percentile arguments. For example, option_used_in_cmd> -p 50.0,95.0,99.0, results in the following stdout_results> percentiles=50,95 This patch fixes the issue, by setting the total count of the request percentiles to the correct value. --- percentiles.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/percentiles.c b/percentiles.c index 9f70746..3c24627 100644 --- a/percentiles.c +++ b/percentiles.c @@ -71,7 +71,7 @@ void percentiles_parse(const char *arg, void *out, struct callbacks *cb) else p->p_th[++cur] = p->p_th[i]; } - p->p_count = cur; + p->p_count = cur + 1; } void percentiles_print(const char *name, const void *var, struct callbacks *cb) From 79eaf56f43a1abeb2b2b8d3695672a806e1b8286 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 21 Feb 2024 18:22:13 +0000 Subject: [PATCH 69/72] Reduce # of snapshots captured on stream client Stream clients only take 1 snapshot, but doesn't properly return timespec start_time. However, tries to take snapshots too many times, reducing throughput on udp_stream significantly. Reduce it so only 2 snapshots are needed. 
author: lixiaoyan@ --- snaps.c | 9 --------- snaps.h | 9 ++++++++- stream.c | 6 ++++++ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/snaps.c b/snaps.c index 939c3ef..f33a17a 100644 --- a/snaps.c +++ b/snaps.c @@ -23,15 +23,6 @@ #include #include -struct neper_snaps { - struct neper_rusage *rusage; - int total; /* # of snap structs allocated */ - int count; /* # of populated snap structs */ - int iter; /* iterator bookmark */ - int extent; /* extended size of the snap struct */ - char *ptr; /* storage */ -}; - /* * Compare two neper_snap structs. * The one with the earlier timestamp is considered to be smaller. diff --git a/snaps.h b/snaps.h index e8d26c1..b7616b7 100644 --- a/snaps.h +++ b/snaps.h @@ -52,7 +52,14 @@ void neper_snap_print(const struct neper_snap *, FILE *, double raw_thruput, * iter_done() Returns true once the end of the iterator has been reached. */ -struct neper_snaps; +struct neper_snaps { + struct neper_rusage *rusage; + int total; /* # of snap structs allocated */ + int count; /* # of populated snap structs */ + int iter; /* iterator bookmark */ + int extent; /* extended size of the snap struct */ + char *ptr; /* storage */ +}; struct neper_snap *neper_snaps_add(struct neper_snaps *, const struct timespec *, uint64_t); int neper_snaps_count(const struct neper_snaps *); diff --git a/stream.c b/stream.c index 6cb9a69..7803627 100644 --- a/stream.c +++ b/stream.c @@ -20,6 +20,7 @@ #include "common.h" #include "flow.h" #include "print.h" +#include "snaps.h" #include "socket.h" #include "stats.h" #include "thread.h" @@ -83,6 +84,7 @@ void stream_handler(struct flow *f, uint32_t events) struct thread *t = flow_thread(f); void *mbuf = flow_mbuf(f); int fd = flow_fd(f); + const struct neper_snaps *snaps; const struct options *opts = t->opts; /* * The actual size can be calculated with CMSG_SPACE(sizeof(struct X)), @@ -107,6 +109,10 @@ void stream_handler(struct flow *f, uint32_t events) if (events & (EPOLLHUP | EPOLLRDHUP)) return 
flow_delete(f); + snaps = stat->snaps(stat); + if (neper_snaps_count(snaps) == 0) + stat->event(t, stat, 0, false, NULL); + if (events & EPOLLIN) do { do { From cc96940faf54a8f0a4c4a3d71b5951ccfb59b886 Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Wed, 21 Feb 2024 18:36:29 +0000 Subject: [PATCH 70/72] include header, brief changelog to README --- README.md | 7 +++++++ stats.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index 293126f..1a67936 100644 --- a/README.md +++ b/README.md @@ -427,3 +427,10 @@ be insignificant. However, the keys are case sensitive. * C99, avoid compiler-specific extensions. * No external dependency. * Linux coding style, with tabs expanded to 8 spaces. + +## Changelog (not comprehensive) + +### 2024-02-21 + +* **Breaking**: changed histogram implementation +* \ No newline at end of file diff --git a/stats.h b/stats.h index 068fc3c..d8c51e4 100644 --- a/stats.h +++ b/stats.h @@ -17,6 +17,8 @@ #ifndef THIRD_PARTY_NEPER_STATS_H #define THIRD_PARTY_NEPER_STATS_H +#include "snaps.h" + #include #include From a32203bb6284d885372a6819814fd209c10ffbcc Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Thu, 18 Apr 2024 18:41:11 +0000 Subject: [PATCH 71/72] addressing pull request comments many LOG <- printf swaps removing commented out code indent fixes --- flow.c | 6 +++--- stream.c | 32 +++++++++++++++----------------- tcpdevmem.c | 4 ++-- tcpdevmem_cuda.cu | 12 +++++------- tcpdevmem_udmabuf.c | 6 ++---- 5 files changed, 27 insertions(+), 33 deletions(-) diff --git a/flow.c b/flow.c index 222bc22..6e24aa9 100644 --- a/flow.c +++ b/flow.c @@ -257,12 +257,12 @@ void flow_delete(struct flow *f) } #ifdef WITH_TCPDEVMEM_CUDA - if (flow_thread(f)->opts->tcpd_gpu_pci_addr) { + if (flow_thread(f)->opts->tcpd_gpu_pci_addr) cuda_flow_cleanup(f->f_mbuf); - } else #endif /* WITH_TCPDEVMEM_CUDA */ #ifdef WITH_TCPDEVMEM_UDMABUF - if (flow_thread(f)->opts->tcpd_nic_pci_addr) + if (flow_thread(f)->opts->tcpd_nic_pci_addr + && 
!flow_thread(f)->opts->tcpd_gpu_pci_addr) udmabuf_flow_cleanup(f->f_mbuf); #endif /* WITH_TCPDEVMEM_UDMABUF */ diff --git a/stream.c b/stream.c index 6dfdc37..cceea6b 100644 --- a/stream.c +++ b/stream.c @@ -36,20 +36,16 @@ static void *stream_alloc(struct thread *t) #ifdef WITH_TCPDEVMEM_CUDA if (!t->f_mbuf && t->opts->tcpd_gpu_pci_addr) { - if (tcpd_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) { + if (tcpd_cuda_setup_alloc(t->opts, &t->f_mbuf, t)) LOG_FATAL(t->cb, "%s: failed to setup devmem CUDA socket", __func__); - exit(1); - } } #endif /* WITH_TCPDEVMEM_CUDA */ #ifdef WITH_TCPDEVMEM_UDMABUF if (!t->f_mbuf && t->opts->tcpd_nic_pci_addr) { - if (udmabuf_setup_alloc(t->opts, &t->f_mbuf, t)) { + if (udmabuf_setup_alloc(t->opts, &t->f_mbuf, t)) LOG_FATAL(t->cb, "%s: failed to setup devmem UDMABUF socket", __func__); - exit(1); - } } #endif /* WITH_TCPDEVMEM_UDMABUF */ @@ -113,16 +109,16 @@ void stream_handler(struct flow *f, uint32_t events) #ifdef WITH_TCPDEVMEM_CUDA if (t->opts->tcpd_gpu_pci_addr) n = tcpd_recv(fd, mbuf, - opts->buffer_size, - opts->recv_flags, - t); + opts->buffer_size, + opts->recv_flags, + t); else #endif /* WITH_TCPDEVMEM_CUDA */ #ifdef WITH_TCPDEVMEM_UDMABUF if (t->opts->tcpd_nic_pci_addr) n = udmabuf_recv(fd, mbuf, - opts->buffer_size, - t); + opts->buffer_size, + t); else #endif /* WITH_TCPDEVMEM_UDMABUF */ n = recv(fd, mbuf, opts->buffer_size, @@ -144,16 +140,18 @@ void stream_handler(struct flow *f, uint32_t events) do { #ifdef WITH_TCPDEVMEM_CUDA if (t->opts->tcpd_gpu_pci_addr) { - n = tcpd_send(fd, mbuf, opts->buffer_size, opts->send_flags, t); + n = tcpd_send(fd, mbuf, + opts->buffer_size, + opts->send_flags, + t); } else #endif /* WITH_TCPDEVMEM_CUDA */ #ifdef WITH_TCPDEVMEM_UDMABUF if (t->opts->tcpd_nic_pci_addr) { - n = udmabuf_send(fd, - mbuf, - opts->buffer_size, - opts->send_flags, - t); + n = udmabuf_send(fd, mbuf, + opts->buffer_size, + opts->send_flags, + t); } else #endif /* WITH_TCPDEVMEM_UDMABUF */ n = send(fd, mbuf, 
opts->buffer_size, opts->send_flags); diff --git a/tcpdevmem.c b/tcpdevmem.c index c16d0b9..cb86795 100644 --- a/tcpdevmem.c +++ b/tcpdevmem.c @@ -59,7 +59,7 @@ int install_flow_steering(const struct options *opts, intptr_t buf, int ret; int num_queues = q_start + (t->index % q_num); - printf("Bind to queue %i\n", num_queues); + LOG_INFO(t->cb, "Bind to queue %i\n", num_queues); struct dma_buf_pages_bind_rx_queue bind_cmd; strcpy(bind_cmd.ifname, opts->tcpd_link_name); @@ -95,7 +95,7 @@ int install_flow_steering(const struct options *opts, intptr_t buf, sprintf(ethtool_cmd, "ethtool --set-rxfh-indir %s equal 8", opts->tcpd_link_name); RETURN_IF_NON_ZERO(system(ethtool_cmd)); - printf("ethtool cmds returned %i, sleeping 1...\n", ret); + LOG_INFO(t->cb, "ethtool cmds returned %i, sleeping 1...\n", ret); sleep(1); } return ret; diff --git a/tcpdevmem_cuda.cu b/tcpdevmem_cuda.cu index a0cf8f0..3aaf964 100644 --- a/tcpdevmem_cuda.cu +++ b/tcpdevmem_cuda.cu @@ -132,8 +132,8 @@ int get_gpumem_dmabuf_pages_fd(const std::string& gpu_pci_addr, if (*dma_buf_fd < 0) PLOG_FATAL(t->cb, "cuMemGetHandleForAddressRange"); - printf("Registered dmabuf region 0x%p of %lu Bytes\n", - gpu_mem, gpu_mem_sz); + LOG_INFO(t->cb, "Registered dmabuf region 0x%p of %lu Bytes\n", + gpu_mem, gpu_mem_sz); struct dma_buf_create_pages_info frags_create_info; frags_create_info.dma_buf_fd = *dma_buf_fd; @@ -169,8 +169,6 @@ int tcpd_cuda_setup_alloc(const struct options *opts, void **f_mbuf, struct thre void *gpu_gen_mem_; int gpu_mem_fd_; int dma_buf_fd_; - // int q_start = opts->queue_start; - // int q_num = opts->queue_num; struct tcpdevmem_cuda_mbuf *tmbuf; const char *gpu_pci_addr = opts->tcpd_gpu_pci_addr; const char *nic_pci_addr = opts->tcpd_nic_pci_addr; @@ -312,13 +310,13 @@ int tcpd_recv(int socket, void *f_mbuf, size_t n, int flags, struct thread *t) { ssize_t received = recvmsg(socket, msg, MSG_SOCK_DEVMEM | MSG_DONTWAIT); if (received < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) { - 
printf("%s %d: recvmsg returned < 0\n", __func__, __LINE__); + LOG_ERROR(t->cb, "%s %d: recvmsg returned < 0\n", __func__, __LINE__); return -1; } else if (received < 0) { - printf("%s %d\n", __func__, __LINE__); + LOG_ERROR(t->cb, "%s %d\n", __func__, __LINE__); return -1; } else if (received == 0) { - printf("Client exited\n"); + LOG_ERROR(t->cb, "Client exited\n"); return -1; } diff --git a/tcpdevmem_udmabuf.c b/tcpdevmem_udmabuf.c index 4c5db38..9cbb444 100644 --- a/tcpdevmem_udmabuf.c +++ b/tcpdevmem_udmabuf.c @@ -51,21 +51,20 @@ int udmabuf_setup_alloc(const struct options *opts, void **f_mbuf, struct thread if (memfd < 0) LOG_FATAL(t->cb, "[skip,no-memfd]"); - ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK); if (ret < 0) LOG_FATAL(t->cb, "[skip,fcntl-add-seals]"); ret = ftruncate(memfd, size); if (ret == -1) - LOG_FATAL(t->cb, "[FAIL,memfd-truncate]\n"); + LOG_FATAL(t->cb, "[FAIL,memfd-truncate]"); memset(&create, 0, sizeof(create)); create.memfd = memfd; create.offset = 0; create.size = size; - printf("size=%lu\n", size); + LOG_INFO(t->cb, "udmabuf size=%lu", size); buf = ioctl(devfd, UDMABUF_CREATE, &create); if (buf < 0) LOG_FATAL(t->cb, "[FAIL, create udmabuf]"); @@ -137,7 +136,6 @@ int udmabuf_send(int socket, void *f_mbuf, size_t n, int flags, struct thread *t munmap(buf_mem, n); memset(msg, 0, sizeof(struct msghdr)); - // memset(cmsg, 0, sizeof(struct cmsghdr)); iov.iov_base = buf_dummy; iov.iov_len = n - tmbuf->bytes_sent; From e92168c13ae394da80a8a10e34efccf223d0614a Mon Sep 17 00:00:00 2001 From: Jeffrey Ji Date: Tue, 25 Jun 2024 17:33:47 +0000 Subject: [PATCH 72/72] Add tcpdevmem README Includes instructions for running Neper with udmabuf tcpdevmem. 
--- README_tcpdevmem.md | 212 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 README_tcpdevmem.md diff --git a/README_tcpdevmem.md b/README_tcpdevmem.md new file mode 100644 index 0000000..1c33fe6 --- /dev/null +++ b/README_tcpdevmem.md @@ -0,0 +1,212 @@ +# Neper with TCPDevmem run instructions + +Table of Contents +- [Neper with TCPDevmem run instructions](#neper-with-tcpdevmem-run-instructions) + - [TCPDevmem UDMABUF: Compiling tcp\_stream](#tcpdevmem-udmabuf-compiling-tcp_stream) + - [Manually specifying kernel headers directory (i.e. NOT in `usr/include`)](#manually-specifying-kernel-headers-directory-ie-not-in-usrinclude) + - [Running tcp\_stream](#running-tcp_stream) + - [Added flags](#added-flags) + - [Running tcp\_stream via `multi_neper.py`](#running-tcp_stream-via-multi_neperpy) + - [Example of successful output](#example-of-successful-output) + - [Running tcp\_stream directly](#running-tcp_stream-directly) + + +## TCPDevmem UDMABUF: Compiling tcp_stream + +**UDMABUF-capable tcp_stream can be built statically on a workstation.** + +Neper can be built statically on a host with UDMABUF header files. + +``` +# clone the Neper repository and checkout the tcpd branch +git clone -b tcpd https://github.com/google/neper.git +cd neper + +# copy kernel header files to Neper working directory +# (assumed to be found in ~/kernel/usr/include) +mkdir usr +cp -r ~/kernel/usr/include/ ./usr/ + +make tcp_stream WITH_TCPDEVMEM_UDMABUF=1 + +# copy the binary to your hosts +scp tcp_stream root@${HOST1}:~/ +scp multi_neper.py root@${HOST1}:~/ +
+scp tcp_stream root@${HOST2}:~/ +scp multi_neper.py root@${HOST2}:~/ +``` + +### Manually specifying kernel headers directory (i.e. NOT in `usr/include`) + +Copying the header files is unnecessary if you override the `HEADERS_DIR` variable when running make. The default value for this variable is `usr/include`. 
+ +``` +git clone -b tcpd https://github.com/google/neper.git +cd neper + +make tcp_stream WITH_TCPDEVMEM_UDMABUF=1 HEADERS_DIR=~/kernel/usr/include +``` + + +## Running tcp_stream + + +### Added flags + +In general, these flags will be automatically populated by `multi_neper.py`. + +``` +--tcpd-validate # payload validation - must pass to both Tx/Rx if enabled +--tcpd-tcpd-rx-cpy # copies payload to another buffer (but doesn't validate) +--tcpd-nic-pci-addr +--tcpd-gpu-pci-addr +--tcpd-phys-len # CUDA mode allows for a much larger value than UDMABUF mode +--tcpd-src-ip +--tcpd-dst-ip +--tcpd-link-name +--queue-start +--queue-num +``` + +`--tcpd-validate`: Client populates the send buffer with [1,111] repeating, and Host verifies the repeating sequence. + + +### Running tcp_stream via `multi_neper.py` + +`multi_neper.py` is a Python script that runs multiple tcp_streams in parallel, which is useful when running tcp_stream across multiple pairs of NICs. + +The script also calls ethtool commands on the receiver (host) before spawning tcp_streams, to set the receiver into a TCPDevmem-capable state. + +To view all of `multi_neper.py`’s accepted flags, run `multi_neper.py --help`. 
+ + +``` +# Rx (host) +FLOWS=2 +BUF_SIZE=409600 +DEVS=eth1,eth2,eth3,eth4 +DSTS=192.168.1.26,192.168.2.26,192.168.3.26,192.168.4.26 # host IP addresses +SRCS=192.168.1.23,192.168.2.23,192.168.3.23,192.168.4.23 # client IP addresses +./multi_neper.py --hosts $DSTS \ + --devices $DEVS --buffer-size $BUF_SIZE \ + --flows $FLOWS --threads $FLOWS \ + --src-ips $SRCS --log DEBUG \ + --q-num $FLOWS --phys-len 2147483648 \ + --mode cuda + + +# Tx (client) +FLOWS=2 +BUF_SIZE=409600 +DEVS=eth1,eth2,eth3,eth4 +DSTS=192.168.1.26,192.168.2.26,192.168.3.26,192.168.4.26 +SRCS=192.168.1.23,192.168.2.23,192.168.3.23,192.168.4.23 +./multi_neper.py --hosts $DSTS \ + --devices $DEVS --buffer-size $BUF_SIZE \ + --flows $FLOWS --threads $FLOWS \ + --src-ips $SRCS --log DEBUG \ + --q-num $FLOWS --phys-len 2147483648 \ + --client \ + --mode cuda +``` + +#### Example of successful output + +``` +DEBUG:root:minflt_end=6037 +DEBUG:root:majflt_start=0 +DEBUG:root:majflt_end=0 +DEBUG:root:nvcsw_start=653 +DEBUG:root:nvcsw_end=675141 +DEBUG:root:nivcsw_start=2 +DEBUG:root:nivcsw_end=1018 +DEBUG:root:num_samples=155 +DEBUG:root:time_end=613529.729042674 +DEBUG:root:correlation_coefficient=1.00 +DEBUG:root:throughput=193669.32 +DEBUG:root:throughput_units=Mbit/s +DEBUG:root:local_throughput=193669323769 +DEBUG:root:remote_throughput=0 +DEBUG:root: +[eth1] Throughput (Mb/s): 193551.94 +[eth2] Throughput (Mb/s): 193652.69 +[eth3] Throughput (Mb/s): 193640.21 +[eth4] Throughput (Mb/s): 193669.32 +``` + + + +### Running tcp_stream directly + +**If you’re running Neper outside of the container, make sure to run** + +``` +sudo -s +``` + +**before everything. 
`ethtool` commands and queue-binding is only available to superuser.** + +Before running tcp_stream, the ethtool commands that `multi_neper.py` runs should also be run: + +``` +# run as superuser, if running Neper as root +sudo -s + +res_link() { +ethtool --set-priv-flags $1 enable-strict-header-split on +ethtool --set-priv-flags $1 enable-strict-header-split off +ethtool --set-priv-flags $1 enable-header-split off +ethtool --set-rxfh-indir $1 equal 16 +ethtool -K $1 ntuple off +ethtool --set-priv-flags $1 enable-strict-header-split off +ethtool --set-priv-flags $1 enable-header-split off +ethtool -K $1 ntuple off +ethtool --set-priv-flags $1 enable-max-rx-buffer-size on +ethtool -K $1 ntuple on +} + +# call on each link you plan to run tcp_stream across +res_link eth1 +``` + + +You can then run `multi_neper.py` with the `--dry-run` flag, to see what tcp_stream commands the script would run: + + +``` +$ FLOWS=1 +$ BUF_SIZE=409600 +$ DEVS=eth1 +$ DSTS=192.168.1.26 +$ SRCS=192.168.1.23 +$ ./multi_neper.py --hosts $DSTS \ + --devices $DEVS --buffer-size $BUF_SIZE \ + --flows $FLOWS --threads $FLOWS \ + --src-ips $SRCS --log DEBUG \ + --q-num $FLOWS --phys-len 2147483648 \ + --client \ + --mode cuda \ + --dry-run + +DEBUG:root:running on ['eth1'] +DEBUG:root:('taskset --cpu-list 2-2 ./tcp_stream -T 1 -F 1 --port 12345 --source-port 12345 --control-port 12866 --buffer-size 409600 -l 10 --num-ports 1 --tcpd-phys-len 2147483648 --tcpd-nic-pci-addr 0000:06:00.0 --tcpd-gpu-pci-addr 0000:04:00.0 -c -H 192.168.1.26', {'CUDA_VISIBLE_DEVICES': '0', ... +``` + +The script will print the tcp_stream command, as well as the environment variables. The only environment variable that matters is `CUDA_VISIBLE_DEVICES` if running in `cuda` mode, which tells tcp_stream which GPU it should allocate memory on. 
+ +You can then reset the receiver, and copy/paste the command: + +``` +# on Rx (host) +res_link eth1 +./multi_neper.py --dry-run ${other_rx_args} + +CUDA_VISIBLE_DEVICES=0 ./tcp_stream # copy cmd from previous line + + +# on Tx (client) +./multi_neper.py --dry-run ${other_tx_args} + +CUDA_VISIBLE_DEVICES=0 ./tcp_stream # copy cmd from previous line +```