From 6cec0203f5d9f4d177db4bd91e1f484b643232c0 Mon Sep 17 00:00:00 2001 From: noahcui Date: Fri, 21 Feb 2025 18:42:55 +0000 Subject: [PATCH 1/6] gitignore and some bug fix --- .gitignore | 3 +++ repeatfs/provenance/management.py | 14 ++++++++++---- repeatfs/provenance/process_record.py | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..73301fc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +# Ignore build directory +build/ +*.egg-info/ \ No newline at end of file diff --git a/repeatfs/provenance/management.py b/repeatfs/provenance/management.py index bdb41a5..82762bc 100755 --- a/repeatfs/provenance/management.py +++ b/repeatfs/provenance/management.py @@ -207,9 +207,12 @@ def register_read(self, descriptor, op_type=OP_IO, pid=None, update_process=True return pid = self.core.get_pid(pid) - + pids = IORecord.get(descriptor, self) + # check for None + if pids is None: + self.register_open(descriptor, pid=pid, record_file=False) # Ensure pid recorded to this descriptor (for descriptors passed to child processes) - if pid not in IORecord.get(descriptor, self): + elif pid not in pids: self.register_open(descriptor, pid=pid, record_file=False) IORecord.get(descriptor, self, pid=pid).update(IORecord.IO_READ, op_type, io_time=io_time) @@ -223,9 +226,12 @@ def register_write(self, descriptor, op_type=OP_IO, pid=None, update_process=Tru return pid = self.core.get_pid(pid) - + pids = IORecord.get(descriptor, self) + # check for None + if pids is None: + self.register_open(descriptor, pid=pid, record_file=False) # Ensure pid recorded to this descriptor (for descriptors passed to child processes) - if pid not in IORecord.get(descriptor, self): + elif pid not in pids: self.register_open(descriptor, pid=pid, record_file=False) IORecord.get(descriptor, self, pid=pid).update(IORecord.IO_WRITE, op_type, io_time=io_time) diff --git a/repeatfs/provenance/process_record.py 
b/repeatfs/provenance/process_record.py index c6d3a75..08272bf 100755 --- a/repeatfs/provenance/process_record.py +++ b/repeatfs/provenance/process_record.py @@ -34,7 +34,7 @@ def update(cls, pid, management, ignore_pipes=None): if pid in cls._lookup: # Update entry process_record = cls._lookup[pid] - process_record._update(ignore_pipes) + process_record._update(ignore_pipes=ignore_pipes) else: # Create and register new entry process_record = ProcessRecord(pid, management, ignore_pipes) From 7fcc24ab4d9f99bde4f5535cd82208baf1eda5e1 Mon Sep 17 00:00:00 2001 From: noahcui Date: Mon, 3 Mar 2025 18:28:47 +0000 Subject: [PATCH 2/6] bug fix --- repeatfs/provenance/io_record.py | 2 +- repeatfs/provenance/management.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/repeatfs/provenance/io_record.py b/repeatfs/provenance/io_record.py index 25499f3..46fbce2 100755 --- a/repeatfs/provenance/io_record.py +++ b/repeatfs/provenance/io_record.py @@ -28,7 +28,7 @@ def get(cls, descriptor, management, pid=None): """ Get IO for a descriptor and pid """ with management.lock: if descriptor not in cls._lookup: - return + return {} if pid is None else None if not pid: return cls._lookup[descriptor] diff --git a/repeatfs/provenance/management.py b/repeatfs/provenance/management.py index 82762bc..bdb41a5 100755 --- a/repeatfs/provenance/management.py +++ b/repeatfs/provenance/management.py @@ -207,12 +207,9 @@ def register_read(self, descriptor, op_type=OP_IO, pid=None, update_process=True return pid = self.core.get_pid(pid) - pids = IORecord.get(descriptor, self) - # check for None - if pids is None: - self.register_open(descriptor, pid=pid, record_file=False) + # Ensure pid recorded to this descriptor (for descriptors passed to child processes) - elif pid not in pids: + if pid not in IORecord.get(descriptor, self): self.register_open(descriptor, pid=pid, record_file=False) IORecord.get(descriptor, self, pid=pid).update(IORecord.IO_READ, op_type, 
io_time=io_time) @@ -226,12 +223,9 @@ def register_write(self, descriptor, op_type=OP_IO, pid=None, update_process=Tru return pid = self.core.get_pid(pid) - pids = IORecord.get(descriptor, self) - # check for None - if pids is None: - self.register_open(descriptor, pid=pid, record_file=False) + # Ensure pid recorded to this descriptor (for descriptors passed to child processes) - elif pid not in pids: + if pid not in IORecord.get(descriptor, self): self.register_open(descriptor, pid=pid, record_file=False) IORecord.get(descriptor, self, pid=pid).update(IORecord.IO_WRITE, op_type, io_time=io_time) From 8ac5f0f7427ed6dd1ae1b922715ddafe497fff00 Mon Sep 17 00:00:00 2001 From: noahcui Date: Fri, 28 Mar 2025 19:21:54 +0000 Subject: [PATCH 3/6] Use real path for executable hashing to avoid deadlock --- repeatfs/provenance/process_record.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/repeatfs/provenance/process_record.py b/repeatfs/provenance/process_record.py index 08272bf..2793b25 100755 --- a/repeatfs/provenance/process_record.py +++ b/repeatfs/provenance/process_record.py @@ -184,6 +184,10 @@ def _update(self, force=False, ignore_pipes=None): # Record executable try: self.exe = os.readlink("/proc/{0}/exe".format(self.pid)) if self.pid > 1 else "" + # re-direction to the real path + if self.management.core.mount == os.path.commonpath([self.exe, self.management.core.mount]): + relative = os.path.relpath(self.exe, self.management.core.mount) + self.exe = os.path.join(self.management.core.root, relative) try: self.md5 = self.management._calculate_hash(self.exe) except (PermissionError, FileNotFoundError): From f191ae37f6c0ff9737cc6973e420a31ea085846e Mon Sep 17 00:00:00 2001 From: noahcui Date: Wed, 2 Apr 2025 15:03:14 +0000 Subject: [PATCH 4/6] Handling empty exe issue --- repeatfs/provenance/process_record.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/repeatfs/provenance/process_record.py b/repeatfs/provenance/process_record.py 
index 2793b25..be37a78 100755 --- a/repeatfs/provenance/process_record.py +++ b/repeatfs/provenance/process_record.py @@ -185,9 +185,10 @@ def _update(self, force=False, ignore_pipes=None): try: self.exe = os.readlink("/proc/{0}/exe".format(self.pid)) if self.pid > 1 else "" # re-direction to the real path - if self.management.core.mount == os.path.commonpath([self.exe, self.management.core.mount]): - relative = os.path.relpath(self.exe, self.management.core.mount) - self.exe = os.path.join(self.management.core.root, relative) + if self.exe is not None and self.exe!="": + if self.management.core.mount == os.path.commonpath([self.exe, self.management.core.mount]): + relative = os.path.relpath(self.exe, self.management.core.mount) + self.exe = os.path.join(self.management.core.root, relative) try: self.md5 = self.management._calculate_hash(self.exe) except (PermissionError, FileNotFoundError): From 1c215e3b23d85b21bb8def7373da8f5b61f0389c Mon Sep 17 00:00:00 2001 From: noahcui Date: Fri, 18 Apr 2025 18:45:40 +0000 Subject: [PATCH 5/6] rework on process_record.py to use FileEntry for path changing --- repeatfs/provenance/process_record.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/repeatfs/provenance/process_record.py b/repeatfs/provenance/process_record.py index be37a78..367e063 100755 --- a/repeatfs/provenance/process_record.py +++ b/repeatfs/provenance/process_record.py @@ -186,9 +186,9 @@ def _update(self, force=False, ignore_pipes=None): self.exe = os.readlink("/proc/{0}/exe".format(self.pid)) if self.pid > 1 else "" # re-direction to the real path if self.exe is not None and self.exe!="": - if self.management.core.mount == os.path.commonpath([self.exe, self.management.core.mount]): - relative = os.path.relpath(self.exe, self.management.core.mount) - self.exe = os.path.join(self.management.core.root, relative) + path=FileEntry.get_paths(self.exe, self.management.core.root, self.management.core.mount) + if path is not None and 
path['abs_real'] is not None: + self.exe=path['abs_real'] try: self.md5 = self.management._calculate_hash(self.exe) except (PermissionError, FileNotFoundError): From b020db1eb2538d3f355331151a923f5cd3421537 Mon Sep 17 00:00:00 2001 From: noahcui Date: Fri, 18 Apr 2025 18:49:54 +0000 Subject: [PATCH 6/6] Handling "1 session with 1 process" --- repeatfs/provenance/replication.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/repeatfs/provenance/replication.py b/repeatfs/provenance/replication.py index 3d1f367..138e7e2 100755 --- a/repeatfs/provenance/replication.py +++ b/repeatfs/provenance/replication.py @@ -368,6 +368,16 @@ def get_session_chains(self, filter_expanded): # Update remaining expand_remain.discard(parent_id) + + unexpandable = { + pid for pid in expand_remain + if not any( + tuple(str(p[k]) for k in ["phost", "parent_start", "parent_pid"]) == pid + for p in self.provenance["process"].values() + ) + } + session_children.update(unexpandable) + expand_remain -= unexpandable # Build chains for all session children for process_id in session_children: