From dd13588f1a3a63fe58b26569433f5d12d2e38392 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Thu, 20 Jul 2023 21:03:44 +0300 Subject: [PATCH 01/16] Implemented arbitrary size ZipSegments, allow ZipSegments in addition to hashbasedlogical --- pyaff4/container.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pyaff4/container.py b/pyaff4/container.py index aef7e6e..562773c 100644 --- a/pyaff4/container.py +++ b/pyaff4/container.py @@ -141,7 +141,7 @@ def open(filename): return Container.openURN(rdfvalue.URN.FromFileName(filename)) @staticmethod - def createURN(resolver, container_urn, encryption=False): + def createURN(resolver, container_urn, encryption=False, zip_based=False): """Public method to create a new writable locical AFF4 container.""" resolver.Set(lexicon.transient_graph, container_urn, lexicon.AFF4_STREAM_WRITE_MODE, rdfvalue.XSDString("truncate")) @@ -151,7 +151,11 @@ def createURN(resolver, container_urn, encryption=False): with zip.ZipFile.NewZipFile(resolver, version, container_urn) as zip_file: volume_urn = zip_file.urn with resolver.AFF4FactoryOpen(zip_file.backing_store_urn) as backing_store: - return WritableHashBasedImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard) + if not zip_based: + return WritableHashBasedImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard) + else: + return WritableLogicalImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard) + else: version = Version(1, 2, "pyaff4") with zip.ZipFile.NewZipFile(resolver, version, container_urn) as zip_file: @@ -393,14 +397,14 @@ def newLogicalStream(self, filename, length): self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.standard11.pathName), rdfvalue.XSDString(filename)) return writer - def writeLogicalStream(self, filename, readstream, length): + def writeLogicalStream(self, filename, readstream, length, allow_large_zipsegments=False): image_urn = None if self.isAFF4Collision(filename): image_urn = rdfvalue.URN("aff4://%s" % uuid.uuid4()) else: image_urn = self.urn.Append(escaping.arnPathFragment_from_path(filename), quote=False) - if length > self.maxSegmentResidentSize: + if length > self.maxSegmentResidentSize and not allow_large_zipsegments: self.writeCompressedBlockStream(image_urn, filename, readstream) self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE), rdfvalue.URN(lexicon.AFF4_IMAGE_TYPE)) From 9090d979c07eb029254ce3f54ecd85788102dfab Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Fri, 25 Aug 2023 11:23:23 +0300 Subject: [PATCH 02/16] Update requirements.txt Updated to PyYaml 5.4 to reflect latest packaged pyaff4 version on pip --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 015a472..bb10d62 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ future == 0.17.1 aff4-snappy == 0.5.1 rdflib[sparql] == 4.2.2 intervaltree == 2.1.0 -pyyaml == 5.1 +pyyaml == 5.4 tzlocal == 2.1 html5lib == 1.0.1 python-dateutil == 2.8.0 From d42ad5cff90f41176e164738f6ca00a90a869332 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Sat, 26 Aug 2023 12:56:06 +0300 Subject: [PATCH 03/16] Fixed Struct for FileHeader to have uint32 - Else if file more than 2GB it would error when packing struct --- pyaff4/zip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyaff4/zip.py b/pyaff4/zip.py index c4b7acb..4141db5 100644 --- 
a/pyaff4/zip.py +++ b/pyaff4/zip.py @@ -132,8 +132,8 @@ class ZipFileHeader(struct_parser.CreateStruct( uint16_t lastmodtime; uint16_t lastmoddate; uint32_t crc32; - int32_t compress_size; - int32_t file_size; + uint32_t compress_size; + uint32_t file_size; uint16_t file_name_length; uint16_t extra_field_len = 0; """)): From 4a59a6202d1b3fd88f0c29658e7926d0000b6416 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Sun, 27 Aug 2023 23:19:36 +0300 Subject: [PATCH 04/16] Critical - Reserves Zip64 header before writing file data to avoid overwriting Allow creation of containers based on ZIP_STORED --- pyaff4/container.py | 15 +++++++++++---- pyaff4/zip.py | 19 ++++++++++++------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/pyaff4/container.py b/pyaff4/container.py index 562773c..16e97d8 100644 --- a/pyaff4/container.py +++ b/pyaff4/container.py @@ -141,7 +141,7 @@ def open(filename): return Container.openURN(rdfvalue.URN.FromFileName(filename)) @staticmethod - def createURN(resolver, container_urn, encryption=False, zip_based=False): + def createURN(resolver, container_urn, encryption=False, zip_based=False, compression_method=zip.ZIP_DEFLATE): """Public method to create a new writable locical AFF4 container.""" resolver.Set(lexicon.transient_graph, container_urn, lexicon.AFF4_STREAM_WRITE_MODE, rdfvalue.XSDString("truncate")) @@ -154,7 +154,7 @@ def createURN(resolver, container_urn, encryption=False, zip_based=False): if not zip_based: return WritableHashBasedImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard) else: - return WritableLogicalImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard) + return WritableLogicalImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard, compression_method=compression_method) else: version = Version(1, 2, "pyaff4") @@ -330,9 +330,13 @@ class WritableLogicalImageContainer(Container): maxSegmentResidentSize = 1 * 1024 * 1024 #maxSegmentResidentSize = 1 - def __init__(self, backing_store, zip_file, version, volumeURN, resolver, lex): + compression_method = None + + def __init__(self, backing_store, zip_file, version, volumeURN, resolver, lex, compression_method=zip.ZIP_DEFLATE): super(WritableLogicalImageContainer, self).__init__(backing_store, zip_file, version, volumeURN, resolver, lex) + self.compression_method = compression_method + with self.resolver.AFF4FactoryOpen(self.urn) as volume: container_description_urn = self.urn.Append("container.description") volume.version = self.version @@ -361,7 +365,10 @@ def writeCompressedBlockStream(self, image_urn, filename, readstream): def writeZipStream(self, image_urn, filename, readstream): with self.resolver.AFF4FactoryOpen(self.urn) as volume: with volume.CreateMember(image_urn) as streamed: - streamed.compression_method = zip.ZIP_DEFLATE + if self.compression_method is not None and self.compression_method == lexicon.AFF4_IMAGE_COMPRESSION_STORED: + streamed.compression_method = zip.ZIP_STORED + else: + streamed.compression_method = zip.ZIP_DEFLATE streamed.WriteStream(readstream) # create a file like object for writing a logical image as a new compressed block stream diff --git a/pyaff4/zip.py b/pyaff4/zip.py index 4141db5..f2264b4 100644 --- a/pyaff4/zip.py +++ b/pyaff4/zip.py @@ -25,6 +25,7 @@ import zlib import struct import traceback +import os from pyaff4 import aff4 from pyaff4 import aff4_file @@ -310,25 +311,29 @@ def WriteFileHeader(self, backing_store): if USE_UNICODE: 
header.flags = header.flags | (1 << 11) + # Always calculate and reserve the zip64 header size + # Alternatively, as the size of the file is not passed on first header creation + # a file larger than 4GB would triggers creation of the header only after file has been written and would get + # the first bytes overwritten creating a corrupted container. + # Only set header to 0xFFFFFFFF if really needed to look into extra header. + extra_header_64 = Zip64FileHeaderExtensibleField() if self.file_size > ZIP32_MAX_SIZE: header.file_size = 0xFFFFFFFF - extra_header_64.Set("file_size", self.file_size) + extra_header_64.Set("file_size", self.file_size) if self.compress_size > ZIP32_MAX_SIZE: header.compress_size = 0xFFFFFFFF - extra_header_64.Set("compress_size", self.compress_size) + extra_header_64.Set("compress_size", self.compress_size) - # Only write the extra header if we have to. - if not extra_header_64.empty(): - header.extra_field_len = extra_header_64.sizeof() + # Write the extra header in any case + header.extra_field_len = extra_header_64.sizeof() backing_store.SeekWrite(self.file_header_offset) backing_store.Write(header.Pack()) backing_store.write(encodedFilename) - if not extra_header_64.empty(): - backing_store.Write(extra_header_64.Pack()) + backing_store.Write(extra_header_64.Pack()) def WriteCDFileHeader(self, backing_store): encodedFilename = self.filename From 30226afb206f9237aa8fb7d703a3332fac12c8a6 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Fri, 1 Sep 2023 15:26:47 +0300 Subject: [PATCH 05/16] Fixed ZIP64 compatibility, now local header filesizes are always set to 0xFFFFFFFF to force reading from extra field, extra field size is now written correctly (was always 0 before) --- pyaff4/zip.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pyaff4/zip.py b/pyaff4/zip.py index f2264b4..54f8f13 100644 --- a/pyaff4/zip.py +++ b/pyaff4/zip.py @@ -170,7 +170,9 @@ def empty(self): def Pack(self): # Size of extra less the header. - #self.Set("data_size", self.sizeof() - 4) + # Data size needs to be set for a zip64 extra field to be compliant with zip specification. + self.Set("data_size", self.sizeof() - 4) + # Don't think the value set below is used anywhere, might be removable. self.data_size = self.sizeof() return struct.pack(self.format_string(), *[v for t, _, v in self.fields if v is not None]) @@ -311,22 +313,21 @@ def WriteFileHeader(self, backing_store): if USE_UNICODE: header.flags = header.flags | (1 << 11) + # For local header force usage of ZIP64 even when not needed as we do not know the file size, nor what it would + # compress to, before writing the header the first time + # (similar to how zip works in command line when compressing from stdin) # Always calculate and reserve the zip64 header size # Alternatively, as the size of the file is not passed on first header creation # a file larger than 4GB would triggers creation of the header only after file has been written and would get # the first bytes overwritten creating a corrupted container. - # Only set header to 0xFFFFFFFF if really needed to look into extra header. 
- extra_header_64 = Zip64FileHeaderExtensibleField() - if self.file_size > ZIP32_MAX_SIZE: - header.file_size = 0xFFFFFFFF + + header.file_size = 0xFFFFFFFF extra_header_64.Set("file_size", self.file_size) - if self.compress_size > ZIP32_MAX_SIZE: - header.compress_size = 0xFFFFFFFF + header.compress_size = 0xFFFFFFFF extra_header_64.Set("compress_size", self.compress_size) - # Write the extra header in any case header.extra_field_len = extra_header_64.sizeof() backing_store.SeekWrite(self.file_header_offset) From 8d6b459882c808c6797beaefbb030e561594ac7a Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Sat, 2 Sep 2023 11:19:48 +0300 Subject: [PATCH 06/16] Implement Ability to pass Progress object when creating zipsegments and when hasing with linearhasher2 --- pyaff4/container.py | 8 ++++---- pyaff4/linear_hasher.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pyaff4/container.py b/pyaff4/container.py index 16e97d8..e586f27 100644 --- a/pyaff4/container.py +++ b/pyaff4/container.py @@ -362,14 +362,14 @@ def writeCompressedBlockStream(self, image_urn, filename, readstream): stream.WriteStream(readstream) # write the logical stream as a zip segment using the Stream API - def writeZipStream(self, image_urn, filename, readstream): + def writeZipStream(self, image_urn, filename, readstream, progress=None): with self.resolver.AFF4FactoryOpen(self.urn) as volume: with volume.CreateMember(image_urn) as streamed: if self.compression_method is not None and self.compression_method == lexicon.AFF4_IMAGE_COMPRESSION_STORED: streamed.compression_method = zip.ZIP_STORED else: streamed.compression_method = zip.ZIP_DEFLATE - streamed.WriteStream(readstream) + streamed.WriteStream(readstream, progress=progress) # create a file like object for writing a logical image as a new compressed block stream def newCompressedBlockStream(self, image_urn, filename): @@ -404,7 +404,7 @@ def newLogicalStream(self, filename, length): self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.standard11.pathName), rdfvalue.XSDString(filename)) return writer - def writeLogicalStream(self, filename, readstream, length, allow_large_zipsegments=False): + def writeLogicalStream(self, filename, readstream, length, allow_large_zipsegments=False, progress=None): image_urn = None if self.isAFF4Collision(filename): image_urn = rdfvalue.URN("aff4://%s" % uuid.uuid4()) @@ -416,7 +416,7 @@ def writeLogicalStream(self, filename, readstream, length, allow_large_zipsegmen self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE), rdfvalue.URN(lexicon.AFF4_IMAGE_TYPE)) else: - self.writeZipStream(image_urn, filename, readstream) + self.writeZipStream(image_urn, filename, readstream, progress=progress) self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE), rdfvalue.URN(lexicon.AFF4_ZIP_SEGMENT_IMAGE_TYPE)) self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE), rdfvalue.URN(lexicon.standard11.FileImage)) diff --git a/pyaff4/linear_hasher.py b/pyaff4/linear_hasher.py index e279c95..6e41bf9 100644 --- a/pyaff4/linear_hasher.py +++ b/pyaff4/linear_hasher.py @@ -22,6 +22,7 @@ from pyaff4 import hashes from pyaff4 import lexicon from pyaff4 import zip +from pyaff4 import aff4 class LinearHasher(object): @@ -144,13 +145,13 @@ def __init__(self, resolver, listener=None): self.delegate = None self.resolver = resolver - def hash(self, image): + def hash(self, image, progress=None): storedHashes = list(self.resolver.QuerySubjectPredicate(image.container.urn, 
image.urn, lexicon.standard.hash)) with self.resolver.AFF4FactoryOpen(image.urn, version=image.container.version) as stream: datatypes = [h.datatype for h in storedHashes] stream2 = StreamHasher(stream, datatypes) - self.readall2(stream2) + self.readall2(stream2, progress=progress) for storedHash in storedHashes: dt = storedHash.datatype shortHashAlgoName = storedHash.shortName() @@ -162,10 +163,15 @@ def hash(self, image): self.listener.onInvalidHash(shortHashAlgoName, storedHashHexDigest, calculatedHashHexDigest, image.urn) - def readall2(self, stream): + def readall2(self, stream, progress=None): + total_read = 0 + if progress is None: + progress = aff4.EMPTY_PROGRESS while True: toRead = 32 * 1024 data = stream.read(toRead) + total_read += len(data) + progress.Report(total_read) if data == None or len(data) == 0: # EOF return From 23fe59718b9882faa372d415ba5d8aea2aed145d Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Tue, 6 Feb 2024 12:50:16 +0100 Subject: [PATCH 07/16] Update requirements.txt Updated PyYaml to avoid build issues on cython 3.0 https://github.com/yaml/pyyaml/issues/601 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bb10d62..48c42ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ future == 0.17.1 aff4-snappy == 0.5.1 rdflib[sparql] == 4.2.2 intervaltree == 2.1.0 -pyyaml == 5.4 +pyyaml == 5.10 tzlocal == 2.1 html5lib == 1.0.1 python-dateutil == 2.8.0 From 196b35faadde08847f4d3b1a401375127c45a80c Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Tue, 6 Feb 2024 12:56:04 +0100 Subject: [PATCH 08/16] Update requirements.txt Removed version pinning for pyyaml --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 48c42ca..a3d9fdb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ future == 0.17.1 aff4-snappy == 0.5.1 rdflib[sparql] == 4.2.2 intervaltree == 2.1.0 -pyyaml == 5.10 +pyyaml tzlocal == 2.1 html5lib == 1.0.1 python-dateutil == 2.8.0 From 7a410375547d12f2b0349d2ee4ba3a5979e2f5a3 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Tue, 6 Feb 2024 13:05:42 +0100 Subject: [PATCH 09/16] Update requirements.txt Addedd pybindgen as required for successful build of fastchunking --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a3d9fdb..d1c0b51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ pyyaml tzlocal == 2.1 html5lib == 1.0.1 python-dateutil == 2.8.0 +pybindgen fastchunking == 0.0.3 hexdump pynacl From 8bb7f70beea488d292e5980612fedba50b0f360e Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Fri, 9 Feb 2024 15:00:39 +0100 Subject: [PATCH 10/16] Update requirements.txt require latest intervaltree to avoid issues with mutableset in python >=3.10 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d1c0b51..0ba917c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ future == 0.17.1 aff4-snappy == 0.5.1 rdflib[sparql] == 4.2.2 -intervaltree == 2.1.0 +intervaltree pyyaml tzlocal == 2.1 html5lib == 1.0.1 From e159e998474ad38253e73d26738c28adfefc7c7c Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Sat, 27 Apr 2024 00:17:02 +0300 Subject: [PATCH 11/16] Implementing Support for Axiom's non-standard paths --- pyaff4/aff4_image.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 
2 deletions(-) diff --git a/pyaff4/aff4_image.py b/pyaff4/aff4_image.py index b5078c8..83e3d91 100644 --- a/pyaff4/aff4_image.py +++ b/pyaff4/aff4_image.py @@ -15,6 +15,7 @@ """This module implements the standard AFF4 Image.""" from __future__ import division from __future__ import unicode_literals + from builtins import range from builtins import str from past.utils import old_div @@ -23,6 +24,7 @@ import logging import lz4.block import struct +import urllib from expiringdict import ExpiringDict @@ -491,8 +493,19 @@ def _parse_bevy_index(self, bevy): return result def reloadBevy(self, bevy_id): - bevy_urn = self.urn.Append("%08d" % bevy_id) - bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn) + if "AXIOMProcess" in self.version.tool: + # Axiom does strange stuff with paths and URNs, we need to fix the URN for reading bevys + volume_urn = '/'.join(self.urn.SerializeToString().split('/')[0:3]) + original_filename = self.resolver.Get(volume_urn, self.urn, rdfvalue.URN(lexicon.standard11.pathName))[0] + original_filename_escaped = urllib.parse.quote(str(original_filename).encode(), safe='/\\') + corrected_urn = f"{volume_urn}/{original_filename_escaped}\\{'%08d' % bevy_id}".encode() + print(corrected_urn) + bevy_urn = rdfvalue.URN().UnSerializeFromString(corrected_urn) + # bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn) # This is unused anyway apparently + print("AXIOOOOOM") + else: + bevy_urn = self.urn.Append("%08d" % bevy_id) + bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn) if LOGGER.isEnabledFor(logging.INFO): LOGGER.info("Reload Bevy %s", bevy_urn) chunks = [] From b05317ed4c71d91b4d4751553afd0b0d55e6d62c Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Sat, 27 Apr 2024 00:31:42 +0300 Subject: [PATCH 12/16] Removed forgotten test print --- pyaff4/aff4_image.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyaff4/aff4_image.py b/pyaff4/aff4_image.py index 83e3d91..51c381b 100644 --- a/pyaff4/aff4_image.py +++ b/pyaff4/aff4_image.py @@ -502,7 +502,6 @@ def reloadBevy(self, bevy_id): print(corrected_urn) bevy_urn = rdfvalue.URN().UnSerializeFromString(corrected_urn) # bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn) # This is unused anyway apparently - print("AXIOOOOOM") else: bevy_urn = self.urn.Append("%08d" % bevy_id) bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn) From cb36b4fba841eeab85d9f6475cd2872a80a12b51 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Sat, 27 Apr 2024 01:38:15 +0300 Subject: [PATCH 13/16] Fixed volume URN when container does not have URN in Central Directory Comment --- pyaff4/data_store.py | 5 +++++ pyaff4/zip.py | 16 +++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pyaff4/data_store.py b/pyaff4/data_store.py index b1aac0f..6964872 100644 --- a/pyaff4/data_store.py +++ b/pyaff4/data_store.py @@ -535,6 +535,11 @@ def _DumpToTurtle(self, volumeurn, verbose=False): return result + def loadZipURN(self, zip): + with zip.OpenZipSegment("container.description") as fd: + urn = streams.ReadAll(fd).strip(b'\n') + return urn + def loadMetadata(self, zip): # Load the turtle metadata. 
#if zip.urn not in self.loadedVolumes: diff --git a/pyaff4/zip.py b/pyaff4/zip.py index 54f8f13..b79d466 100644 --- a/pyaff4/zip.py +++ b/pyaff4/zip.py @@ -575,7 +575,8 @@ def __init__(self, *args, **kwargs): except: self.version = Version(0,0, "pyaff4") - def parse_cd(self, backing_store_urn): + def parse_cd(self, backing_store_urn, urn: str = None): + # We can pass the urn as parameter, this allows correct opening of images not having the urn in CD comment with self.resolver.AFF4FactoryOpen(backing_store_urn) as backing_store: # Find the End of Central Directory Record - We read about 4k of # data and scan for the header from the end, just in case there is @@ -618,6 +619,8 @@ def parse_cd(self, backing_store_urn): # URN and then create a new ZipFile volume. After parsing the # central directory we discover our URN and therefore we can delete # the old, randomly selected URN. + if not urn_string and urn: + urn_string = urn if urn_string and self.urn != urn_string and self.version != basic_zip : self.resolver.DeleteSubject(self.urn) self.urn.Set(utils.SmartUnicode(urn_string)) @@ -866,8 +869,6 @@ def OpenMember(self, segment_urn): return self.resolver.CachePut(result) - - def LoadFromURN(self): self.backing_store_urn = self.resolver.GetUnique(lexicon.transient_graph, self.urn, lexicon.AFF4_STORED) @@ -880,7 +881,16 @@ def LoadFromURN(self): raise IOError("Unable to load backing urn.") try: + # Possibly inefficient method, but easiest to implement + # Create a copy of transient store, parse zip and read container.description to discover urn + # Reread the ZIP with urn as parameter to ensure the transient store has objects with correct URNs. + # Necessary for containers missing the URN in CD comment. + ## Backup Transient Store + transient_store = copy.deepcopy(self.resolver.transient_store) self.parse_cd(self.backing_store_urn) + # Restore Transient Store + self.resolver.transient_store = transient_store + self.parse_cd(self.backing_store_urn, urn=self.resolver.loadZipURN(self)) self.resolver.loadMetadata(self) except IOError: # If we can not parse a CD from the zip file, this is fine, we just From 2274e2338ae2f558c0013b4b03d93be44fc38746 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Sun, 9 Nov 2025 14:25:41 +0100 Subject: [PATCH 14/16] Require aff4-snappy sources Use source version of aff4-snappy to be able to build on ARM64 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0ba917c..d4f2afd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ future == 0.17.1 -aff4-snappy == 0.5.1 +aff4-snappy @ git+https://github.com/aff4/aff4-snappy@88aba3a3fe4b3f9c20bcfeb5b4c1935c801760bb # Use source version of aff4-snappy to be able to build on ARM64 (https://github.com/aff4/aff4-snappy/pull/2) rdflib[sparql] == 4.2.2 intervaltree pyyaml From 21e7eb8720907d5af2cf687dec768326eee2f210 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Sun, 9 Nov 2025 19:59:33 +0100 Subject: [PATCH 15/16] Update future to support python 3.13 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d4f2afd..92cada8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -future == 0.17.1 +future == 1.0.0 aff4-snappy @ git+https://github.com/aff4/aff4-snappy@88aba3a3fe4b3f9c20bcfeb5b4c1935c801760bb # Use source version of aff4-snappy to be able to build on ARM64 (https://github.com/aff4/aff4-snappy/pull/2) rdflib[sparql] == 4.2.2 
intervaltree From faa1361b48616bad8c63c0c13cba1e4e080dee77 Mon Sep 17 00:00:00 2001 From: Francesco Servida Date: Mon, 24 Nov 2025 08:37:05 +0100 Subject: [PATCH 16/16] Draft Implementation of Container Hash --- aff4.py | 15 +++++++++++++ pyaff4/data_store.py | 52 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/aff4.py b/aff4.py index 7725ca1..aed8f25 100644 --- a/aff4.py +++ b/aff4.py @@ -181,6 +181,14 @@ def verify(file, password): printVolumeInfo(file, childVolume) printCaseInfo(childVolume) resolver = childVolume.resolver + + metadata_verified, metadata_hashes = resolver.verify_container_metadata_integrity(volume.zip_file) + print("\tContainer Metadata:") + if not metadata_verified: + print("\t\tContainer Metadata Verification Failed") + for hash in metadata_hashes: + print(f"\t\t-{hash['hash_type'].upper()} - {'VERIFIED' if hash['verified'] else 'FAILED'} | Stored: {hash['stored_hash']} - Calculated {hash['calculated_hash']}") + hasher = linear_hasher.LinearHasher2(resolver, LinearVerificationListener()) for image in childVolume.images(): print("\t%s <%s>" % (image.name(), trimVolume(childVolume.urn, image.urn))) @@ -190,6 +198,13 @@ def verify(file, password): printCaseInfo(volume) resolver = volume.resolver + metadata_verified, metadata_hashes = resolver.verify_container_metadata_integrity(volume.zip_file) + print("\tContainer Metadata:") + if not metadata_verified: + print("\t\tContainer Metadata Verification Failed") + for hash in metadata_hashes: + print(f"\t\t- {hash['hash_type'].upper()} - {'VERIFIED' if hash['verified'] else 'FAILED'} | Stored: {hash['stored_hash']} - Calculated {hash['calculated_hash']}") + if type(volume) == container.PhysicalImageContainer: image = volume.image listener = VerificationListener() diff --git a/pyaff4/data_store.py b/pyaff4/data_store.py index 6964872..baef8e9 100644 --- a/pyaff4/data_store.py +++ b/pyaff4/data_store.py @@ -31,6 +31,8 @@ import sys import types import binascii +import hashlib +import json from rdflib import URIRef from itertools import chain @@ -490,6 +492,7 @@ def DumpToTurtle(self, zipcontainer, ): break turtle_segment.Flush() turtle_segment.Close() + self.write_metadata_hashes(zipcontainer) def _DumpToTurtle(self, volumeurn, verbose=False): g = rdflib.Graph() @@ -547,6 +550,55 @@ def loadMetadata(self, zip): self.LoadFromTurtle(fd, zip.urn) self.loadedVolumes.append(zip.urn) + def write_metadata_hashes(self, zipcontainer): + with zipcontainer.OpenZipSegment("information.turtle") as fd: + data = fd.read() + hashes = { + "md5": hashlib.md5(data).hexdigest(), + "sha1": hashlib.sha1(data).hexdigest(), + "sha256": hashlib.sha256(data).hexdigest() + } + with zipcontainer.CreateZipSegment(u"container.hashes") as container_hashes_segment: + container_hashes_segment.compression_method = ZIP_DEFLATE + container_hashes_segment.write(utils.SmartStr(json.dumps(hashes))) + container_hashes_segment.Flush() + container_hashes_segment.Close() + + def read_metadata_hashes(self, zipcontainer): + # containerHashesURN = escaping.urn_from_member_name(u"container.hashes", zipcontainer.urn, zipcontainer.version) + if not zipcontainer.ContainsMember("container.hashes"): + with zipcontainer.OpenZipSegment("container.hashes") as fd: + data = fd.read() + hashes = json.loads(data) + return hashes + else: + # No container.hashes found, return empty hashlist. 
+            return {}
+
+    def verify_container_metadata_integrity(self, zipcontainer):
+        stored_hashes = self.read_metadata_hashes(zipcontainer)
+        hashes = []
+        failed = False
+        with zipcontainer.OpenZipSegment("information.turtle") as fd:
+            data = fd.read()
+        for hash_type, stored_hash in stored_hashes.items():
+            calculated_hash = ""
+            if hash_type == "md5":
+                calculated_hash = hashlib.md5(data).hexdigest()
+            elif hash_type == "sha1":
+                calculated_hash = hashlib.sha1(data).hexdigest()
+            elif hash_type == "sha256":
+                calculated_hash = hashlib.sha256(data).hexdigest()
+            verified = stored_hash == calculated_hash
+            hashes.append({
+                'hash_type': hash_type,
+                'stored_hash': stored_hash,
+                'calculated_hash': calculated_hash,
+                'verified': verified
+            })
+            failed = failed or not verified
+        return not failed, hashes
+
     def LoadFromTurtle(self, stream, volume_arn):
         data = streams.ReadAll(stream)
         g = rdflib.Graph()
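
The series extends the writable-container API in patches 01, 04 and 06: createURN() gains zip_based and compression_method, and writeLogicalStream() gains allow_large_zipsegments and progress. A minimal usage sketch under those signatures follows; the input and output paths and the PrintProgress helper are illustrative assumptions (PrintProgress presumes pyaff4's aff4.ProgressContext base class), not part of the patches.

import os
from pyaff4 import aff4, container, data_store, lexicon, rdfvalue

class PrintProgress(aff4.ProgressContext):
    # Report() receives the number of bytes processed so far.
    def Report(self, readptr):
        print("processed %d bytes" % readptr)

src = "/tmp/evidence.bin"    # assumed input file
dst = "/tmp/evidence.aff4"   # assumed output container

with data_store.MemoryDataStore() as resolver:
    dest_urn = rdfvalue.URN.FromFileName(dst)
    with container.Container.createURN(
            resolver, dest_urn,
            zip_based=True,                                            # patch 01: plain zip-segment container
            compression_method=lexicon.AFF4_IMAGE_COMPRESSION_STORED   # patch 04: store members uncompressed
    ) as volume:
        with open(src, "rb") as fd:
            volume.writeLogicalStream(
                os.path.basename(src), fd, os.path.getsize(src),
                allow_large_zipsegments=True,   # patch 01: keep large files as zip segments
                progress=PrintProgress())       # patch 06: progress reporting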
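
Patches 03 to 05 make the local file header always carry the 0xFFFFFFFF sentinel in its 32-bit size fields and reserve a Zip64 extended-information extra field up front, so the header keeps the same size once the real values are patched in after the data has been written. As an independent illustration of that extra field's layout (generic ZIP format per the specification, not pyaff4 code):

import struct

ZIP64_HEADER_ID = 0x0001      # Zip64 extended information extra field
ZIP32_MAX_SIZE = 0xFFFFFFFF   # sentinel left in the 32-bit local header fields

def zip64_local_extra(file_size, compress_size):
    # Local-header variant: original size then compressed size, both uint64,
    # preceded by a 2-byte header ID and a 2-byte data length.
    data = struct.pack("<QQ", file_size, compress_size)
    return struct.pack("<HH", ZIP64_HEADER_ID, len(data)) + data

extra = zip64_local_extra(5 * 1024 ** 3, 5 * 1024 ** 3)
assert len(extra) == 20   # fixed size, so the space can be reserved before writing the data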
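
Patch 13 falls back to the container.description segment to recover the volume URN when the central-directory comment does not contain it. Since that segment simply stores the URN as text, the fallback value can be inspected with the standard zipfile module; a rough sketch assuming an unencrypted container with its segments at the archive root:

import zipfile

def read_volume_urn(path):
    # container.description holds the volume URN as plain text.
    with zipfile.ZipFile(path) as zf:
        return zf.read("container.description").strip(b"\n").decode()

print(read_volume_urn("/tmp/evidence.aff4"))   # assumed path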
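
Patch 16 records MD5, SHA-1 and SHA-256 digests of information.turtle in a new container.hashes segment and rechecks them during verify. Because both segments are ordinary zip members, the same check can be reproduced outside pyaff4; a rough sketch with the standard library, assuming an unencrypted container and the JSON layout produced by write_metadata_hashes:

import hashlib
import json
import zipfile

def check_container_metadata(path):
    # Recompute the digests of information.turtle and compare them with the
    # values recorded in container.hashes.
    with zipfile.ZipFile(path) as zf:
        turtle = zf.read("information.turtle")
        stored = json.loads(zf.read("container.hashes"))
    results = {}
    for algo, expected in stored.items():
        calculated = hashlib.new(algo, turtle).hexdigest()
        results[algo] = (calculated == expected, expected, calculated)
    return all(ok for ok, _, _ in results.values()), results

ok, details = check_container_metadata("/tmp/evidence.aff4")   # assumed path
print("container metadata verified" if ok else "container metadata MISMATCH")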