Merged
5 changes: 5 additions & 0 deletions .gitignore
@@ -49,6 +49,11 @@ yarn-error.log*

dist/

# Profiling and logs
*.prof
*.log
*.csv

/test.db*
dump_extracted.sql
dump_postgres_reordered_data_only.sql
10 changes: 5 additions & 5 deletions cmd/dataprep_test.go
@@ -305,7 +305,7 @@ func TestDataPreparationAddPieceHandler(t *testing.T) {
FileSize: 100,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test1.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}, nil)
_, _, err := runner.Run(ctx, "singularity prep add-piece --piece-cid xxx --piece-size 100 --file-size 100 1")
require.NoError(t, err)
@@ -342,7 +342,7 @@ func TestDataPreparationListPiecesHandler(t *testing.T) {
FileSize: 200,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test1.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}, {
ID: 2,
CreatedAt: time.Time{},
@@ -351,7 +351,7 @@ func TestDataPreparationListPiecesHandler(t *testing.T) {
FileSize: 400,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test2.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}},
},
{
@@ -363,7 +363,7 @@ func TestDataPreparationListPiecesHandler(t *testing.T) {
FileSize: 600,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test3.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}, {
ID: 4,
CreatedAt: time.Time{},
@@ -372,7 +372,7 @@ func TestDataPreparationListPiecesHandler(t *testing.T) {
FileSize: 800,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test4.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}},
},
}, nil)
228 changes: 228 additions & 0 deletions cmd/download_padded_test.go
@@ -0,0 +1,228 @@
package cmd

import (
"context"
"fmt"
"os"
"path/filepath"
"testing"

"github.com/data-preservation-programs/singularity/model"
"github.com/data-preservation-programs/singularity/util/testutil"
"github.com/stretchr/testify/require"
"gorm.io/gorm"
)

// downloadTestBindBase is used to generate unique ports for each test
const downloadTestBindBase = 7779

// TestDownloadPaddedPieces tests downloading padded pieces in three scenarios:
// 1. non-inline + enable-http-piece: content-provider serves CAR file directly from disk
// 2. inline + enable-http-piece: content-provider assembles piece on-the-fly
// 3. inline + metadata-only: downloader assembles piece locally from metadata
func TestDownloadPaddedPieces(t *testing.T) {
tests := []struct {
name string
inline bool
enableHTTPPiece bool
minPieceSize int64
expectedPaddingInfo string
}{
{
name: "non-inline with padding",
inline: false,
enableHTTPPiece: true,
minPieceSize: 1 << 20, // 1 MiB - will force padding
expectedPaddingInfo: "literal zeros in CAR file",
},
{
name: "inline with piece assembly",
inline: true,
enableHTTPPiece: true,
minPieceSize: 1 << 20, // 1 MiB - will force padding
expectedPaddingInfo: "PieceReader serves zeros virtually",
},
{
name: "inline metadata-only",
inline: true,
enableHTTPPiece: false,
minPieceSize: 1 << 20, // 1 MiB - will force padding
expectedPaddingInfo: "downloader assembles with PieceReader",
},
}

for i, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
testutil.All(t, func(ctx context.Context, t *testing.T, db *gorm.DB) {
source := t.TempDir()
// Create a test file that's smaller than minPieceSize to force padding
err := os.WriteFile(filepath.Join(source, "test.txt"), testutil.GenerateFixedBytes(100<<10), 0644) // 100 KiB file
require.NoError(t, err)

output := t.TempDir()
downloadDir := t.TempDir()

// Use unique port for each test
downloadTestBind := fmt.Sprintf("127.0.0.1:%d", downloadTestBindBase+i)

runner := Runner{mode: Normal}
defer runner.Save(t, source, output, downloadDir)

// Create source storage
_, _, err = runner.Run(ctx, fmt.Sprintf("singularity storage create local --name source --path %s", testutil.EscapePath(source)))
require.NoError(t, err)

var outputStorageFlag string
if !tc.inline {
// For non-inline mode, use --local-output
outputStorageFlag = fmt.Sprintf(" --local-output %s", testutil.EscapePath(output))
}

// Create preparation with minPieceSize to force padding
prepName := "test"
inlineFlag := ""
if !tc.inline {
// Only add flag for non-inline mode (inline is default)
inlineFlag = " --no-inline"
}
_, _, err = runner.Run(ctx, fmt.Sprintf("singularity prep create --name %s --max-size 2MB --min-piece-size %d%s --source source%s",
prepName, tc.minPieceSize, inlineFlag, outputStorageFlag))
require.NoError(t, err)

// Start scanning
_, _, err = runner.Run(ctx, fmt.Sprintf("singularity prep start-scan %s source", prepName))
require.NoError(t, err)

// Run dataset worker to pack
_, _, err = runner.Run(ctx, "singularity run dataset-worker --exit-on-complete=true --exit-on-error=true")
require.NoError(t, err)

// List pieces
listPiecesOut, _, err := runner.Run(ctx, fmt.Sprintf("singularity prep list-pieces %s", prepName))
require.NoError(t, err)
pieceCIDs := GetAllPieceCIDs(listPiecesOut)
require.NotEmpty(t, pieceCIDs, "should have at least one piece")

t.Logf("Created %d piece(s) with %s", len(pieceCIDs), tc.expectedPaddingInfo)

// Verify the piece is padded by checking database
var cars []model.Car
err = db.Find(&cars).Error
require.NoError(t, err)
require.NotEmpty(t, cars)

// Find the car that matches our expected criteria
// FileSize should be (127/128) × PieceSize due to Fr32 padding overhead
expectedFileSize := (tc.minPieceSize * 127) / 128
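// For the 1 MiB minimum piece size used by these cases, that works out to
// (1<<20 * 127) / 128 = 1,040,384 bytes of unpadded CAR data.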
var paddedCar *model.Car
for _, car := range cars {
if car.PieceSize == tc.minPieceSize && car.FileSize == expectedFileSize {
carCopy := car
paddedCar = &carCopy
t.Logf("Found padded CAR: PieceCID=%s, PieceSize=%d, FileSize=%d (expected %d), MinPieceSizePadding=%d",
car.PieceCID.String(), car.PieceSize, car.FileSize, expectedFileSize, car.MinPieceSizePadding)

if tc.inline {
require.Greater(t, car.MinPieceSizePadding, int64(0), "inline mode should have virtual padding")
} else {
require.Equal(t, int64(0), car.MinPieceSizePadding, "non-inline mode should have zeros in file")
}
break
}
}
require.NotNil(t, paddedCar, "should have found a padded CAR")

// For non-inline mode, verify the CAR file on disk has correct size
if !tc.inline {
carPath := filepath.Join(output, paddedCar.PieceCID.String()+".car")
fileInfo, err := os.Stat(carPath)
require.NoError(t, err, "CAR file should exist on disk")
t.Logf("CAR file on disk: path=%s, size=%d (should equal piece size %d for Curio TreeD)",
carPath, fileInfo.Size(), paddedCar.PieceSize)
require.Equal(t, paddedCar.FileSize, fileInfo.Size(), "CAR file should be padded to piece size")

// CommP was calculated before padding, so we don't verify it here
// The important verification is that the downloaded piece matches (tested below)
}

// Start content-provider with appropriate flags
contentProviderCtx, cancel := context.WithCancel(ctx)
defer cancel()
contentProviderDone := make(chan struct{})
defer func() { <-contentProviderDone }()

httpPieceFlag := ""
if !tc.enableHTTPPiece {
httpPieceFlag = " --enable-http-piece=false"
}

go func() {
NewRunner().Run(contentProviderCtx, fmt.Sprintf("singularity run content-provider --http-bind %s%s", downloadTestBind, httpPieceFlag))
close(contentProviderDone)
}()

// Wait for content-provider to be ready
err = WaitForServerReady(ctx, fmt.Sprintf("http://%s/health", downloadTestBind))
require.NoError(t, err)

// Download all pieces
for _, pieceCID := range pieceCIDs {
t.Logf("Downloading %s...", pieceCID)

var downloaded []byte
if tc.enableHTTPPiece {
// For scenarios with enable-http-piece, download directly from /piece endpoint
// (content-provider serves the piece directly)
downloaded, err = Download(ctx, fmt.Sprintf("http://%s/piece/%s", downloadTestBind, pieceCID), 10)
require.NoError(t, err)
} else {
// For metadata-only mode, use download command (downloader assembles locally)
_, _, err = runner.Run(ctx, fmt.Sprintf("singularity download --quiet --api http://%s --out-dir %s %s",
downloadTestBind, testutil.EscapePath(downloadDir), pieceCID))
require.NoError(t, err)

// Read downloaded file
downloaded, err = os.ReadFile(filepath.Join(downloadDir, pieceCID+".car"))
require.NoError(t, err)
}

require.NotEmpty(t, downloaded)

// Verify the downloaded piece is the expected size (127/128 × pieceSize)
require.Equal(t, int(expectedFileSize), len(downloaded), "downloaded piece should be padded to (127/128)×pieceSize")

// Verify CommP matches - use the actual piece size from database
calculatedPieceCID := CalculateCommp(t, downloaded, uint64(paddedCar.PieceSize))
if pieceCID != calculatedPieceCID {
t.Logf("CommP mismatch: expected=%s, calculated=%s, downloaded_size=%d, piece_size=%d",
pieceCID, calculatedPieceCID, len(downloaded), paddedCar.PieceSize)
}
require.Equal(t, pieceCID, calculatedPieceCID, "CommP should match")

// Verify file ends with zeros (padding)
foundNonZero := false
paddingStart := len(downloaded) - 1
for paddingStart > 0 && downloaded[paddingStart] == 0 {
paddingStart--
}
paddingStart++ // Move to first zero
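// paddingStart now marks the first byte of the trailing zero run, so
// len(downloaded)-paddingStart is the number of padding bytes checked below.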
require.Greater(t, len(downloaded)-paddingStart, 0, "should have trailing zero padding")

// Verify everything before padding region is not all zeros
for i := 0; i < paddingStart; i++ {
if downloaded[i] != 0 {
foundNonZero = true
break
}
}
require.True(t, foundNonZero, "should have non-zero data before padding")

t.Logf("Successfully downloaded and verified %s: %d bytes with %d bytes of padding",
pieceCID, len(downloaded), len(downloaded)-paddingStart)
}

cancel() // Stop content-provider
})
})
}
}
8 changes: 4 additions & 4 deletions cmd/job_test.go
@@ -28,7 +28,7 @@ var testDagGenJob = model.Job{
Type: model.DagGen,
State: model.Ready,
WorkerID: nil,
- AttachmentID: 1,
+ AttachmentID: ptr.Of(model.SourceAttachmentID(1)),
}

func TestDataPrepStartDagGenHandler(t *testing.T) {
@@ -68,7 +68,7 @@ var testScanJob = model.Job{
Type: model.Scan,
State: model.Ready,
WorkerID: nil,
- AttachmentID: 1,
+ AttachmentID: ptr.Of(model.SourceAttachmentID(1)),
}

func TestDataPrepStartScanHandler(t *testing.T) {
@@ -108,7 +108,7 @@ var testPackJob = model.Job{
Type: model.Pack,
State: model.Ready,
WorkerID: nil,
- AttachmentID: 1,
+ AttachmentID: ptr.Of(model.SourceAttachmentID(1)),
}

func TestDataPrepStartPackHandler(t *testing.T) {
@@ -178,7 +178,7 @@ func TestDataPreparationGetStatusHandler(t *testing.T) {
Type: model.Pack,
State: model.Processing,
WorkerID: ptr.Of(uuid.NewString()),
- AttachmentID: 1,
+ AttachmentID: ptr.Of(model.SourceAttachmentID(1)),
},
},
},
2 changes: 1 addition & 1 deletion docs/en/topics/inline-preparation.md
@@ -52,7 +52,7 @@ The metadata API does not return any credentials required to access the data fro

Inline preparation introduces a minimal overhead, primarily in terms of the storage space required. Additionally, computational and bandwidth overhead is also minimal.

- Metadata for each block of data is stored as a database row, requiring 100 bytes for every 1MiB block of prepared data. For a 1PiB dataset, this translates to a requisite of 10TiB of disk space to store the mapping metadata. While this isn't typically an issue, datasets with a large number of small files may result in a significantly higher disk overhead.
+ Metadata for each block of data is stored as a database row, requiring approximately 100 bytes for every 1MiB block of prepared data. For a 1PiB dataset, this translates to approximately 100GiB of disk space to store the mapping metadata. While this isn't typically an issue, datasets with a large number of small files may result in a significantly higher disk overhead.
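(As a rough check of the corrected figure, assuming 100 bytes of metadata per 1 MiB block: 1 PiB ÷ 1 MiB ≈ 1.07 × 10⁹ blocks, and 1.07 × 10⁹ × 100 B ≈ 1.07 × 10¹¹ bytes, or roughly 100 GiB.)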

Later, when CAR files are dynamically regenerated from the original data source, it's necessary to cross-reference these mappings in the database. However, this is generally not a concern. A bandwidth of 1GB/sec equates to 1,000 database entry lookups, which is far from the bottleneck capabilities of all supported database backends. Additionally, future optimizations may further reduce this overhead.

2 changes: 1 addition & 1 deletion docs/jp/topics/inline-preparation.md
@@ -52,7 +52,7 @@ SingularityメタデータAPIは、元のデータソースからCARファイル

インライン準備には、主に必要なストレージ容量に関するわずかなオーバーヘッドが発生します。また、計算および帯域幅のオーバーヘッドもわずかです。

- データの各ブロックのメタデータはデータベースの行として保存され、1MiBのデータブロックごとに100バイトのディスク容量が必要です。1PiBのデータセットの場合、このメタデータマッピングを保存するために10TiBのディスク容量が必要です。これは通常問題ではありませんが、多数の小さいファイルを含むデータセットは、著しく高いディスクオーバーヘッドをもたらす場合があります。
+ データの各ブロックのメタデータはデータベースの行として保存され、1MiBのデータブロックごとに約100バイトのディスク容量が必要です。1PiBのデータセットの場合、このメタデータマッピングを保存するために約100GiBのディスク容量が必要です。これは通常問題ではありませんが、多数の小さいファイルを含むデータセットは、著しく高いディスクオーバーヘッドをもたらす場合があります。

その後、CARファイルが元のデータソースから動的に再生成されると、データベースでこれらのマッピングをクロスリファレンスする必要があります。ただし、これは一般的には懸念されません。1GB/秒の帯域幅は、1,000のデータベースエントリの検索に相当し、これはすべてのサポートされているデータベースバックエンドのボトルネック能力からはるかに少ないです。さらに、将来の最適化により、このオーバーヘッドはさらに削減される可能性があります。

2 changes: 1 addition & 1 deletion docs/kr/topics/inline-preparation.md
@@ -52,7 +52,7 @@ Singularity 메타데이터 API는 원본 데이터 소스에서 CAR 파일을

인라인 준비는 주로 필요한 저장 공간 측면에서 최소한의 오버헤드를 가져옵니다. 또한, 계산 및 대역폭 오버헤드도 최소화됩니다.

- 각 데이터 블록의 메타데이터는 데이터베이스 행으로 저장되며, 준비된 데이터 1MiB 블록 당 100바이트가 필요합니다. 1PiB 데이터 세트의 경우, 이는 매핑 메타데이터를 저장하기 위해 10TiB의 디스크 공간이 필요합니다. 이는 일반적으로 문제가 되지 않지만, 많은 수의 작은 파일이 있는 데이터 세트의 경우 디스크 오버헤드가 상당히 높아질 수 있습니다.
+ 각 데이터 블록의 메타데이터는 데이터베이스 행으로 저장되며, 준비된 데이터 1MiB 블록 당 100바이트가 필요합니다. 1PiB 데이터 세트의 경우, 이는 매핑 메타데이터를 저장하기 위해 약 100GiB의 디스크 공간이 필요합니다. 이는 일반적으로 문제가 되지 않지만, 많은 수의 작은 파일이 있는 데이터 세트의 경우 디스크 오버헤드가 상당히 높아질 수 있습니다.

이후, CAR 파일이 원본 데이터 소스에서 동적으로 재생성될 때 데이터베이스에서 이러한 매핑을 교차 참조하는 것이 필요합니다. 그러나 일반적으로 이는 문제가 되지 않습니다. 대역폭 1GB/초는 1,000개의 데이터베이스 항목 조회를 의미하며, 이는 모든 지원되는 데이터베이스 백엔드의 병목 기능과는 거리가 멉니다. 또한, 향후 최적화로 이러한 오버헤드를 더욱 감소시킬 수 있습니다.

2 changes: 1 addition & 1 deletion docs/zh/topics/inline-preparation.md
@@ -52,7 +52,7 @@ Singularity元数据API将返回一个从原始数据源组装CAR文件的计划

在线数据准备引入了一些极小的开销,主要是存储空间的需求。此外,计算和带宽开销也非常小。

- 每个数据块的元数据以数据库行的形式存储,对于每1 MiB的准备数据,需要100字节来存储映射元数据。对于1 PiB的数据集,需要10 TiB的磁盘空间来存储映射元数据。虽然这通常不是问题,但是具有大量小文件的数据集可能会导致显著的磁盘开销。
+ 每个数据块的元数据以数据库行的形式存储,对于每1 MiB的准备数据,大约需要100字节来存储映射元数据。对于1 PiB的数据集,大约需要100 GiB的磁盘空间来存储映射元数据。虽然这通常不是问题,但是具有大量小文件的数据集可能会导致显著的磁盘开销。

当需要从原始数据源动态重新生成CAR文件时,需要在数据库中交叉引用这些映射。然而,这通常不是一个问题。1 GB/sec的带宽等于1,000次数据库条目查找,远远没有所有支持的数据库后端的能力极限。此外,未来的优化可能进一步减少此开销。

2 changes: 1 addition & 1 deletion handler/dataprep/create.go
@@ -184,7 +184,7 @@ func (DefaultHandler) CreatePreparationHandler(

for _, attachment := range attachments {
err = db.Create(&model.Directory{
- AttachmentID: attachment.ID,
+ AttachmentID: &attachment.ID,
}).Error
if err != nil {
return errors.WithStack(err)