Merged
5 changes: 5 additions & 0 deletions .gitignore
@@ -49,6 +49,11 @@ yarn-error.log*

dist/

# Profiling and logs
*.prof
*.log
*.csv

/test.db*
dump_extracted.sql
dump_postgres_reordered_data_only.sql
10 changes: 5 additions & 5 deletions cmd/dataprep_test.go
@@ -305,7 +305,7 @@ func TestDataPreparationAddPieceHandler(t *testing.T) {
FileSize: 100,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test1.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}, nil)
_, _, err := runner.Run(ctx, "singularity prep add-piece --piece-cid xxx --piece-size 100 --file-size 100 1")
require.NoError(t, err)
@@ -342,7 +342,7 @@ func TestDataPreparationListPiecesHandler(t *testing.T) {
FileSize: 200,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test1.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}, {
ID: 2,
CreatedAt: time.Time{},
@@ -351,7 +351,7 @@ func TestDataPreparationListPiecesHandler(t *testing.T) {
FileSize: 400,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test2.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}},
},
{
@@ -363,7 +363,7 @@ func TestDataPreparationListPiecesHandler(t *testing.T) {
FileSize: 600,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test3.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}, {
ID: 4,
CreatedAt: time.Time{},
@@ -372,7 +372,7 @@ func TestDataPreparationListPiecesHandler(t *testing.T) {
FileSize: 800,
StorageID: ptr.Of(model.StorageID(1)),
StoragePath: "test4.car",
- PreparationID: 1,
+ PreparationID: ptr.Of(model.PreparationID(1)),
}},
},
}, nil)
228 changes: 228 additions & 0 deletions cmd/download_padded_test.go
@@ -0,0 +1,228 @@
package cmd

import (
"context"
"fmt"
"os"
"path/filepath"
"testing"

"github.com/data-preservation-programs/singularity/model"
"github.com/data-preservation-programs/singularity/util/testutil"
"github.com/stretchr/testify/require"
"gorm.io/gorm"
)

// downloadTestBindBase is used to generate unique ports for each test
const downloadTestBindBase = 7779

// TestDownloadPaddedPieces tests downloading padded pieces in three scenarios:
// 1. non-inline + enable-http-piece: content-provider serves CAR file directly from disk
// 2. inline + enable-http-piece: content-provider assembles piece on-the-fly
// 3. inline + metadata-only: downloader assembles piece locally from metadata
func TestDownloadPaddedPieces(t *testing.T) {
tests := []struct {
name string
inline bool
enableHTTPPiece bool
minPieceSize int64
expectedPaddingInfo string
}{
{
name: "non-inline with padding",
inline: false,
enableHTTPPiece: true,
minPieceSize: 1 << 20, // 1 MiB - will force padding
expectedPaddingInfo: "literal zeros in CAR file",
},
{
name: "inline with piece assembly",
inline: true,
enableHTTPPiece: true,
minPieceSize: 1 << 20, // 1 MiB - will force padding
expectedPaddingInfo: "PieceReader serves zeros virtually",
},
{
name: "inline metadata-only",
inline: true,
enableHTTPPiece: false,
minPieceSize: 1 << 20, // 1 MiB - will force padding
expectedPaddingInfo: "downloader assembles with PieceReader",
},
}

for i, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
testutil.All(t, func(ctx context.Context, t *testing.T, db *gorm.DB) {
source := t.TempDir()
// Create a test file that's smaller than minPieceSize to force padding
err := os.WriteFile(filepath.Join(source, "test.txt"), testutil.GenerateFixedBytes(100<<10), 0644) // 100 KiB file
require.NoError(t, err)

output := t.TempDir()
downloadDir := t.TempDir()

// Use unique port for each test
downloadTestBind := fmt.Sprintf("127.0.0.1:%d", downloadTestBindBase+i)

runner := Runner{mode: Normal}
defer runner.Save(t, source, output, downloadDir)

// Create source storage
_, _, err = runner.Run(ctx, fmt.Sprintf("singularity storage create local --name source --path %s", testutil.EscapePath(source)))
require.NoError(t, err)

var outputStorageFlag string
if !tc.inline {
// For non-inline mode, use --local-output
outputStorageFlag = fmt.Sprintf(" --local-output %s", testutil.EscapePath(output))
}

// Create preparation with minPieceSize to force padding
prepName := "test"
inlineFlag := ""
if !tc.inline {
// Only add flag for non-inline mode (inline is default)
inlineFlag = " --no-inline"
}
_, _, err = runner.Run(ctx, fmt.Sprintf("singularity prep create --name %s --max-size 2MB --min-piece-size %d%s --source source%s",
prepName, tc.minPieceSize, inlineFlag, outputStorageFlag))
require.NoError(t, err)

// Start scanning
_, _, err = runner.Run(ctx, fmt.Sprintf("singularity prep start-scan %s source", prepName))
require.NoError(t, err)

// Run dataset worker to pack
_, _, err = runner.Run(ctx, "singularity run dataset-worker --exit-on-complete=true --exit-on-error=true")
require.NoError(t, err)

// List pieces
listPiecesOut, _, err := runner.Run(ctx, fmt.Sprintf("singularity prep list-pieces %s", prepName))
require.NoError(t, err)
pieceCIDs := GetAllPieceCIDs(listPiecesOut)
require.NotEmpty(t, pieceCIDs, "should have at least one piece")

t.Logf("Created %d piece(s) with %s", len(pieceCIDs), tc.expectedPaddingInfo)

// Verify the piece is padded by checking database
var cars []model.Car
err = db.Find(&cars).Error
require.NoError(t, err)
require.NotEmpty(t, cars)

// Find the car that matches our expected criteria
// FileSize should be (127/128) × PieceSize due to Fr32 padding overhead
expectedFileSize := (tc.minPieceSize * 127) / 128
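// For the 1 MiB minimum piece size used by these cases, that works out to
// (1<<20 * 127) / 128 = 1,040,384 bytes of unpadded CAR data.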
var paddedCar *model.Car
for _, car := range cars {
if car.PieceSize == tc.minPieceSize && car.FileSize == expectedFileSize {
carCopy := car
paddedCar = &carCopy
t.Logf("Found padded CAR: PieceCID=%s, PieceSize=%d, FileSize=%d (expected %d), MinPieceSizePadding=%d",
car.PieceCID.String(), car.PieceSize, car.FileSize, expectedFileSize, car.MinPieceSizePadding)

if tc.inline {
require.Greater(t, car.MinPieceSizePadding, int64(0), "inline mode should have virtual padding")
} else {
require.Equal(t, int64(0), car.MinPieceSizePadding, "non-inline mode should have zeros in file")
}
break
}
}
require.NotNil(t, paddedCar, "should have found a padded CAR")

// For non-inline mode, verify the CAR file on disk has correct size
if !tc.inline {
carPath := filepath.Join(output, paddedCar.PieceCID.String()+".car")
fileInfo, err := os.Stat(carPath)
require.NoError(t, err, "CAR file should exist on disk")
t.Logf("CAR file on disk: path=%s, size=%d (should equal piece size %d for Curio TreeD)",
carPath, fileInfo.Size(), paddedCar.PieceSize)
require.Equal(t, paddedCar.FileSize, fileInfo.Size(), "CAR file should be padded to piece size")

// CommP was calculated before padding, so we don't verify it here
// The important verification is that the downloaded piece matches (tested below)
}

// Start content-provider with appropriate flags
contentProviderCtx, cancel := context.WithCancel(ctx)
defer cancel()
contentProviderDone := make(chan struct{})
defer func() { <-contentProviderDone }()

httpPieceFlag := ""
if !tc.enableHTTPPiece {
httpPieceFlag = " --enable-http-piece=false"
}

go func() {
NewRunner().Run(contentProviderCtx, fmt.Sprintf("singularity run content-provider --http-bind %s%s", downloadTestBind, httpPieceFlag))
close(contentProviderDone)
}()

// Wait for content-provider to be ready
err = WaitForServerReady(ctx, fmt.Sprintf("http://%s/health", downloadTestBind))
require.NoError(t, err)

// Download all pieces
for _, pieceCID := range pieceCIDs {
t.Logf("Downloading %s...", pieceCID)

var downloaded []byte
if tc.enableHTTPPiece {
// For scenarios with enable-http-piece, download directly from /piece endpoint
// (content-provider serves the piece directly)
downloaded, err = Download(ctx, fmt.Sprintf("http://%s/piece/%s", downloadTestBind, pieceCID), 10)
require.NoError(t, err)
} else {
// For metadata-only mode, use download command (downloader assembles locally)
_, _, err = runner.Run(ctx, fmt.Sprintf("singularity download --quiet --api http://%s --out-dir %s %s",
downloadTestBind, testutil.EscapePath(downloadDir), pieceCID))
require.NoError(t, err)

// Read downloaded file
downloaded, err = os.ReadFile(filepath.Join(downloadDir, pieceCID+".car"))
require.NoError(t, err)
}

require.NotEmpty(t, downloaded)

// Verify the downloaded piece is the expected size (127/128 × pieceSize)
require.Equal(t, int(expectedFileSize), len(downloaded), "downloaded piece should be padded to (127/128)×pieceSize")

// Verify CommP matches - use the actual piece size from database
calculatedPieceCID := CalculateCommp(t, downloaded, uint64(paddedCar.PieceSize))
if pieceCID != calculatedPieceCID {
t.Logf("CommP mismatch: expected=%s, calculated=%s, downloaded_size=%d, piece_size=%d",
pieceCID, calculatedPieceCID, len(downloaded), paddedCar.PieceSize)
}
require.Equal(t, pieceCID, calculatedPieceCID, "CommP should match")

// Verify file ends with zeros (padding)
foundNonZero := false
paddingStart := len(downloaded) - 1
for paddingStart > 0 && downloaded[paddingStart] == 0 {
paddingStart--
}
paddingStart++ // Move to first zero
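// paddingStart now marks the first byte of the trailing zero run, so
// len(downloaded)-paddingStart is the number of padding bytes checked below.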
require.Greater(t, len(downloaded)-paddingStart, 0, "should have trailing zero padding")

// Verify everything before padding region is not all zeros
for i := 0; i < paddingStart; i++ {
if downloaded[i] != 0 {
foundNonZero = true
break
}
}
require.True(t, foundNonZero, "should have non-zero data before padding")

t.Logf("Successfully downloaded and verified %s: %d bytes with %d bytes of padding",
pieceCID, len(downloaded), len(downloaded)-paddingStart)
}

cancel() // Stop content-provider
})
})
}
}
8 changes: 4 additions & 4 deletions cmd/job_test.go
@@ -28,7 +28,7 @@ var testDagGenJob = model.Job{
Type: model.DagGen,
State: model.Ready,
WorkerID: nil,
- AttachmentID: 1,
+ AttachmentID: ptr.Of(model.SourceAttachmentID(1)),
}

func TestDataPrepStartDagGenHandler(t *testing.T) {
@@ -68,7 +68,7 @@ var testScanJob = model.Job{
Type: model.Scan,
State: model.Ready,
WorkerID: nil,
- AttachmentID: 1,
+ AttachmentID: ptr.Of(model.SourceAttachmentID(1)),
}

func TestDataPrepStartScanHandler(t *testing.T) {
@@ -108,7 +108,7 @@ var testPackJob = model.Job{
Type: model.Pack,
State: model.Ready,
WorkerID: nil,
- AttachmentID: 1,
+ AttachmentID: ptr.Of(model.SourceAttachmentID(1)),
}

func TestDataPrepStartPackHandler(t *testing.T) {
@@ -178,7 +178,7 @@ func TestDataPreparationGetStatusHandler(t *testing.T) {
Type: model.Pack,
State: model.Processing,
WorkerID: ptr.Of(uuid.NewString()),
- AttachmentID: 1,
+ AttachmentID: ptr.Of(model.SourceAttachmentID(1)),
},
},
},
2 changes: 1 addition & 1 deletion docs/en/topics/inline-preparation.md
@@ -52,7 +52,7 @@ The metadata API does not return any credentials required to access the data fro

Inline preparation introduces a minimal overhead, primarily in terms of the storage space required. Additionally, computational and bandwidth overhead is also minimal.

- Metadata for each block of data is stored as a database row, requiring 100 bytes for every 1MiB block of prepared data. For a 1PiB dataset, this translates to a requisite of 10TiB of disk space to store the mapping metadata. While this isn't typically an issue, datasets with a large number of small files may result in a significantly higher disk overhead.
+ Metadata for each block of data is stored as a database row, requiring approximately 100 bytes for every 1MiB block of prepared data. For a 1PiB dataset, this translates to approximately 100GiB of disk space to store the mapping metadata. While this isn't typically an issue, datasets with a large number of small files may result in a significantly higher disk overhead.
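(As a rough check of the corrected figure, assuming 100 bytes of metadata per 1 MiB block: 1 PiB ÷ 1 MiB ≈ 1.07 × 10⁹ blocks, and 1.07 × 10⁹ × 100 B ≈ 1.07 × 10¹¹ bytes, or roughly 100 GiB.)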

Later, when CAR files are dynamically regenerated from the original data source, it's necessary to cross-reference these mappings in the database. However, this is generally not a concern. A bandwidth of 1GB/sec equates to 1,000 database entry lookups, which is far from the bottleneck capabilities of all supported database backends. Additionally, future optimizations may further reduce this overhead.

2 changes: 1 addition & 1 deletion docs/jp/topics/inline-preparation.md
@@ -52,7 +52,7 @@ SingularityメタデータAPIは、元のデータソースからCARファイル

インライン準備には、主に必要なストレージ容量に関するわずかなオーバーヘッドが発生します。また、計算および帯域幅のオーバーヘッドもわずかです。

- データの各ブロックのメタデータはデータベースの行として保存され、1MiBのデータブロックごとに100バイトのディスク容量が必要です。1PiBのデータセットの場合、このメタデータマッピングを保存するために10TiBのディスク容量が必要です。これは通常問題ではありませんが、多数の小さいファイルを含むデータセットは、著しく高いディスクオーバーヘッドをもたらす場合があります。
+ データの各ブロックのメタデータはデータベースの行として保存され、1MiBのデータブロックごとに約100バイトのディスク容量が必要です。1PiBのデータセットの場合、このメタデータマッピングを保存するために約100GiBのディスク容量が必要です。これは通常問題ではありませんが、多数の小さいファイルを含むデータセットは、著しく高いディスクオーバーヘッドをもたらす場合があります。

その後、CARファイルが元のデータソースから動的に再生成されると、データベースでこれらのマッピングをクロスリファレンスする必要があります。ただし、これは一般的には懸念されません。1GB/秒の帯域幅は、1,000のデータベースエントリの検索に相当し、これはすべてのサポートされているデータベースバックエンドのボトルネック能力からはるかに少ないです。さらに、将来の最適化により、このオーバーヘッドはさらに削減される可能性があります。

2 changes: 1 addition & 1 deletion docs/kr/topics/inline-preparation.md
@@ -52,7 +52,7 @@ Singularity 메타데이터 API는 원본 데이터 소스에서 CAR 파일을

인라인 준비는 주로 필요한 저장 공간 측면에서 최소한의 오버헤드를 가져옵니다. 또한, 계산 및 대역폭 오버헤드도 최소화됩니다.

- 각 데이터 블록의 메타데이터는 데이터베이스 행으로 저장되며, 준비된 데이터 1MiB 블록 당 100바이트가 필요합니다. 1PiB 데이터 세트의 경우, 이는 매핑 메타데이터를 저장하기 위해 10TiB의 디스크 공간이 필요합니다. 이는 일반적으로 문제가 되지 않지만, 많은 수의 작은 파일이 있는 데이터 세트의 경우 디스크 오버헤드가 상당히 높아질 수 있습니다.
+ 각 데이터 블록의 메타데이터는 데이터베이스 행으로 저장되며, 준비된 데이터 1MiB 블록 당 100바이트가 필요합니다. 1PiB 데이터 세트의 경우, 이는 매핑 메타데이터를 저장하기 위해 약 100GiB의 디스크 공간이 필요합니다. 이는 일반적으로 문제가 되지 않지만, 많은 수의 작은 파일이 있는 데이터 세트의 경우 디스크 오버헤드가 상당히 높아질 수 있습니다.

이후, CAR 파일이 원본 데이터 소스에서 동적으로 재생성될 때 데이터베이스에서 이러한 매핑을 교차 참조하는 것이 필요합니다. 그러나 일반적으로 이는 문제가 되지 않습니다. 대역폭 1GB/초는 1,000개의 데이터베이스 항목 조회를 의미하며, 이는 모든 지원되는 데이터베이스 백엔드의 병목 기능과는 거리가 멉니다. 또한, 향후 최적화로 이러한 오버헤드를 더욱 감소시킬 수 있습니다.

2 changes: 1 addition & 1 deletion docs/zh/topics/inline-preparation.md
@@ -52,7 +52,7 @@ Singularity元数据API将返回一个从原始数据源组装CAR文件的计划

在线数据准备引入了一些极小的开销,主要是存储空间的需求。此外,计算和带宽开销也非常小。

- 每个数据块的元数据以数据库行的形式存储,对于每1 MiB的准备数据,需要100字节来存储映射元数据。对于1 PiB的数据集,需要10 TiB的磁盘空间来存储映射元数据。虽然这通常不是问题,但是具有大量小文件的数据集可能会导致显著的磁盘开销。
+ 每个数据块的元数据以数据库行的形式存储,对于每1 MiB的准备数据,大约需要100字节来存储映射元数据。对于1 PiB的数据集,大约需要100 GiB的磁盘空间来存储映射元数据。虽然这通常不是问题,但是具有大量小文件的数据集可能会导致显著的磁盘开销。

当需要从原始数据源动态重新生成CAR文件时,需要在数据库中交叉引用这些映射。然而,这通常不是一个问题。1 GB/sec的带宽等于1,000次数据库条目查找,远远没有所有支持的数据库后端的能力极限。此外,未来的优化可能进一步减少此开销。

2 changes: 1 addition & 1 deletion handler/dataprep/create.go
@@ -184,7 +184,7 @@ func (DefaultHandler) CreatePreparationHandler(

for _, attachment := range attachments {
err = db.Create(&model.Directory{
- AttachmentID: attachment.ID,
+ AttachmentID: &attachment.ID,
}).Error
if err != nil {
return errors.WithStack(err)