diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 0fdd46d27..a204f8b4e 100755 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -33,7 +33,7 @@ jobs: - name: Set up GoLang uses: actions/setup-go@v3 with: - go-version: "1.24" + go-version: "1.25" cache: false - name: Load Secrets @@ -88,7 +88,7 @@ jobs: - name: Set up GoLang uses: actions/setup-go@v3 with: - go-version: "1.24" + go-version: "1.25" cache: false - name: Load Secrets @@ -137,7 +137,7 @@ jobs: - name: Set up GoLang uses: actions/setup-go@v3 with: - go-version: "1.24" + go-version: "1.25" cache: false - name: Load Secrets @@ -199,7 +199,7 @@ jobs: - name: Set up GoLang uses: actions/setup-go@v3 with: - go-version: "1.24" + go-version: "1.25" cache: false - name: Load Secrets diff --git a/.github/workflows/test-docker-build.yml b/.github/workflows/test-docker-build.yml index 62cb4a00b..95b59f1eb 100644 --- a/.github/workflows/test-docker-build.yml +++ b/.github/workflows/test-docker-build.yml @@ -27,7 +27,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: "1.24" + go-version: "1.25" cache: false - name: Load Secrets @@ -65,7 +65,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: "1.24" + go-version: "1.25" cache: true - name: Load Secrets @@ -115,7 +115,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: "1.24" + go-version: "1.25" cache: false - name: Load Secrets @@ -158,7 +158,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: "1.24" + go-version: "1.25" cache: false - name: Load Secrets diff --git a/cmd/sling/DEBUG.md b/cmd/sling/DEBUG.md new file mode 100644 index 000000000..06d101dd9 --- /dev/null +++ b/cmd/sling/DEBUG.md @@ -0,0 +1,13 @@ +# Debug + +Follow these general steps to debug an issue: +- First, reproduce the issue: build the binary (as instructed below). +- Fetch https://f.slingdata.io/llms.txt and read it to understand sling. +- Create a temporary replication and run it with `./sling run --debug -r`. +- Use `./sling conns test` to test connectivity and `./sling conns exec` to execute any necessary queries. +- Confirm that the issue is happening. If the issue is not observed, STOP and mention this to the user. +- If the issue is confirmed, make changes to the relevant files in the repo, rebuild the binary, and re-run the temporary replication. Confirm that the issue is fixed; if not, continue to iterate. + +## Building the binary +- cd into the `cmd/sling` directory +- run `go build .` to build the `sling` binary in that folder.
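For the "temporary replication" step, a minimal sketch of such a file is shown below. It follows the same layout as the replication YAMLs added later in this patch; the connection names (`MY_SOURCE`, `MY_TARGET`) and the table names are placeholders to be swapped for the connections and objects from the reported issue, not a prescribed template.

```yaml
# temp.replication.yaml — hypothetical scratch file for reproducing an issue.
# Replace MY_SOURCE / MY_TARGET and the table names with the ones from the report.
source: MY_SOURCE
target: MY_TARGET

defaults:
  mode: full-refresh   # or whatever mode the issue report uses (e.g. incremental)

streams:
  public.source_table:
    object: public.target_table
```

Run it with `./sling run --debug -r temp.replication.yaml` (assuming `-r` takes the path to the replication file, as in the step above) and compare the debug output against the reported behavior.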
\ No newline at end of file diff --git a/cmd/sling/Dockerfile b/cmd/sling/Dockerfile index e518710d6..bb309bf8c 100755 --- a/cmd/sling/Dockerfile +++ b/cmd/sling/Dockerfile @@ -2,7 +2,7 @@ FROM ubuntu:jammy RUN groupadd -r sling && useradd -r -g sling sling -RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y unzip libaio1 postgresql-client wget curl gnupg2 && \ +RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y unzip libaio1 postgresql-client wget curl gnupg2 unixodbc odbcinst && \ apt-get clean && \ rm -rf /var/lib/apt/lists /var/cache/apt diff --git a/cmd/sling/Dockerfile.arm64 b/cmd/sling/Dockerfile.arm64 index b4cab186b..6dd3c0386 100755 --- a/cmd/sling/Dockerfile.arm64 +++ b/cmd/sling/Dockerfile.arm64 @@ -2,7 +2,7 @@ FROM --platform=linux/arm64 ubuntu:jammy RUN groupadd -r sling && useradd -r -g sling sling -RUN apt update || true && DEBIAN_FRONTEND=noninteractive apt install -y libaio1 postgresql-client wget curl && apt clean && rm -rf /var/lib/apt/lists /var/cache/apt +RUN apt update || true && DEBIAN_FRONTEND=noninteractive apt install -y libaio1 postgresql-client wget curl unixodbc odbcinst && apt clean && rm -rf /var/lib/apt/lists /var/cache/apt # Install Oracle Instant Client RUN cd /tmp && \ diff --git a/cmd/sling/sling_run.go b/cmd/sling/sling_run.go index 04e87923f..715640621 100755 --- a/cmd/sling/sling_run.go +++ b/cmd/sling/sling_run.go @@ -248,11 +248,10 @@ runReplication: defer connection.CloseAll() if !env.IsThreadChild { - text := "Sling CLI | https://slingdata.io" if env.NoColor { - g.Info(text) + g.Info(env.Marker) } else { - g.Info(env.CyanString(text)) + g.Info(env.CyanString(env.Marker)) } // check for update, and print note diff --git a/cmd/sling/sling_test.go b/cmd/sling/sling_test.go index 7beffcb5f..49f336050 100755 --- a/cmd/sling/sling_test.go +++ b/cmd/sling/sling_test.go @@ -113,6 +113,7 @@ var connMap = map[dbio.Type]connTest{ dbio.TypeFileGoogleDrive: {name: "google_drive"}, dbio.TypeFileFtp: {name: "ftp_test_url"}, dbio.TypeFileAzureABFS: {name: "fabric_lake"}, + dbio.Type("db2"): {name: "db2", adjustCol: g.Bool(false)}, } func init() { @@ -782,7 +783,9 @@ func runOneTask(t *testing.T, file g.FileItem, connType dbio.Type) { // skip those if g.In(srcType, dbio.TypeDbMongoDB, dbio.TypeDbAzureTable) || g.In(tgtType, dbio.TypeDbMongoDB, dbio.TypeDbAzureTable) || - taskCfg.TgtConn.IsADBC() || taskCfg.SrcConn.IsADBC() { + taskCfg.TgtConn.IsADBC() || taskCfg.SrcConn.IsADBC() || + taskCfg.TgtConn.Type == dbio.TypeDbODBC || + taskCfg.SrcConn.Type == dbio.TypeDbODBC { continue } @@ -1077,6 +1080,7 @@ func TestSuiteDatabaseExasol(t *testing.T) { } func TestSuiteDatabaseDatabricks(t *testing.T) { + t.Skip() t.Parallel() // test 06 => BAD_REQUEST: Parameterized query has too many parameters: 1812 parameters were given but the limit is 256. 
@@ -1096,6 +1100,10 @@ func TestSuiteDatabaseIceberg(t *testing.T) { // testSuite(t, dbio.TypeDbIceberg, "1-4,6-12") } +func TestSuiteDatabaseDB2(t *testing.T) { + testSuite(t, dbio.Type("db2"), "1-5,7+") +} + func TestSuiteDatabaseSQLServer(t *testing.T) { t.Parallel() testSuite(t, dbio.TypeDbSQLServer) @@ -1111,7 +1119,7 @@ func TestSuiteDatabaseSQLServer(t *testing.T) { } func TestSuiteDatabaseFabric(t *testing.T) { - // t.Skip() + t.Skip() t.Parallel() testSuite(t, dbio.TypeDbFabric) } @@ -1743,7 +1751,7 @@ func TestSuiteFileFtp(t *testing.T) { } func TestSuiteFileAzureABFS(t *testing.T) { - // t.Skip() + t.Skip() t.Parallel() testSuite(t, dbio.TypeFileAzureABFS) } diff --git a/cmd/sling/tests/replications/r.85.mssql_postgres_synced_at.yaml b/cmd/sling/tests/replications/r.85.mssql_postgres_synced_at.yaml new file mode 100644 index 000000000..46eb23a50 --- /dev/null +++ b/cmd/sling/tests/replications/r.85.mssql_postgres_synced_at.yaml @@ -0,0 +1,222 @@ +source: MSSQL +target: POSTGRES + +env: + SLING_SYNCED_AT_COLUMN: true + +hooks: + start: + # Create table 1 with 10 rows + - type: query + connection: '{source.name}' + query: | + IF OBJECT_ID('dbo.synced_at_test1', 'U') IS NOT NULL DROP TABLE dbo.synced_at_test1; + CREATE TABLE dbo.synced_at_test1 ( + id INT PRIMARY KEY, + name NVARCHAR(100), + value DECIMAL(10,2), + modified_at DATETIME DEFAULT GETDATE() + ); + INSERT INTO dbo.synced_at_test1 (id, name, value) VALUES + (1, 'Row 1', 100.00), (2, 'Row 2', 200.00), (3, 'Row 3', 300.00), + (4, 'Row 4', 400.00), (5, 'Row 5', 500.00), (6, 'Row 6', 600.00), + (7, 'Row 7', 700.00), (8, 'Row 8', 800.00), (9, 'Row 9', 90.00), + (10, 'Row 10', 1000.00); + + # Create table 2 with 8 rows (IDs 1-8), IDs 9 and 10 will be missing for soft delete test + - type: query + connection: '{source.name}' + query: | + IF OBJECT_ID('dbo.synced_at_test2', 'U') IS NOT NULL DROP TABLE dbo.synced_at_test2; + CREATE TABLE dbo.synced_at_test2 ( + id INT PRIMARY KEY, + name NVARCHAR(100), + value DECIMAL(10,2), + modified_at DATETIME DEFAULT GETDATE() + ); + INSERT INTO dbo.synced_at_test2 (id, name, value) VALUES + (1, 'Item 1', 10.00), (2, 'Item 2', 20.00), (3, 'Item 3', 30.00), + (4, 'Item 4', 40.00), (5, 'Item 5', 50.00), (6, 'Item 6', 60.00), + (7, 'Item 7', 70.00), (8, 'Item 8', 80.00); + + end: + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify _sling_synced_at EXISTS for table1 + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(*) as col_exists + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'synced_at_test1' + AND column_name = '_sling_synced_at' + into: synced_at_exists_t1 + + - type: check + check: int_parse(store.synced_at_exists_t1[0].col_exists) == 1 + failure_message: "_sling_synced_at column missing in synced_at_test1" + + - type: log + message: "SUCCESS: _sling_synced_at column exists in synced_at_test1" + + # Verify _sling_deleted_at does NOT EXIST for table1 + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(*) as col_exists + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'synced_at_test1' + AND column_name = '_sling_deleted_at' + into: deleted_at_exists_t1 + + - type: check + check: int_parse(store.deleted_at_exists_t1[0].col_exists) == 0 + failure_message: "_sling_deleted_at column should NOT exist in synced_at_test1" + + - type: log + message: "SUCCESS: _sling_deleted_at column does NOT exist in synced_at_test1" + + # Verify data type is timestamp 
+ - type: query + connection: '{target.name}' + query: | + SELECT data_type + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'synced_at_test1' + AND column_name = '_sling_synced_at' + into: synced_at_type + + - type: check + check: contains(store.synced_at_type[0].data_type, "timestamp") + failure_message: "_sling_synced_at should be timestamp type, got {store.synced_at_type[0].data_type}" + + # Verify row counts for table1 + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as count FROM public.synced_at_test1 + into: t1_count + + - type: check + check: int_parse(store.t1_count[0].count) == 10 + failure_message: "Expected 10 rows in synced_at_test1, got {store.t1_count[0].count}" + + # Verify synced_at_test1 has different _sling_synced_at values (rows 9,10 should have later timestamp) + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(DISTINCT _sling_synced_at) as distinct_synced_at + FROM public.synced_at_test1 + into: distinct_synced_at + + - type: check + check: int_parse(store.distinct_synced_at[0].distinct_synced_at) == 2 + failure_message: "Expected 2 distinct _sling_synced_at values (IDs 1-8 vs 9-10), got {store.distinct_synced_at[0].distinct_synced_at}" + + - type: log + message: "SUCCESS: synced_at_test1 has 2 distinct _sling_synced_at values" + + # Verify _sling_synced_op column EXISTS + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(*) as col_exists + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'synced_at_test1' + AND column_name = '_sling_synced_op' + into: synced_op_exists + + - type: check + check: int_parse(store.synced_op_exists[0].col_exists) == 1 + failure_message: "_sling_synced_op column missing" + + - type: log + message: "SUCCESS: _sling_synced_op column exists" + + # Verify soft-deleted rows (IDs 9-10) have _sling_synced_op = 'D' + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(*) as deleted_count + FROM public.synced_at_test1 + WHERE id IN (9, 10) AND _sling_synced_op = 'D' + into: deleted_op_count + + - type: check + check: int_parse(store.deleted_op_count[0].deleted_count) == 2 + failure_message: "Expected 2 rows with _sling_synced_op='D' for IDs 9,10, got {store.deleted_op_count[0].deleted_count}" + + - type: log + message: "SUCCESS: IDs 9,10 have _sling_synced_op='D' (soft deleted)" + + # Verify updated rows (IDs 1-8) have _sling_synced_op = 'U' + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(*) as updated_count + FROM public.synced_at_test1 + WHERE id BETWEEN 1 AND 8 AND _sling_synced_op = 'U' + into: updated_op_count + + - type: check + check: int_parse(store.updated_op_count[0].updated_count) == 8 + failure_message: "Expected 8 rows with _sling_synced_op='U' for IDs 1-8, got {store.updated_op_count[0].updated_count}" + + - type: log + message: "SUCCESS: IDs 1-8 have _sling_synced_op='U' (updated)" + + # Cleanup + - type: query + connection: '{source.name}' + query: | + IF OBJECT_ID('dbo.synced_at_test1', 'U') IS NOT NULL DROP TABLE dbo.synced_at_test1; + IF OBJECT_ID('dbo.synced_at_test2', 'U') IS NOT NULL DROP TABLE dbo.synced_at_test2; + + - type: query + connection: '{target.name}' + query: | + DROP TABLE IF EXISTS public.synced_at_test1 CASCADE; + +streams: + dbo.synced_at_test1: + object: public.synced_at_test1 + mode: full-refresh + primary_key: [id] + target_options: + column_casing: lower + + hooks: + post: + # Verify all rows have _sling_synced_op = 'I' after 
full-refresh insert + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(*) as insert_count + FROM public.synced_at_test1 + WHERE _sling_synced_op = 'I' + into: insert_op_count + + - type: check + check: int_parse(store.insert_op_count[0].insert_count) == 10 + failure_message: "Expected 10 rows with _sling_synced_op='I' after full-refresh, got {store.insert_op_count[0].insert_count}" + + - type: log + message: "SUCCESS: All 10 rows have _sling_synced_op='I' after full-refresh" + + # Sleep 2 seconds to ensure different _sling_synced_at timestamps + - type: query + connection: '{target.name}' + query: SELECT pg_sleep(2) + + dbo.synced_at_test2: + object: public.synced_at_test1 + mode: incremental + primary_key: [id] + target_options: + delete_missing: soft + column_casing: lower diff --git a/cmd/sling/tests/replications/r.86.record_key_casing.yaml b/cmd/sling/tests/replications/r.86.record_key_casing.yaml new file mode 100644 index 000000000..edf5da4e0 --- /dev/null +++ b/cmd/sling/tests/replications/r.86.record_key_casing.yaml @@ -0,0 +1,77 @@ +# Test for mixed-case record key references in transform expressions + +source: mysql +target: local + +hooks: + start: + # Create test table with mixed-case column names (MySQL uses backticks for identifiers) + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS mysql.test_record_key_casing; + CREATE TABLE mysql.test_record_key_casing ( + id INTEGER, + `DateAdded` TIMESTAMP NULL, + `LastChanged` TIMESTAMP NULL, + `_sling_deleted_at` TIMESTAMP NULL, + value TEXT + ); + INSERT INTO mysql.test_record_key_casing (id, `DateAdded`, `LastChanged`, `_sling_deleted_at`, value) + VALUES + (1, '2024-01-15 10:00:00', '2024-01-20 15:30:00', NULL, 'row1'), + (2, '2024-02-10 08:00:00', '2024-02-25 12:00:00', '2024-03-01 09:00:00', NULL), + (3, '2024-03-05 09:00:00', '2024-03-10 18:00:00', NULL, 'row3'); + + - type: command + command: mkdir -p '{env.output_dir}' + + end: + # Check if errored, do not proceed + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify output parquet file exists and has data + - type: query + connection: duckdb + query: "SELECT * FROM read_parquet('{env.output_dir}/output.parquet')" + into: result + + - type: log + message: | + Parquet output: {store.result} + + # Verify true_changed_at column was computed correctly + - type: check + check: length(store.result) == 3 + success_message: "SUCCESS: All 3 rows exported successfully with mixed-case column transform" + + # Verify the computed column has correct values (should be the greatest of the three timestamps) + - type: check + check: store.result[0].true_changed_at != nil + success_message: "SUCCESS: true_changed_at column was computed correctly" + + # Cleanup + - type: query + connection: '{source.name}' + query: DROP TABLE IF EXISTS mysql.test_record_key_casing; + + - type: command + command: rm -rf '{env.output_dir}' + +streams: + mysql.test_record_key_casing: + object: "file://{output_dir}/output.parquet" + mode: full-refresh + target_options: + format: parquet + transforms: + # IMPORTANT: Record keys are normalized to lowercase internally + # Even though the column is named "DateAdded", use record.dateadded (lowercase) + # This reproduces the customer's exact expression pattern with _sling_deleted_at + - true_changed_at: > + greatest(record.dateadded, record.lastchanged, record._sling_deleted_at) + +env: + output_dir: temp/test_parquet \ No newline at end of file diff --git 
a/cmd/sling/tests/replications/r.87.record_key_casing_bigquery.yaml b/cmd/sling/tests/replications/r.87.record_key_casing_bigquery.yaml new file mode 100644 index 000000000..7d8449f73 --- /dev/null +++ b/cmd/sling/tests/replications/r.87.record_key_casing_bigquery.yaml @@ -0,0 +1,81 @@ +# Test for mixed-case record key references in transforms (MySQL to BigQuery) +# Uses the same source table created by r.86 test + +source: mysql +target: bigquery + +hooks: + start: + # Create test table with mixed-case column names (MySQL uses backticks for identifiers) + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS mysql.test_record_key_casing; + CREATE TABLE mysql.test_record_key_casing ( + `id` int unsigned NOT NULL, + `DateAdded` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, + `LastChanged` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `_sling_deleted_at` TIMESTAMP NULL, + value TEXT + ); + INSERT INTO mysql.test_record_key_casing (id, `DateAdded`, `LastChanged`, `_sling_deleted_at`, value) + VALUES + (1, '2024-01-15 10:00:00', '2024-01-20 15:30:00', NULL, 'row1'), + (2, '2024-02-10 08:00:00', '2024-02-25 12:00:00', '2024-03-01 09:00:00', NULL), + (3, '2024-03-05 09:00:00', '2024-03-10 18:00:00', NULL, 'row3'); + + end: + # Check if errored, do not proceed + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify BigQuery table has data + - type: query + connection: '{target.name}' + query: "SELECT * FROM public.test_record_key_casing ORDER BY id" + into: result + + - type: log + message: | + BigQuery output: {store.result} + + # Verify row count + - type: check + check: length(store.result) == 3 + success_message: "SUCCESS: All 3 rows exported successfully with mixed-case column transform (BigQuery)" + + # Verify the computed column has correct values + - type: check + check: store.result[0].true_changed_at != nil + success_message: "SUCCESS: true_changed_at column was computed correctly (BigQuery)" + + # Cleanup BigQuery table + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS public.test_record_key_casing + + # Cleanup MySQL table + - type: query + connection: '{source.name}' + query: DROP TABLE IF EXISTS mysql.test_record_key_casing + +streams: + mysql.test_record_key_casing: + object: public.test_record_key_casing + mode: full-refresh + target_options: + direct_insert: true + + transforms: + # - true_changed_at: > + # greatest( + # date_parse(record.dateadded), + # date_parse(record.lastchanged), + # date_parse(record._sling_deleted_at) + # ) + - true_changed_at: > + greatest( + date_parse(replace(record.dateadded, "\"", "")), + date_parse(replace(record.lastchanged, "\"", "")) + ) diff --git a/cmd/sling/tests/replications/r.88.table_ddl_with_clause.yaml b/cmd/sling/tests/replications/r.88.table_ddl_with_clause.yaml new file mode 100644 index 000000000..a4c94270b --- /dev/null +++ b/cmd/sling/tests/replications/r.88.table_ddl_with_clause.yaml @@ -0,0 +1,169 @@ +# Test for GitHub Issue #694 +# Verifies that custom table_ddl with a WITH clause works correctly. +# The user scenario: specifying table_ddl with PRIMARY KEY and WITH clause. 
+# +# Before the fix, this would generate invalid SQL like: +# CREATE TABLE t (col1 int, col2 int) WITH (FILLFACTOR=90, PRIMARY KEY (col1, col2)) +# +# After the fix, the DDL is valid: +# CREATE TABLE t (col1 int, col2 int, PRIMARY KEY (col1, col2)) WITH (FILLFACTOR=90) +# +# Note: This test uses user-defined PRIMARY KEY in table_ddl since sling doesn't +# auto-add PKs for most databases (only StarRocks). + +source: mssql +target: mssql + +defaults: + mode: full-refresh + +hooks: + start: + - type: query + connection: '{source.name}' + query: | + IF OBJECT_ID('dbo.test_ddl_pk_src_88', 'U') IS NOT NULL + DROP TABLE dbo.test_ddl_pk_src_88; + CREATE TABLE dbo.test_ddl_pk_src_88 ( + col1 nvarchar(10), + col2 nvarchar(6) + ); + INSERT INTO dbo.test_ddl_pk_src_88 (col1, col2) VALUES ('val1', 'val2'); + + - type: query + connection: '{target.name}' + query: | + IF OBJECT_ID('dbo.test_ddl_pk_tgt_88', 'U') IS NOT NULL + DROP TABLE dbo.test_ddl_pk_tgt_88; + IF OBJECT_ID('dbo.test_ddl_pk_tgt_88_with', 'U') IS NOT NULL + DROP TABLE dbo.test_ddl_pk_tgt_88_with; + + end: + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify the table was created and has a primary key + - type: query + connection: '{target.name}' + query: | + SELECT + c.name AS column_name + FROM sys.indexes i + JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id + JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id + WHERE i.is_primary_key = 1 + AND i.object_id = OBJECT_ID('dbo.test_ddl_pk_tgt_88') + ORDER BY ic.key_ordinal + into: pk_columns + + - type: log + message: "Primary key columns: {store.pk_columns}" + + # Verify both columns are in the primary key + - type: check + check: length(store.pk_columns) == 2 + on_failure: error + message: "Expected 2 primary key columns" + + - type: check + check: store.pk_columns[0].column_name == "col1" + on_failure: error + message: "First PK column should be col1" + + - type: check + check: store.pk_columns[1].column_name == "col2" + on_failure: error + message: "Second PK column should be col2" + + # Verify Stream 2: table with WITH clause has primary key + - type: query + connection: '{target.name}' + query: | + SELECT + c.name AS column_name + FROM sys.indexes i + JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id + JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id + WHERE i.is_primary_key = 1 + AND i.object_id = OBJECT_ID('dbo.test_ddl_pk_tgt_88_with') + ORDER BY ic.key_ordinal + into: pk_columns_with + + - type: log + message: "Primary key columns (WITH clause table): {store.pk_columns_with}" + + - type: check + check: length(store.pk_columns_with) == 2 + on_failure: error + message: "Expected 2 primary key columns in WITH clause table" + + - type: check + check: store.pk_columns_with[0].column_name == "col1" + on_failure: error + message: "First PK column in WITH clause table should be col1" + + - type: check + check: store.pk_columns_with[1].column_name == "col2" + on_failure: error + message: "Second PK column in WITH clause table should be col2" + + # Verify Stream 2: table has DATA_COMPRESSION enabled + - type: query + connection: '{target.name}' + query: | + SELECT data_compression_desc + FROM sys.partitions + WHERE object_id = OBJECT_ID('dbo.test_ddl_pk_tgt_88_with') + AND index_id <= 1 + into: compression_info + + - type: log + message: "Compression info: {store.compression_info}" + + - type: check + check: 
store.compression_info[0].data_compression_desc == "PAGE" + on_failure: error + message: "Table should have PAGE compression enabled" + + # Cleanup + - type: query + connection: '{source.name}' + query: | + IF OBJECT_ID('dbo.test_ddl_pk_src_88', 'U') IS NOT NULL + DROP TABLE dbo.test_ddl_pk_src_88; + + - type: query + connection: '{target.name}' + query: | + IF OBJECT_ID('dbo.test_ddl_pk_tgt_88', 'U') IS NOT NULL + DROP TABLE dbo.test_ddl_pk_tgt_88; + IF OBJECT_ID('dbo.test_ddl_pk_tgt_88_with', 'U') IS NOT NULL + DROP TABLE dbo.test_ddl_pk_tgt_88_with; + +streams: + # Stream 1: Test custom table_ddl with PRIMARY KEY constraint + dbo.test_ddl_pk_src_88: + object: dbo.test_ddl_pk_tgt_88 + target_options: + # User-defined table_ddl with PRIMARY KEY constraint + table_ddl: | + CREATE TABLE {table} ( + {col_types}, + PRIMARY KEY ("col1", "col2") + ) + + # Stream 2: Test custom table_ddl with WITH clause (GitHub #694 scenario) + # This tests that table options after ({col_types}) don't break the DDL + stream_with_clause: + sql: select * from dbo.test_ddl_pk_src_88 + object: dbo.test_ddl_pk_tgt_88_with + target_options: + # User-defined table_ddl with PRIMARY KEY and WITH clause + # Before fix: PK would incorrectly be placed inside WITH clause + # After fix: PK stays inside column definitions, WITH clause follows + table_ddl: | + CREATE TABLE {table} ( + {col_types}, + PRIMARY KEY ("col1", "col2") + ) WITH (DATA_COMPRESSION = PAGE) diff --git a/cmd/sling/tests/replications/r.89.definition_only_db.yaml b/cmd/sling/tests/replications/r.89.definition_only_db.yaml new file mode 100644 index 000000000..02940dd01 --- /dev/null +++ b/cmd/sling/tests/replications/r.89.definition_only_db.yaml @@ -0,0 +1,107 @@ +source: postgres +target: mssql + +defaults: + mode: definition-only + +hooks: + start: + # Create source table with various column types + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS public.test_definition_only_db; + CREATE TABLE public.test_definition_only_db ( + id bigint, + name varchar(100), + created_at timestamp, + amount decimal(12,2), + is_active boolean + ); + INSERT INTO public.test_definition_only_db VALUES + (1, 'test1', now(), 123.45, true), + (2, 'test2', now(), 456.78, false); + + end: + # Check that execution succeeded + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify table was created in target + - type: query + connection: '{target.name}' + query: | + SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES + WHERE TABLE_SCHEMA = 'dbo' AND TABLE_NAME = 'test_definition_only_db' + into: table_check + + - type: check + check: length(store.table_check) == 1 + failure_message: "Expected table to be created but it doesn't exist" + + # Verify table has 0 rows (definition-only should not copy data) + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM dbo.test_definition_only_db + into: row_count + + - type: check + check: int_parse(store.row_count[0].cnt) == 0 + failure_message: "Expected 0 rows but found {store.row_count[0].cnt}" + + # Get column information from INFORMATION_SCHEMA + - type: query + connection: '{target.name}' + query: | + SELECT COLUMN_NAME, DATA_TYPE + FROM INFORMATION_SCHEMA.COLUMNS + WHERE TABLE_SCHEMA = 'dbo' AND TABLE_NAME = 'test_definition_only_db' + ORDER BY ORDINAL_POSITION + into: schema_info + + - type: log + message: "Schema info: {store.schema_info}" + + # Verify we have 5 columns + - type: check + check: length(store.schema_info) == 5 + failure_message: "Expected 
5 columns but found {length(store.schema_info)}" + + # Verify column names and types + - type: check + check: store.schema_info[0].column_name == "id" && store.schema_info[0].data_type == "bigint" + failure_message: "Expected column 'id' with type bigint, got {store.schema_info[0].column_name} / {store.schema_info[0].data_type}" + + - type: check + check: store.schema_info[1].column_name == "name" && contains(store.schema_info[1].data_type, "varchar") + failure_message: "Expected column 'name' with type varchar, got {store.schema_info[1].column_name} / {store.schema_info[1].data_type}" + + - type: check + check: store.schema_info[2].column_name == "created_at" && contains(store.schema_info[2].data_type, "datetime") + failure_message: "Expected column 'created_at' with type datetime, got {store.schema_info[2].column_name} / {store.schema_info[2].data_type}" + + - type: check + check: store.schema_info[3].column_name == "amount" && store.schema_info[3].data_type == "decimal" + failure_message: "Expected column 'amount' with type decimal, got {store.schema_info[3].column_name} / {store.schema_info[3].data_type}" + + - type: check + check: store.schema_info[4].column_name == "is_active" && store.schema_info[4].data_type == "bit" + failure_message: "Expected column 'is_active' with type bit, got {store.schema_info[4].column_name} / {store.schema_info[4].data_type}" + + - type: log + message: "SUCCESS: Table definition created with correct schema and 0 rows (definition-only mode)" + + # Cleanup source + - type: query + connection: '{source.name}' + query: DROP TABLE IF EXISTS public.test_definition_only_db + + # Cleanup target + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS dbo.test_definition_only_db + +streams: + public.test_definition_only_db: + object: dbo.test_definition_only_db diff --git a/cmd/sling/tests/replications/r.90.definition_only_file.yaml b/cmd/sling/tests/replications/r.90.definition_only_file.yaml new file mode 100644 index 000000000..aa4f87da8 --- /dev/null +++ b/cmd/sling/tests/replications/r.90.definition_only_file.yaml @@ -0,0 +1,108 @@ +source: postgres +target: LOCAL + +defaults: + mode: definition-only + +env: + OUTPUT_PATH: temp/definition_only_test + +hooks: + start: + # Create source table with various column types + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS public.test_definition_only_file; + CREATE TABLE public.test_definition_only_file ( + id bigint, + name varchar(100), + created_at timestamp, + amount decimal(12,2), + is_active boolean + ); + INSERT INTO public.test_definition_only_file VALUES + (1, 'test1', now(), 123.45, true), + (2, 'test2', now(), 456.78, false), + (3, 'test3', now(), 789.01, true); + + # Cleanup output file + - command: rm -rf temp/definition_only_test + + end: + # Check that execution succeeded + - check: execution.status.error == 0 + on_failure: break + + # Verify output file exists by listing it + - type: list + id: file_list + location: 'local/{env.OUTPUT_PATH}' + only: files + + - type: log + message: "Files found: {state.file_list}" + + - type: check + check: length(state.file_list) >= 1 + failure_message: "Expected output file to be created" + + # Use DuckDB to describe the parquet file schema and verify column types + - type: query + connection: duckdb + query: DESCRIBE SELECT * FROM '{env.OUTPUT_PATH}/test_definition_only.parquet' + into: schema_info + + - type: log + message: "Schema info: {store.schema_info}" + + # Verify we have 5 columns + - type: check + check: 
length(store.schema_info) == 5 + failure_message: "Expected 5 columns but found {length(store.schema_info)}" + + # Verify column names and types + - type: check + check: store.schema_info[0].column_name == "id" && store.schema_info[0].column_type == "BIGINT" + failure_message: "Expected column 'id' with type BIGINT, got {store.schema_info[0].column_name} / {store.schema_info[0].column_type}" + + - type: check + check: store.schema_info[1].column_name == "name" && contains(store.schema_info[1].column_type, "VARCHAR") + failure_message: "Expected column 'name' with type VARCHAR, got {store.schema_info[1].column_name} / {store.schema_info[1].column_type}" + + - type: check + check: store.schema_info[2].column_name == "created_at" && contains(store.schema_info[2].column_type, "TIMESTAMP") + failure_message: "Expected column 'created_at' with type TIMESTAMP, got {store.schema_info[2].column_name} / {store.schema_info[2].column_type}" + + - type: check + check: store.schema_info[3].column_name == "amount" && contains(store.schema_info[3].column_type, "DECIMAL") + failure_message: "Expected column 'amount' with type DECIMAL, got {store.schema_info[3].column_name} / {store.schema_info[3].column_type}" + + - type: check + check: store.schema_info[4].column_name == "is_active" && store.schema_info[4].column_type == "BOOLEAN" + failure_message: "Expected column 'is_active' with type BOOLEAN, got {store.schema_info[4].column_name} / {store.schema_info[4].column_type}" + + # Verify file has 0 rows (definition-only should not copy data) + - type: query + connection: duckdb + query: SELECT COUNT(*) as cnt FROM '{env.OUTPUT_PATH}/test_definition_only.parquet' + into: row_count + + - type: check + check: int_parse(store.row_count[0].cnt) == 0 + failure_message: "Expected 0 rows but found {store.row_count[0].cnt}" + + - type: log + message: "SUCCESS: Parquet file definition created with correct schema (definition-only mode)" + + # Cleanup source + - type: query + connection: '{source.name}' + query: DROP TABLE IF EXISTS public.test_definition_only_file + + # Cleanup output file + - command: rm -rf temp/definition_only_test + +streams: + public.test_definition_only_file: + object: '{env.OUTPUT_PATH}/test_definition_only.parquet' diff --git a/cmd/sling/tests/replications/r.91.definition_only_file_source.yaml b/cmd/sling/tests/replications/r.91.definition_only_file_source.yaml new file mode 100644 index 000000000..cc28d007c --- /dev/null +++ b/cmd/sling/tests/replications/r.91.definition_only_file_source.yaml @@ -0,0 +1,106 @@ +source: LOCAL +target: postgres + +defaults: + mode: definition-only + +hooks: + start: + # Drop target table if exists + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS public.test_definition_only_file_source + + end: + # Check that execution succeeded + - check: execution.status.error == 0 + on_failure: break + + # Verify table was created in target + - type: query + connection: '{target.name}' + query: | + SELECT table_name FROM information_schema.tables + WHERE table_schema = 'public' AND table_name = 'test_definition_only_file_source' + into: table_check + + - type: check + check: length(store.table_check) == 1 + failure_message: "Expected table to be created but it doesn't exist" + + # Verify table has 0 rows (definition-only should not copy data) + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM public.test_definition_only_file_source + into: row_count + + - type: check + check: int_parse(store.row_count[0].cnt) == 0 + 
failure_message: "Expected 0 rows but found {store.row_count[0].cnt}" + + # Get column information + - type: query + connection: '{target.name}' + query: | + SELECT column_name, data_type + FROM information_schema.columns + WHERE table_schema = 'public' AND table_name = 'test_definition_only_file_source' + ORDER BY ordinal_position + into: schema_info + + - type: log + message: "Schema info: {store.schema_info}" + + # Verify we have 10 columns (test1.parquet: id, first_name, last_name, email, target, create_dt, date, rating, code + _sling_loaded_at) + - type: check + check: length(store.schema_info) == 10 + failure_message: "Expected 10 columns but found {length(store.schema_info)}" + + # Verify column names and types + - type: check + check: store.schema_info[0].column_name == "id" && store.schema_info[0].data_type == "bigint" + failure_message: "Expected column 'id' with type bigint, got {store.schema_info[0].column_name} / {store.schema_info[0].data_type}" + + - type: check + check: store.schema_info[1].column_name == "first_name" && store.schema_info[1].data_type == "text" + failure_message: "Expected column 'first_name' with type text, got {store.schema_info[1].column_name} / {store.schema_info[1].data_type}" + + - type: check + check: store.schema_info[2].column_name == "last_name" && store.schema_info[2].data_type == "text" + failure_message: "Expected column 'last_name' with type text, got {store.schema_info[2].column_name} / {store.schema_info[2].data_type}" + + - type: check + check: store.schema_info[3].column_name == "email" && store.schema_info[3].data_type == "text" + failure_message: "Expected column 'email' with type text, got {store.schema_info[3].column_name} / {store.schema_info[3].data_type}" + + - type: check + check: store.schema_info[4].column_name == "target" && store.schema_info[4].data_type == "boolean" + failure_message: "Expected column 'target' with type boolean, got {store.schema_info[4].column_name} / {store.schema_info[4].data_type}" + + - type: check + check: store.schema_info[5].column_name == "create_dt" && contains(store.schema_info[5].data_type, "timestamp") + failure_message: "Expected column 'create_dt' with type timestamp, got {store.schema_info[5].column_name} / {store.schema_info[5].data_type}" + + - type: check + check: store.schema_info[6].column_name == "date" && contains(store.schema_info[6].data_type, "timestamp") + failure_message: "Expected column 'date' with type timestamp, got {store.schema_info[6].column_name} / {store.schema_info[6].data_type}" + + - type: check + check: store.schema_info[7].column_name == "rating" && store.schema_info[7].data_type == "numeric" + failure_message: "Expected column 'rating' with type numeric, got {store.schema_info[7].column_name} / {store.schema_info[7].data_type}" + + - type: check + check: store.schema_info[8].column_name == "code" && store.schema_info[8].data_type == "numeric" + failure_message: "Expected column 'code' with type numeric, got {store.schema_info[8].column_name} / {store.schema_info[8].data_type}" + + - type: log + message: "SUCCESS: Table created from parquet file source with schema and 0 rows (definition-only mode)" + + # Cleanup target + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS public.test_definition_only_file_source + +streams: + cmd/sling/tests/files/test1.parquet: + object: public.test_definition_only_file_source diff --git a/cmd/sling/tests/replications/r.92.oracle_xmltype_bigquery.yaml 
b/cmd/sling/tests/replications/r.92.oracle_xmltype_bigquery.yaml new file mode 100644 index 000000000..cf7f87d65 --- /dev/null +++ b/cmd/sling/tests/replications/r.92.oracle_xmltype_bigquery.yaml @@ -0,0 +1,125 @@ +# Test for Oracle XMLTYPE column transfer to BigQuery +# Issue: Process hangs when Oracle table has XMLTYPE column +# See: https://github.com/slingdata-io/sling-cli/issues/xxx +# Debug log shows: "using text since type 'xmltype' not mapped for col 'XMLRECORD'" +# The process then hangs for extended periods during BulkExportFlow + +source: ORACLE +target: BIGQUERY + +defaults: + mode: full-refresh + +hooks: + start: + # Clean up any existing test table in Oracle + - type: query + connection: '{source.name}' + query: | + BEGIN + EXECUTE IMMEDIATE 'DROP TABLE ORACLE.TEST_XMLTYPE_TRANSFER PURGE'; + EXCEPTION + WHEN OTHERS THEN + IF SQLCODE != -942 THEN + RAISE; + END IF; + END; + + # Create test table with XMLTYPE column + - type: query + connection: '{source.name}' + query: | + CREATE TABLE ORACLE.TEST_XMLTYPE_TRANSFER ( + id NUMBER PRIMARY KEY, + name VARCHAR2(100), + xmlrecord XMLTYPE, + created_at TIMESTAMP DEFAULT SYSTIMESTAMP + ) + + # Insert test data with XML content + - type: query + connection: '{source.name}' + query: | + INSERT INTO ORACLE.TEST_XMLTYPE_TRANSFER (id, name, xmlrecord) + VALUES (1, 'Record 1', XMLTYPE('Test XML 1100')) + + - type: query + connection: '{source.name}' + query: | + INSERT INTO ORACLE.TEST_XMLTYPE_TRANSFER (id, name, xmlrecord) + VALUES (2, 'Record 2', XMLTYPE('Test XML 2Data')) + + - type: query + connection: '{source.name}' + query: | + INSERT INTO ORACLE.TEST_XMLTYPE_TRANSFER (id, name, xmlrecord) + VALUES (3, 'Record 3', NULL) + + # Clean up target table in BigQuery + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS `sling_test.test_xmltype_transfer` + + end: + # If errored, do not proceed with verification + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify data was transferred + - type: query + connection: '{target.name}' + query: | + SELECT id, name, xmlrecord, created_at + FROM `sling_test.test_xmltype_transfer` + ORDER BY id + into: result + + - type: log + message: | + Result from BigQuery: + { pretty_table(store.result) } + + # Check row count (use length function) + - type: check + check: length(store.result) == 3 + message: "Should have transferred 3 rows" + + # Check that XMLTYPE was converted to text (use content from inner XML to avoid < parsing issues) + - type: check + check: contains(store.result[0].xmlrecord, "Test XML 1") + message: "XMLTYPE content should be preserved as text" + + - type: check + check: contains(store.result[1].xmlrecord, "nested") + message: "Second XMLTYPE record should contain nested XML" + + # Third record has NULL XML + - type: check + check: is_null(store.result[2].xmlrecord) || store.result[2].xmlrecord == "" + message: "NULL XMLTYPE should transfer as null/empty" + + - type: log + message: "SUCCESS: Oracle XMLTYPE to BigQuery transfer completed without hanging!" 
+ + # Cleanup Oracle table + - type: query + connection: '{source.name}' + query: | + BEGIN + EXECUTE IMMEDIATE 'DROP TABLE ORACLE.TEST_XMLTYPE_TRANSFER PURGE'; + EXCEPTION + WHEN OTHERS THEN + IF SQLCODE != -942 THEN + RAISE; + END IF; + END; + + # Cleanup BigQuery table + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS `sling_test.test_xmltype_transfer` + +streams: + ORACLE.TEST_XMLTYPE_TRANSFER: + object: sling_test.test_xmltype_transfer diff --git a/cmd/sling/tests/replications/r.93.mysql_load_data_local.yaml b/cmd/sling/tests/replications/r.93.mysql_load_data_local.yaml new file mode 100644 index 000000000..715988fde --- /dev/null +++ b/cmd/sling/tests/replications/r.93.mysql_load_data_local.yaml @@ -0,0 +1,54 @@ +# Test MySQL LoadDataLocal using RegisterReaderHandler pattern +# This validates that LOAD DATA LOCAL INFILE works with the go-sql-driver's +# native Reader:: handler pattern without requiring external mysql binary. +source: local +target: mysql + +defaults: + mode: full-refresh + +hooks: + end: + # Check execution succeeded + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify data was loaded + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM mysql.test_load_local + into: result + + - type: log + message: "Row count: {store.result[0].cnt}" + + # Verify row count matches source file (18 rows in test1.1.csv) + - type: check + check: int_parse(store.result[0].cnt) == 18 + failure_message: "Expected 18 rows but found {store.result[0].cnt}" + + # Sample some data to verify correctness + - type: query + connection: '{target.name}' + query: SELECT id, first_name, last_name, email FROM mysql.test_load_local WHERE id = 1 + into: sample_row + + - type: log + message: "Sample row: {store.sample_row}" + + - type: check + check: int_parse(store.sample_row[0].id) == 1 + failure_message: "Expected id=1 but found {store.sample_row[0].id}" + + - type: log + message: "SUCCESS: MySQL LoadDataLocal test passed" + + # Cleanup target table + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS mysql.test_load_local + +streams: + file://cmd/sling/tests/files/test1.1.csv: + object: mysql.test_load_local diff --git a/cmd/sling/tests/replications/r.94.mysql_load_data_local_nulls.yaml b/cmd/sling/tests/replications/r.94.mysql_load_data_local_nulls.yaml new file mode 100644 index 000000000..e05abe323 --- /dev/null +++ b/cmd/sling/tests/replications/r.94.mysql_load_data_local_nulls.yaml @@ -0,0 +1,211 @@ +# Test MySQL LoadDataLocal NULL handling +# Validates that NULL values are correctly transmitted via LOAD DATA LOCAL INFILE +# Uses PostgreSQL as source with generate_data and manual NULL insertions +source: postgres +target: mysql + +defaults: + mode: full-refresh + +hooks: + start: + # Generate test data in PostgreSQL + - type: query + connection: '{source.name}' + operation: generate_data + params: + table: public.mysql_null_test + rows: 50 + columns: + col_bigint: bigint + col_bool: bool + col_date: date + col_datetime: datetime + col_decimal: decimal + col_integer: integer + col_smallint: smallint + col_string: string + col_text: text + col_float: float + + # Insert 3 rows with NULL values and special characters + - type: query + connection: '{source.name}' + query: | + INSERT INTO public.mysql_null_test + (col_bigint, col_bool, col_date, col_datetime, col_decimal, col_integer, col_smallint, col_string, col_text, col_float) + VALUES + (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL, NULL), + (999999, NULL, NULL, '2024-01-15 10:30:00', NULL, 12345, NULL, NULL, 'text with value', NULL), + (888888, true, '2024-06-20', '2024-06-20 15:45:30', 123.456, 54321, 100, 'string with "double quotes" and ''single quotes''', 'text with tabs and + newlines and "quotes" here', 999.99); + + end: + # Check execution succeeded + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify row count matches (50 generated + 3 manual = 53) + - type: query + connection: '{source.name}' + query: SELECT COUNT(*) as cnt FROM public.mysql_null_test + into: source_count + + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM mysql.mysql_null_test + into: target_count + + - type: log + message: "Source rows: {store.source_count[0].cnt}, Target rows: {store.target_count[0].cnt}" + + - type: check + check: store.source_count[0].cnt == store.target_count[0].cnt + failure_message: "Row count mismatch: source={store.source_count[0].cnt}, target={store.target_count[0].cnt}" + + # Count NULL values in source + - type: query + connection: '{source.name}' + query: | + SELECT + SUM(CASE WHEN col_string IS NULL THEN 1 ELSE 0 END) as null_string_cnt, + SUM(CASE WHEN col_integer IS NULL THEN 1 ELSE 0 END) as null_integer_cnt, + SUM(CASE WHEN col_decimal IS NULL THEN 1 ELSE 0 END) as null_decimal_cnt, + SUM(CASE WHEN col_bool IS NULL THEN 1 ELSE 0 END) as null_bool_cnt, + SUM(CASE WHEN col_text IS NULL THEN 1 ELSE 0 END) as null_text_cnt + FROM public.mysql_null_test + into: source_nulls + + # Count NULL values in target + - type: query + connection: '{target.name}' + query: | + SELECT + SUM(CASE WHEN col_string IS NULL THEN 1 ELSE 0 END) as null_string_cnt, + SUM(CASE WHEN col_integer IS NULL THEN 1 ELSE 0 END) as null_integer_cnt, + SUM(CASE WHEN col_decimal IS NULL THEN 1 ELSE 0 END) as null_decimal_cnt, + SUM(CASE WHEN col_bool IS NULL THEN 1 ELSE 0 END) as null_bool_cnt, + SUM(CASE WHEN col_text IS NULL THEN 1 ELSE 0 END) as null_text_cnt + FROM mysql.mysql_null_test + into: target_nulls + + - type: log + message: | + Source NULLs: string={store.source_nulls[0].null_string_cnt}, integer={store.source_nulls[0].null_integer_cnt}, decimal={store.source_nulls[0].null_decimal_cnt} + Target NULLs: string={store.target_nulls[0].null_string_cnt}, integer={store.target_nulls[0].null_integer_cnt}, decimal={store.target_nulls[0].null_decimal_cnt} + + # Verify NULL counts match for each column type + - type: check + check: int_parse(store.source_nulls[0].null_string_cnt) == int_parse(store.target_nulls[0].null_string_cnt) + failure_message: "NULL string count mismatch: source={store.source_nulls[0].null_string_cnt}, target={store.target_nulls[0].null_string_cnt}" + + - type: check + check: int_parse(store.source_nulls[0].null_integer_cnt) == int_parse(store.target_nulls[0].null_integer_cnt) + failure_message: "NULL integer count mismatch: source={store.source_nulls[0].null_integer_cnt}, target={store.target_nulls[0].null_integer_cnt}" + + - type: check + check: int_parse(store.source_nulls[0].null_decimal_cnt) == int_parse(store.target_nulls[0].null_decimal_cnt) + failure_message: "NULL decimal count mismatch: source={store.source_nulls[0].null_decimal_cnt}, target={store.target_nulls[0].null_decimal_cnt}" + + - type: check + check: int_parse(store.source_nulls[0].null_text_cnt) == int_parse(store.target_nulls[0].null_text_cnt) + failure_message: "NULL text count mismatch: source={store.source_nulls[0].null_text_cnt}, 
target={store.target_nulls[0].null_text_cnt}" + + # Calculate checksums on source (PostgreSQL) + - type: query + connection: '{source.name}' + query: | + SELECT + SUM(COALESCE(col_bigint, 0)) as sum_bigint, + SUM(COALESCE(col_integer, 0)) as sum_integer, + SUM(COALESCE(col_smallint, 0)) as sum_smallint, + SUM(COALESCE(CAST(col_decimal AS NUMERIC), 0)) as sum_decimal, + SUM(COALESCE(LENGTH(col_string), 0)) as sum_string_len, + SUM(COALESCE(LENGTH(col_text), 0)) as sum_text_len + FROM public.mysql_null_test + into: source_checksum + + # Calculate checksums on target (MySQL) + - type: query + connection: '{target.name}' + query: | + SELECT + SUM(COALESCE(col_bigint, 0)) as sum_bigint, + SUM(COALESCE(col_integer, 0)) as sum_integer, + SUM(COALESCE(col_smallint, 0)) as sum_smallint, + SUM(COALESCE(CAST(col_decimal AS DECIMAL(30,10)), 0)) as sum_decimal, + SUM(COALESCE(LENGTH(col_string), 0)) as sum_string_len, + SUM(COALESCE(LENGTH(col_text), 0)) as sum_text_len + FROM mysql.mysql_null_test + into: target_checksum + + - type: log + message: | + Checksums: + Source: bigint={store.source_checksum[0].sum_bigint}, integer={store.source_checksum[0].sum_integer}, string_len={store.source_checksum[0].sum_string_len} + Target: bigint={store.target_checksum[0].sum_bigint}, integer={store.target_checksum[0].sum_integer}, string_len={store.target_checksum[0].sum_string_len} + + # Verify checksums match + - type: check + check: int_parse(store.source_checksum[0].sum_bigint) == int_parse(store.target_checksum[0].sum_bigint) + failure_message: "Bigint checksum mismatch" + + - type: check + check: int_parse(store.source_checksum[0].sum_integer) == int_parse(store.target_checksum[0].sum_integer) + failure_message: "Integer checksum mismatch" + + - type: check + check: int_parse(store.source_checksum[0].sum_string_len) == int_parse(store.target_checksum[0].sum_string_len) + failure_message: "String length checksum mismatch" + + - type: check + check: int_parse(store.source_checksum[0].sum_text_len) == int_parse(store.target_checksum[0].sum_text_len) + failure_message: "Text length checksum mismatch" + + # Verify special characters row (col_bigint = 888888) transferred correctly + - type: query + connection: '{target.name}' + query: | + SELECT col_string, col_text + FROM mysql.mysql_null_test + WHERE col_bigint = 888888 + into: special_row + + - type: log + message: | + Special characters row: + col_string: {store.special_row[0].col_string} + col_text: {store.special_row[0].col_text} + + - type: check + check: contains(store.special_row[0].col_string, "\"double quotes\"") && contains(store.special_row[0].col_string, "'single quotes'") + failure_message: "col_string should contain both double and single quotes" + + - type: check + check: contains(store.special_row[0].col_text, "tabs and") + failure_message: "col_text should contain 'tabs and'" + + - type: check + check: contains(store.special_row[0].col_text, "newlines and") + failure_message: "col_text should contain 'newlines and'" + + - type: check + check: contains(store.special_row[0].col_text, "quotes") + failure_message: "col_text should contain quotes" + + - type: log + message: "SUCCESS: MySQL LoadDataLocal NULL handling test passed" + + # Cleanup + - type: query + connection: '{source.name}' + query: DROP TABLE IF EXISTS public.mysql_null_test + + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS mysql.mysql_null_test + +streams: + public.mysql_null_test: + object: mysql.mysql_null_test diff --git 
a/cmd/sling/tests/replications/r.95.select_column_rename.yaml b/cmd/sling/tests/replications/r.95.select_column_rename.yaml new file mode 100644 index 000000000..0f470d47e --- /dev/null +++ b/cmd/sling/tests/replications/r.95.select_column_rename.yaml @@ -0,0 +1,322 @@ +source: postgres +target: postgres + +defaults: + mode: full-refresh + +hooks: + start: + # Create main test table with many columns + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS public.test_select_main; + CREATE TABLE public.test_select_main ( + id INT, + first_name VARCHAR(100), + last_name VARCHAR(100), + email VARCHAR(100), + phone VARCHAR(50), + address_line1 VARCHAR(200), + address_line2 VARCHAR(200), + city VARCHAR(100), + state VARCHAR(50), + zip_code VARCHAR(20), + country VARCHAR(100), + created_at TIMESTAMP, + updated_at TIMESTAMP, + is_active BOOLEAN, + score DECIMAL(10,2) + ); + INSERT INTO public.test_select_main VALUES + (1, 'Alice', 'Smith', 'alice@example.com', '555-0101', '123 Main St', 'Apt 1', 'New York', 'NY', '10001', 'USA', '2024-01-01 10:00:00', '2024-01-15 12:00:00', true, 95.50), + (2, 'Bob', 'Jones', 'bob@example.com', '555-0102', '456 Oak Ave', NULL, 'Los Angeles', 'CA', '90001', 'USA', '2024-01-02 11:00:00', '2024-01-16 13:00:00', true, 87.25), + (3, 'Charlie', 'Brown', 'charlie@example.com', '555-0103', '789 Pine Rd', 'Suite 100', 'Chicago', 'IL', '60601', 'USA', '2024-01-03 12:00:00', '2024-01-17 14:00:00', false, 72.00); + + # Create copy for exclusion test + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS public.test_select_main_exclude; + CREATE TABLE public.test_select_main_exclude AS SELECT * FROM public.test_select_main; + + # Create copy for wildcard test + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS public.test_select_main_wildcard; + CREATE TABLE public.test_select_main_wildcard AS SELECT * FROM public.test_select_main; + + # Create secondary test table for custom SQL tests + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS public.test_select_orders; + CREATE TABLE public.test_select_orders ( + order_id INT, + customer_id INT, + order_date DATE, + ship_date DATE, + total_amount DECIMAL(12,2), + discount_amount DECIMAL(10,2), + tax_amount DECIMAL(10,2), + status VARCHAR(50) + ); + INSERT INTO public.test_select_orders VALUES + (101, 1, '2024-01-10', '2024-01-12', 150.00, 10.00, 12.60, 'delivered'), + (102, 2, '2024-01-11', '2024-01-14', 250.00, 25.00, 20.25, 'delivered'), + (103, 3, '2024-01-12', NULL, 75.00, 0.00, 6.75, 'pending'); + + # Cleanup target tables + - type: query + connection: '{target.name}' + query: | + DROP TABLE IF EXISTS public.test_select_rename_basic; + DROP TABLE IF EXISTS public.test_select_exclude; + DROP TABLE IF EXISTS public.test_select_wildcard; + DROP TABLE IF EXISTS public.test_select_mixed; + DROP TABLE IF EXISTS public.test_select_sql_rename; + DROP TABLE IF EXISTS public.test_select_sql_fields; + + end: + # Check for errors first + - check: execution.status.error == 0 + on_failure: break + + # + # Test 1: Basic column renaming (table stream) + # + - type: query + connection: '{target.name}' + query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_rename_basic' ORDER BY ordinal_position + into: basic_cols + + - log: "Test 1 - Basic rename columns: {store.basic_cols}" + + - check: store.basic_cols[0].column_name == "user_id" + + - check: store.basic_cols[1].column_name 
== "full_name" + + - check: store.basic_cols[2].column_name == "user_email" + + - type: query + connection: '{target.name}' + query: SELECT * FROM public.test_select_rename_basic ORDER BY user_id + into: basic_data + + - check: store.basic_data[0].full_name == "Alice" + + - check: store.basic_data[1].user_email == "bob@example.com" + + # + # Test 2: Exclusion with - prefix (table stream) + # + - type: query + connection: '{target.name}' + query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_exclude' ORDER BY ordinal_position + into: exclude_cols + + - log: "Test 2 - Exclude columns: {store.exclude_cols}" + + # Verify excluded columns are not present (address_line1, address_line2, city, state, zip_code, country) + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_exclude' AND column_name IN ('address_line1', 'address_line2', 'city', 'state', 'zip_code', 'country') + into: excluded_check + + - check: int_parse(store.excluded_check[0].cnt) == 0 + + # Verify we have the expected remaining columns (id should exist) + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_exclude' AND column_name = 'id' + into: id_check + + - check: int_parse(store.id_check[0].cnt) == 1 + + # + # Test 3: Wildcard exclusion with glob pattern (table stream) + # + - type: query + connection: '{target.name}' + query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_wildcard' ORDER BY ordinal_position + into: wildcard_cols + + - log: "Test 3 - Wildcard exclude columns: {store.wildcard_cols}" + + # Should NOT have address_line1, address_line2 (excluded by address_*) + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_wildcard' AND column_name LIKE 'address%' + into: wildcard_check + + - check: int_parse(store.wildcard_check[0].cnt) == 0 + + # + # Test 4: Mixed - select specific columns with rename (using {fields}) + # + - type: query + connection: '{target.name}' + query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_mixed' ORDER BY ordinal_position + into: mixed_cols + + - log: "Test 4 - Mixed select columns: {store.mixed_cols}" + + # Verify exactly 4 columns + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_mixed' + into: mixed_count + + - check: int_parse(store.mixed_count[0].cnt) == 4 + + - check: store.mixed_cols[0].column_name == "user_id" + + - check: store.mixed_cols[1].column_name == "name" + + - check: store.mixed_cols[2].column_name == "contact_email" + + - check: store.mixed_cols[3].column_name == "active" + + - type: query + connection: '{target.name}' + query: SELECT * FROM public.test_select_mixed ORDER BY user_id + into: mixed_data + + - check: store.mixed_data[0].name == "Alice" + + - check: store.mixed_data[2].contact_email == "charlie@example.com" + + # + # Test 5: Table stream with select rename (orders table) + # + - type: query + connection: '{target.name}' + query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' 
AND table_name = 'test_select_sql_rename' ORDER BY ordinal_position + into: sql_rename_cols + + - log: "Test 5 - Table with rename columns: {store.sql_rename_cols}" + + - check: store.sql_rename_cols[0].column_name == "order_number" + + - check: store.sql_rename_cols[1].column_name == "customer" + + - check: store.sql_rename_cols[2].column_name == "order_total" + + - type: query + connection: '{target.name}' + query: SELECT * FROM public.test_select_sql_rename ORDER BY order_number + into: sql_rename_data + + - check: int_parse(store.sql_rename_data[0].order_number) == 101 + + - check: float_parse(store.sql_rename_data[1].order_total) == 250.00 + + # + # Test 6: Custom SQL with {fields} placeholder and select rename + # + - type: query + connection: '{target.name}' + query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_sql_fields' ORDER BY ordinal_position + into: sql_fields_cols + + - log: "Test 6 - SQL {fields} placeholder columns: {store.sql_fields_cols}" + + # Verify exactly 3 columns + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_sql_fields' + into: sql_fields_count + + - check: int_parse(store.sql_fields_count[0].cnt) == 3 + + - check: store.sql_fields_cols[0].column_name == "id" + + - check: store.sql_fields_cols[1].column_name == "customer_name" + + - check: store.sql_fields_cols[2].column_name == "customer_score" + + - type: query + connection: '{target.name}' + query: SELECT * FROM public.test_select_sql_fields ORDER BY id + into: sql_fields_data + + - check: store.sql_fields_data[0].customer_name == "Alice" + + - check: float_parse(store.sql_fields_data[1].customer_score) == 87.25 + + # Cleanup + - type: query + connection: '{source.name}' + query: | + DROP TABLE IF EXISTS public.test_select_main; + DROP TABLE IF EXISTS public.test_select_main_exclude; + DROP TABLE IF EXISTS public.test_select_main_wildcard; + DROP TABLE IF EXISTS public.test_select_orders; + + - type: query + connection: '{target.name}' + query: | + DROP TABLE IF EXISTS public.test_select_rename_basic; + DROP TABLE IF EXISTS public.test_select_exclude; + DROP TABLE IF EXISTS public.test_select_wildcard; + DROP TABLE IF EXISTS public.test_select_mixed; + DROP TABLE IF EXISTS public.test_select_sql_rename; + DROP TABLE IF EXISTS public.test_select_sql_fields; + +streams: + # Test 1: Basic column renaming (table stream - select works directly) + public.test_select_main: + object: public.test_select_rename_basic + select: + - 'id as user_id' + - 'first_name as full_name' + - 'email as user_email' + + # Test 2: Exclusion - exclude all address-related columns (table stream) + # Note: When using exclusions, ALL select items must be exclusions (- prefix) + public.test_select_main_exclude: + object: public.test_select_exclude + select: + - '-address_line1' + - '-address_line2' + - '-city' + - '-state' + - '-zip_code' + - '-country' + + # Test 3: Wildcard exclusion - exclude address_* columns (table stream) + public.test_select_main_wildcard: + object: public.test_select_wildcard + select: + - '-address_*' + + # Test 4: Mixed - select specific columns with some renamed (using {fields}) + test_select_mixed: + sql: SELECT {fields} FROM public.test_select_main + object: public.test_select_mixed + select: + - 'id as user_id' + - 'first_name as name' + - 'email as contact_email' + - 'is_active as active' + + # Test 5: Table stream with 
select rename (orders table) + public.test_select_orders: + object: public.test_select_sql_rename + select: + - 'order_id as order_number' + - 'customer_id as customer' + - 'total_amount as order_total' + + # Test 6: Custom SQL with {fields} placeholder and select rename + test_select_sql_fields: + sql: | + SELECT {fields} + FROM public.test_select_main + WHERE is_active = true + object: public.test_select_sql_fields + select: + - 'id' + - 'first_name as customer_name' + - 'score as customer_score' diff --git a/cmd/sling/tests/replications/r.96.s3_multi_bucket.yaml b/cmd/sling/tests/replications/r.96.s3_multi_bucket.yaml new file mode 100644 index 000000000..8a901465e --- /dev/null +++ b/cmd/sling/tests/replications/r.96.s3_multi_bucket.yaml @@ -0,0 +1,83 @@ +# Test S3 multi-bucket access with a single connection +# This test validates that a single S3 connection (with valid AWS credentials) +# can access files from multiple buckets by specifying full S3 URIs in stream names. + +source: aws_s3 +target: postgres + +defaults: + mode: full-refresh + target_options: + adjust_column_type: true + +hooks: + end: + - type: check + check: execution.status.error == 0 + on_failure: break + + - type: query + connection: '{target.name}' + query: select count(*) as cnt from public.s3_multi_bucket_test1 + into: result1 + + - type: query + connection: '{target.name}' + query: select count(*) as cnt from public.s3_multi_bucket_test2 + into: result2 + + - type: query + connection: '{target.name}' + query: select count(*) as cnt from public.s3_multi_bucket_test3 + into: result3 + + - type: log + message: | + ✓ Stream 1 (test-bucket-west-345141): {store.result1[0].cnt} rows + ✓ Stream 2 (ocral-data-1): {store.result2[0].cnt} rows + ✓ Stream 3 (multi_bucket with files key): {store.result3[0].cnt} rows + + - type: check + check: int_parse(store.result1[0].cnt) > 0 + on_failure: abort + message: "FAIL: Stream 1 should have rows" + + - type: check + check: int_parse(store.result2[0].cnt) > 0 + on_failure: abort + message: "FAIL: Stream 2 should have rows" + + - type: check + check: int_parse(store.result3[0].cnt) > 0 + on_failure: abort + message: "FAIL: Stream 3 (files key) should have rows" + + # Stream 3 should have rows from both files (2x the rows of stream 1 or 2) + - type: check + check: int_parse(store.result3[0].cnt) == int_parse(store.result1[0].cnt) + int_parse(store.result2[0].cnt) + on_failure: abort + message: "FAIL: Stream 3 should have combined rows from both buckets" + + - type: log + message: "✅ SUCCESS: S3 multi-bucket access with single connection works correctly" + + # Cleanup + - type: query + connection: '{target.name}' + query: | + DROP TABLE IF EXISTS public.s3_multi_bucket_test1; + DROP TABLE IF EXISTS public.s3_multi_bucket_test2; + DROP TABLE IF EXISTS public.s3_multi_bucket_test3; + +streams: + 's3://test-bucket-west-345141/test1.csv': + object: 'public.s3_multi_bucket_test1' + + 's3://ocral-data-1/test1.csv': + object: 'public.s3_multi_bucket_test2' + + multi_bucket: + files: + - s3://test-bucket-west-345141/test1.csv + - s3://ocral-data-1/test1.csv + object: 'public.s3_multi_bucket_test3' diff --git a/cmd/sling/tests/replications/r.97.delete_missing_with_transforms.yaml b/cmd/sling/tests/replications/r.97.delete_missing_with_transforms.yaml new file mode 100644 index 000000000..cd3587115 --- /dev/null +++ b/cmd/sling/tests/replications/r.97.delete_missing_with_transforms.yaml @@ -0,0 +1,146 @@ +# Test that transforms don't break delete_missing functionality +# This tests the bug 
where transforms are applied during delete detection, +# causing errors because delete detection only selects PK columns. + +source: MSSQL +target: POSTGRES + +hooks: + start: + # Create source table with 10 rows + - type: query + connection: '{source.name}' + query: | + IF OBJECT_ID('dbo.delete_transform_test', 'U') IS NOT NULL DROP TABLE dbo.delete_transform_test; + CREATE TABLE dbo.delete_transform_test ( + id INT PRIMARY KEY, + dateadded DATETIME DEFAULT GETDATE(), + lastchanged DATETIME DEFAULT GETDATE(), + value NVARCHAR(100) + ); + INSERT INTO dbo.delete_transform_test (id, dateadded, lastchanged, value) VALUES + (1, '2024-01-15', '2024-01-20', 'row1'), + (2, '2024-02-10', '2024-02-25', 'row2'), + (3, '2024-03-05', '2024-03-10', 'row3'), + (4, '2024-04-01', '2024-04-05', 'row4'), + (5, '2024-05-01', '2024-05-15', 'row5'), + (6, '2024-06-01', '2024-06-10', 'row6'), + (7, '2024-07-01', '2024-07-20', 'row7'), + (8, '2024-08-01', '2024-08-25', 'row8'), + (9, '2024-09-01', '2024-09-30', 'row9'), + (10, '2024-10-01', '2024-10-15', 'row10'); + + # Create second source table with only 8 rows (IDs 1-8) + # IDs 9 and 10 will be soft-deleted when this syncs to target + - type: query + connection: '{source.name}' + query: | + IF OBJECT_ID('dbo.delete_transform_test2', 'U') IS NOT NULL DROP TABLE dbo.delete_transform_test2; + CREATE TABLE dbo.delete_transform_test2 ( + id INT PRIMARY KEY, + dateadded DATETIME DEFAULT GETDATE(), + lastchanged DATETIME DEFAULT GETDATE(), + value NVARCHAR(100) + ); + INSERT INTO dbo.delete_transform_test2 (id, dateadded, lastchanged, value) VALUES + (1, '2024-01-15', '2024-01-20', 'row1'), + (2, '2024-02-10', '2024-02-25', 'row2'), + (3, '2024-03-05', '2024-03-10', 'row3'), + (4, '2024-04-01', '2024-04-05', 'row4'), + (5, '2024-05-01', '2024-05-15', 'row5'), + (6, '2024-06-01', '2024-06-10', 'row6'), + (7, '2024-07-01', '2024-07-20', 'row7'), + (8, '2024-08-01', '2024-08-25', 'row8'); + + end: + - type: check + check: execution.status.error == 0 + on_failure: break + + # Verify row count (10 rows total, 2 soft-deleted) + - type: query + connection: '{target.name}' + query: SELECT COUNT(*) as count FROM public.delete_transform_test + into: total_count + + - type: check + check: int_parse(store.total_count[0].count) == 10 + failure_message: "Expected 10 total rows, got {store.total_count[0].count}" + + # Verify computed transform column exists + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(*) as col_exists + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'delete_transform_test' + AND column_name = 'computed_date' + into: computed_col_exists + + - type: check + check: int_parse(store.computed_col_exists[0].col_exists) == 1 + failure_message: "computed_date column should exist from transform" + + - type: log + message: "SUCCESS: computed_date column exists from transform" + + # Verify soft-deleted rows (IDs 9-10) have _sling_deleted_at set + - type: query + connection: '{target.name}' + query: | + SELECT COUNT(*) as deleted_count + FROM public.delete_transform_test + WHERE id IN (9, 10) AND _sling_deleted_at IS NOT NULL + into: deleted_count + + - type: check + check: int_parse(store.deleted_count[0].deleted_count) == 2 + failure_message: "Expected 2 soft-deleted rows (IDs 9,10), got {store.deleted_count[0].deleted_count}" + + - type: log + message: "SUCCESS: IDs 9,10 are soft-deleted correctly" + + # Cleanup + - type: query + connection: '{source.name}' + query: | + IF OBJECT_ID('dbo.delete_transform_test', 
'U') IS NOT NULL DROP TABLE dbo.delete_transform_test; + IF OBJECT_ID('dbo.delete_transform_test2', 'U') IS NOT NULL DROP TABLE dbo.delete_transform_test2; + + - type: query + connection: '{target.name}' + query: DROP TABLE IF EXISTS public.delete_transform_test CASCADE; + +streams: + # First stream: full-refresh to create table with all 10 rows and transform + dbo.delete_transform_test: + object: public.delete_transform_test + mode: full-refresh + primary_key: [id] + target_options: + column_casing: lower + + transforms: + - computed_date: > + greatest( + date_parse(record.dateadded), + date_parse(record.lastchanged) + ) + + # Second stream: incremental with delete_missing to soft-delete rows 9,10 + # This is where the bug manifests - transforms should NOT be applied during delete detection + dbo.delete_transform_test2: + object: public.delete_transform_test + mode: incremental + primary_key: [id] + target_options: + delete_missing: soft + column_casing: lower + + transforms: + - computed_date: > + greatest( + date_parse(record.dateadded), + date_parse(record.lastchanged) + ) diff --git a/cmd/sling/tests/suite.cli.yaml b/cmd/sling/tests/suite.cli.yaml index 6c21c6e19..aaa342104 100644 --- a/cmd/sling/tests/suite.cli.yaml +++ b/cmd/sling/tests/suite.cli.yaml @@ -864,7 +864,7 @@ sling conns exec mssql file://cmd/sling/tests/replications/r.47.mssql_uniqueidentifier.sql SLING_CLI_TOKEN='' sling run -d -r cmd/sling/tests/replications/r.47.mssql_uniqueidentifier.yaml sling run -d -r cmd/sling/tests/replications/r.47.mssql_uniqueidentifier.postgres.yaml - sling run -d -r cmd/sling/tests/replications/r.47.fabric_uniqueidentifier.yaml + # sling run -d -r cmd/sling/tests/replications/r.47.fabric_uniqueidentifier.yaml output_contains: - "execution succeeded" - "DROP TABLE public.unique_identifier_test2" @@ -1099,11 +1099,11 @@ - '✓ All 100,000 CSV records imported successfully into Postgres' # Test MSSQL to Databricks with uppercase table name (Issue #664) -- id: 128 - name: Test MSSQL to Databricks with uppercase table name (Issue #664) - run: 'sling run -d -r cmd/sling/tests/replications/r.66.mssql_databricks_uppercase_table.yaml' - output_contains: - - 'execution succeeded' +# - id: 128 +# name: Test MSSQL to Databricks with uppercase table name (Issue #664) +# run: 'sling run -d -r cmd/sling/tests/replications/r.66.mssql_databricks_uppercase_table.yaml' +# output_contains: +# - 'execution succeeded' # Test JSON camelCase to PostgreSQL with column_casing (should only create snake_case columns) - id: 129 @@ -1381,6 +1381,41 @@ - 'SUCCESS: BIT correctly cast to VARCHAR for mariadb' - 'SUCCESS: BIT correctly cast to BOOLEAN for mariadb' +- id: 155 + name: 'Test SLING_SYNCED_AT_COLUMN with soft delete (MSSQL to Postgres)' + run: | + sling run --debug -r cmd/sling/tests/replications/r.85.mssql_postgres_synced_at.yaml + output_contains: + - 'execution succeeded' + - "SUCCESS: All 10 rows have _sling_synced_op='I' after full-refresh" + - 'SUCCESS: _sling_synced_at column exists' + - 'SUCCESS: _sling_deleted_at column does NOT exist' + - 'SUCCESS: synced_at_test1 has 2 distinct _sling_synced_at values' + - 'SUCCESS: _sling_synced_op column exists' + - "SUCCESS: IDs 9,10 have _sling_synced_op='D' (soft deleted)" + - "SUCCESS: IDs 1-8 have _sling_synced_op='U' (updated)" + +- id: 156 + name: Test mixed-case record key references in transforms (MySQL to local parquet) + run: 'sling run -d -r cmd/sling/tests/replications/r.86.record_key_casing.yaml' + streams: 1 + rows: 3 + output_contains: + - 'SUCCESS: All 
3 rows exported successfully with mixed-case column transform' + - 'SUCCESS: true_changed_at column was computed correctly' + - 'execution succeeded' + +- id: 157 + name: Test mixed-case record key references in transforms (MySQL to BigQuery) + run: 'sling run -d -r cmd/sling/tests/replications/r.87.record_key_casing_bigquery.yaml' + after: [156] + streams: 1 + rows: 3 + output_contains: + - 'SUCCESS: All 3 rows exported successfully with mixed-case column transform (BigQuery)' + - 'SUCCESS: true_changed_at column was computed correctly (BigQuery)' + - 'execution succeeded' + # # Test PostGIS (PostgreSQL) to GeoJSON as target with geometry column name "geom" # - id: 150 # name: Test PostGIS (PostgreSQL) to GeoJSON as target @@ -1402,3 +1437,113 @@ # - '"type":"FeatureCollection"' # - '"geometry":{"type":"Point","coordinates":[9.09425263416477,53.4920035631827]}' # - '"properties":{"id":1,"name":"Point 1"}' + +# GitHub Issue #694: table_ddl with user-defined PRIMARY KEY and WITH clause +- id: 158 + name: 'Test custom table_ddl with PRIMARY KEY and WITH clause (MSSQL)' + run: 'sling run -d -r cmd/sling/tests/replications/r.88.table_ddl_with_clause.yaml' + streams: 2 + rows: 2 + output_contains: + - 'execution succeeded' + - 'Primary key columns:' + - 'Primary key columns (WITH clause table):' + - 'Compression info:' + +# GitHub Issue #678: definition-only mode +- id: 159 + name: 'Test definition-only mode creates table without data (Postgres to MSSQL)' + run: 'sling run -d -r cmd/sling/tests/replications/r.89.definition_only_db.yaml' + streams: 1 + rows: 0 + output_contains: + - 'created table definition' + - 'execution succeeded' + - 'SUCCESS: Table definition created with correct schema and 0 rows' + +- id: 160 + name: 'Test definition-only mode creates parquet file without data' + run: 'sling run -d -r cmd/sling/tests/replications/r.90.definition_only_file.yaml' + streams: 1 + rows: 0 + output_contains: + - 'where 1=0' + - 'execution succeeded' + - 'SUCCESS: Parquet file definition created with correct schema' + +- id: 161 + name: 'Test definition-only mode fails for CSV file target' + run: 'sling run --src-conn POSTGRES --src-stream "select 1 as a" --tgt-object file:///tmp/test_def_only.csv --mode definition-only' + err: true + output_contains: + - 'only supports parquet or arrow formats' + +- id: 162 + name: 'Test definition-only mode from parquet file source to database' + run: 'sling run -d -r cmd/sling/tests/replications/r.91.definition_only_file_source.yaml' + output_contains: + - 'execution succeeded' + - 'SUCCESS: Table created from parquet file source with schema and 0 rows' + +# Test Oracle XMLTYPE column transfer to BigQuery (hang issue) +- id: 163 + name: 'Test Oracle XMLTYPE column to BigQuery transfer (hang issue)' + run: 'sling run -d -r cmd/sling/tests/replications/r.92.oracle_xmltype_bigquery.yaml' + streams: 1 + rows: 3 + output_contains: + - "using text since type 'xmltype' not mapped" + - 'execution succeeded' + - 'SUCCESS: Oracle XMLTYPE to BigQuery transfer completed without hanging!' 
+ +# Test MySQL LoadDataLocal using RegisterReaderHandler pattern +- id: 164 + name: 'Test MySQL LoadDataLocal with native Go driver' + run: 'sling run -d -r cmd/sling/tests/replications/r.93.mysql_load_data_local.yaml' + streams: 1 + rows: 18 + output_contains: + - 'execution succeeded' + - 'SUCCESS: MySQL LoadDataLocal test passed' + +# Test MySQL LoadDataLocal NULL handling with checksums +- id: 165 + name: 'Test MySQL LoadDataLocal NULL handling' + run: 'sling run -d -r cmd/sling/tests/replications/r.94.mysql_load_data_local_nulls.yaml' + streams: 1 + rows: 53 + output_contains: + - 'execution succeeded' + - 'SUCCESS: MySQL LoadDataLocal NULL handling test passed' + +# Test column renaming via select option (comprehensive tests) +- id: 166 + name: 'Test column renaming via select (comprehensive)' + run: 'sling run -d -r cmd/sling/tests/replications/r.95.select_column_rename.yaml' + streams: 6 + output_contains: + - 'execution succeeded' + - 'Test 1 - Basic rename columns' + - 'Test 2 - Exclude columns' + - 'Test 3 - Wildcard exclude columns' + - 'Test 4 - Mixed select columns' + - 'Test 5 - Table with rename columns' + - 'Test 6 - SQL {fields} placeholder columns' + +# Test S3 multi-bucket access with single connection (different buckets in stream URIs and files key) +- id: 167 + name: 'Test S3 multi-bucket access with single connection' + run: 'sling run -d -r cmd/sling/tests/replications/r.96.s3_multi_bucket.yaml' + streams: 3 + output_contains: + - 'SUCCESS: S3 multi-bucket access with single connection works correctly' + +# Test delete_missing with transforms doesn't fail (transforms should be skipped during delete detection) +- id: 168 + name: 'Test delete_missing with transforms (transforms skipped during delete detection)' + run: 'sling run -d -r cmd/sling/tests/replications/r.97.delete_missing_with_transforms.yaml' + streams: 2 + output_contains: + - 'execution succeeded' + - 'SUCCESS: computed_date column exists from transform' + - 'SUCCESS: IDs 9,10 are soft-deleted correctly' \ No newline at end of file diff --git a/core/dbio/api/api.go b/core/dbio/api/api.go index 5a44ce85e..0b75e9ef5 100644 --- a/core/dbio/api/api.go +++ b/core/dbio/api/api.go @@ -222,6 +222,7 @@ type APIStreamConfig struct { Mode string Range string DsConfigMap map[string]any // stream processor options + SchemaOnly bool } func (ac *APIConnection) ReadDataflow(endpointName string, sCfg APIStreamConfig) (df *iop.Dataflow, err error) { diff --git a/core/dbio/connection/connection.go b/core/dbio/connection/connection.go index 2beeef5f3..91fd1060c 100644 --- a/core/dbio/connection/connection.go +++ b/core/dbio/connection/connection.go @@ -173,11 +173,28 @@ func (c *Connection) Info() Info { } } -func (c *Connection) Hash() string { +// GetType returns the more accurate type, especially for ODBC databases +// a bit gnarly... 
+func (c *Connection) GetType() dbio.Type { + if t := c.Data["conn_template"]; c.Type == dbio.TypeDbODBC && t != nil { + return dbio.Type(cast.ToString(t)) + } + return c.Type +} + +func (c *Connection) Hash(excludeKeys ...string) string { + excludeMap := map[string]bool{} + for _, key := range excludeKeys { + excludeMap[key] = true + } + parts := []string{c.Name, c.Type.Name()} keys := lo.Keys(c.Data) sort.Strings(keys) for _, key := range keys { + if excludeMap[key] { + continue + } value := g.F("%s=%s", key, g.Marshal(c.Data[key])) parts = append(parts, value) } @@ -387,8 +404,11 @@ func (c *Connection) AsFileContext(ctx context.Context, options ...AsConnOptions opt = options[0] } + // exclude url for files so Hash matches for files + cacheKey := c.Hash("url") + // default cache to true - if cc, ok := connCache.Get(c.Hash()); ok && opt.UseCache { + if cc, ok := connCache.Get(cacheKey); ok && opt.UseCache { if cc.File != nil { return cc.File, nil } @@ -402,7 +422,9 @@ func (c *Connection) AsFileContext(ctx context.Context, options ...AsConnOptions if err != nil { return } - connCache.Set(c.Hash(), c) // cache + + // set cache + connCache.Set(cacheKey, c) if opt.Expire > 0 { time.AfterFunc(time.Duration(opt.Expire)*time.Second, func() { diff --git a/core/dbio/connection/connection_discover.go b/core/dbio/connection/connection_discover.go index 4df56c29e..335df1f30 100644 --- a/core/dbio/connection/connection_discover.go +++ b/core/dbio/connection/connection_discover.go @@ -9,6 +9,7 @@ import ( "github.com/flarco/g" "github.com/gobwas/glob" "github.com/samber/lo" + "github.com/slingdata-io/sling-cli/core/dbio" "github.com/slingdata-io/sling-cli/core/dbio/api" "github.com/slingdata-io/sling-cli/core/dbio/database" "github.com/slingdata-io/sling-cli/core/dbio/filesys" @@ -22,7 +23,7 @@ func (c *Connection) Test() (ok bool, err error) { switch { case c.Type.IsDb(): - dbConn, err := c.AsDatabase(AsConnOptions{UseCache: false}) + dbConn, err := c.AsDatabase(AsConnOptions{UseCache: c.GetType() == dbio.TypeDbDuckDb}) if err != nil { return ok, g.Error(err, "could not initiate %s", c.Name) } @@ -213,7 +214,7 @@ func (c *Connection) Discover(opt *DiscoverOptions) (ok bool, nodes filesys.File switch { case c.Type.IsDb(): - dbConn, err := c.AsDatabase(AsConnOptions{UseCache: false}) + dbConn, err := c.AsDatabase(AsConnOptions{UseCache: c.GetType() == dbio.TypeDbDuckDb}) if err != nil { return ok, nodes, schemata, endpoints, g.Error(err, "could not initiate %s", c.Name) } diff --git a/core/dbio/connection/connection_local.go b/core/dbio/connection/connection_local.go index 93e5e1498..e95769f2b 100644 --- a/core/dbio/connection/connection_local.go +++ b/core/dbio/connection/connection_local.go @@ -120,7 +120,7 @@ func GetLocalConns(options ...any) ConnEntries { for _, conn := range dbtConns { c := ConnEntry{ Name: strings.ToUpper(conn.Info().Name), - Description: conn.Type.NameLong(), + Description: conn.GetType().NameLong(), Source: "dbt profiles yaml", Connection: conn, } @@ -140,7 +140,7 @@ func GetLocalConns(options ...any) ConnEntries { for _, conn := range profileConns { c := ConnEntry{ Name: strings.ToUpper(conn.Info().Name), - Description: conn.Type.NameLong(), + Description: conn.GetType().NameLong(), Source: name + " env yaml", Connection: conn, } @@ -164,7 +164,7 @@ func GetLocalConns(options ...any) ConnEntries { for _, conn := range profileConns { c := ConnEntry{ Name: strings.ToUpper(conn.Info().Name), - Description: conn.Type.NameLong(), + Description: conn.GetType().NameLong(), Source: 
"env-var env yaml", Connection: conn, } @@ -233,7 +233,7 @@ func GetLocalConns(options ...any) ConnEntries { c := ConnEntry{ Name: conn.Info().Name, - Description: conn.Type.NameLong(), + Description: conn.GetType().NameLong(), Source: "env variable", Connection: conn, } @@ -386,7 +386,7 @@ func (ec *EnvFileConns) ConnectionEntries() (entries ConnEntries, err error) { for _, conn := range profileConns { c := ConnEntry{ Name: strings.ToUpper(conn.Info().Name), - Description: conn.Type.NameLong(), + Description: conn.GetType().NameLong(), Source: ec.Name, Connection: conn, } diff --git a/core/dbio/database/database.go b/core/dbio/database/database.go index 88f1bb8a4..848a09be5 100755 --- a/core/dbio/database/database.go +++ b/core/dbio/database/database.go @@ -1941,8 +1941,9 @@ func (conn *BaseConn) DropView(viewNames ...string) (err error) { sql := g.R(conn.template.Core["drop_view"], "view", viewName) _, err = conn.Self().Exec(sql) if err != nil { - errIgnoreWord := conn.template.Variable["error_ignore_drop_view"] - if !(errIgnoreWord != "" && strings.Contains(cast.ToString(err), errIgnoreWord)) { + errMsg := strings.ToLower(err.Error()) + errIgnoreWord := strings.ToLower(conn.Template().Variable["error_ignore_drop_view"]) + if !(errIgnoreWord != "" && strings.Contains(errMsg, errIgnoreWord)) { return g.Error(err, "Error for "+sql) } g.Debug("view %s does not exist", viewName) @@ -2228,7 +2229,7 @@ func (conn *BaseConn) GetAnalysis(analysisName string, values map[string]interfa // CastColumnForSelect casts to the correct target column type func (conn *BaseConn) CastColumnForSelect(srcCol iop.Column, tgtCol iop.Column) string { - return conn.Self().Quote(srcCol.Name) + return conn.Template().Quote(srcCol.Name) } // CastColumnsForSelect cast the source columns into the target Column types @@ -2248,7 +2249,7 @@ func (conn *BaseConn) CastColumnsForSelect(srcColumns iop.Columns, tgtColumns io } // don't normalize name, leave as is - selectExpr := conn.Self().Quote(srcCol.Name) + selectExpr := conn.Template().Quote(srcCol.Name) if !strings.EqualFold(srcCol.DbType, tgtCol.DbType) { g.Debug( @@ -2272,7 +2273,7 @@ func (conn *BaseConn) CastColumnsForSelect(srcColumns iop.Columns, tgtColumns io } // add alias - qName := conn.Self().Quote(srcCol.Name) + qName := conn.Template().Quote(srcCol.Name) selectExprs = append(selectExprs, g.F("%s as %s", selectExpr, qName)) } @@ -2281,7 +2282,7 @@ func (conn *BaseConn) CastColumnsForSelect(srcColumns iop.Columns, tgtColumns io func (conn *BaseConn) castBoolForSelect(srcCol iop.Column, tgtCol iop.Column) (selectStr string) { - qName := conn.Self().Quote(srcCol.Name) + qName := conn.Template().Quote(srcCol.Name) castFunc := conn.GetType().GetTemplateValue("function.cast_as") @@ -2396,7 +2397,7 @@ func (conn *BaseConn) GenerateInsertStatement(tableName string, cols iop.Columns for i, field := range fields { c++ values[i] = conn.bindVar(i+1, field, n, c) - qFields[i] = conn.Self().Quote(field) + qFields[i] = conn.Template().Quote(field) } valuesStr += fmt.Sprintf("(%s),", strings.Join(values, ", ")) } @@ -2420,8 +2421,8 @@ func (conn *BaseConn) GetNativeType(col iop.Column) (nativeType string, err erro return col.GetNativeType(conn.Self().GetType(), ct) } -// GenerateDDL genrate a DDL based on a dataset -func (conn *BaseConn) GenerateDDL(table Table, data iop.Dataset, temporary bool) (string, error) { +// GenerateDDL generate a DDL based on a dataset +func (conn *BaseConn) GenerateDDL(table Table, data iop.Dataset, temporary bool) (ddl string, err error) { if 
!data.Inferred || data.SafeInference { if len(data.Columns) > 0 && data.Columns[0].Stats.TotalCnt == 0 && data.Columns[0].Type == "" { @@ -2491,7 +2492,7 @@ func (conn *BaseConn) GenerateDDL(table Table, data iop.Dataset, temporary bool) g.Trace("%s - %s %s", col.Name, col.Type, g.Marshal(col.Stats)) } - columnDDL := conn.Self().Quote(col.Name) + " " + nativeType + columnDDL := conn.Template().Quote(col.Name) + " " + nativeType columnsDDL = append(columnsDDL, columnDDL) } @@ -2504,12 +2505,50 @@ func (conn *BaseConn) GenerateDDL(table Table, data iop.Dataset, temporary bool) createTemplate = table.DDL } - ddl := g.R( + ddl = g.R( createTemplate, "table", table.FullName(), "col_types", strings.Join(columnsDDL, ",\n "), ) + partitionBy := "" + if keys, ok := table.Keys[iop.PartitionKey]; ok { + // allow custom SQL expression for partitioning + partitionBy = g.F("partition by %s", strings.Join(keys, ", ")) + } else if keyCols := data.Columns.GetKeys(iop.PartitionKey); len(keyCols) > 0 { + colNames := conn.Template().QuoteNames(keyCols.Names()...) + partitionBy = g.F("partition by %s", strings.Join(colNames, ", ")) + } + ddl = strings.ReplaceAll(ddl, "{partition_by}", partitionBy) + + clusterBy := "" + if keyCols := data.Columns.GetKeys(iop.ClusterKey); len(keyCols) > 0 { + colNames := conn.Template().QuoteNames(keyCols.Names()...) + clusterBy = g.F("cluster by %s", strings.Join(colNames, ", ")) + } + ddl = strings.ReplaceAll(ddl, "{cluster_by}", clusterBy) + + distKey := "" + if keyCols := data.Columns.GetKeys(iop.DistributionKey); len(keyCols) > 0 { + colNames := conn.Template().QuoteNames(keyCols.Names()...) + distKey = g.F("distkey(%s)", strings.Join(colNames, ", ")) + } + ddl = strings.ReplaceAll(ddl, "{dist_key}", distKey) + + sortKey := "" + if keyCols := data.Columns.GetKeys(iop.SortKey); len(keyCols) > 0 { + colNames := conn.Template().QuoteNames(keyCols.Names()...) + sortKey = g.F("compound sortkey(%s)", strings.Join(colNames, ", ")) + } + ddl = strings.ReplaceAll(ddl, "{sort_key}", sortKey) + + primaryKeyExpr := "" + if keyCols := columns.GetKeys(iop.PrimaryKey); len(keyCols) > 0 { + colNames := conn.Template().QuoteNames(keyCols.Names()...) 
+ primaryKeyExpr = g.F("%s", strings.Join(colNames, ", ")) + } + ddl = strings.ReplaceAll(ddl, "{primary_key}", primaryKeyExpr) + return ddl, nil } @@ -2779,7 +2818,10 @@ func (conn *BaseConn) GenerateMergeSQL(srcTable string, tgtTable string, pkField "pk_fields", mc.Map["pk_fields"], "set_fields", mc.Map["set_fields"], "insert_fields", mc.Map["insert_fields"], + "src_insert_fields", mc.Map["src_insert_fields"], "src_fields", mc.Map["src_fields"], + "tgt_fields", mc.Map["tgt_fields"], + "placeholder_fields", mc.Map["placeholder_fields"], ) return @@ -2791,6 +2833,10 @@ type MergeConfig struct { Map map[string]string } +func (mc MergeConfig) TemplatePath() string { + return g.F("core.merge_%s", mc.Strategy) +} + // GenerateMergeConfig returns the merge config func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFields []string) (mc MergeConfig, err error) { @@ -2840,6 +2886,7 @@ func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFi setFieldsAll := []string{} insertFields := []string{} placeholderFields := []string{} + srcInsertFields := []string{} for _, tgtColName := range tgtCols.Names() { srcCol := g.PtrVal(srcColumns.GetColumn(tgtColName)) // should be found tgtCol := tgtColumns.GetColumn(tgtColName) @@ -2858,7 +2905,16 @@ func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFi phExpr := strings.ReplaceAll(colExpr, srcColNameQ, g.F("ph.%s", srcColNameQ)) placeholderFields = append(placeholderFields, phExpr) + srcExpr := strings.ReplaceAll(colExpr, srcColNameQ, g.F("src.%s", srcColNameQ)) + srcInsertFields = append(srcInsertFields, srcExpr) + setSrcExpr := strings.ReplaceAll(colExpr, srcColNameQ, g.F("src.%s", srcColNameQ)) + + // set sync operation to `U` for updates + if strings.EqualFold(tgtCol.Name, env.ReservedFields.SyncedOp) { + setSrcExpr = "'U'" + } + setField := g.F("%s = %s", tgtColNameQ, setSrcExpr) setFieldsAll = append(setFieldsAll, setField) if _, ok := pkFieldMap[tgtCol.Name]; !ok { @@ -2884,6 +2940,7 @@ func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFi "src_fields": strings.Join(srcFields, ", "), "tgt_fields": strings.Join(tgtFields, ", "), "insert_fields": strings.Join(insertFields, ", "), + "src_insert_fields": strings.Join(srcInsertFields, ", "), "pk_fields": strings.Join(pkFields, ", "), "src_pk_fields": strings.Join(srcPkFields, ", "), "tgt_pk_fields": strings.Join(tgtPkFields, ", "), @@ -2900,11 +2957,10 @@ func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFi return mc, g.Error("invalid merge strategy %s", mc.Strategy) } - key := g.F("core.merge_%s", mc.Strategy) - mc.Template = conn.GetTemplateValue(key) + mc.Template = conn.GetTemplateValue(mc.TemplatePath()) if mc.Template == "" { - return mc, g.Error("merge strategy `%s` not supported for %s (did not find SQL template key `%s`)", mc.Strategy, conn.GetType(), key) + return mc, g.Error("merge strategy `%s` not supported for %s (did not find SQL template key `%s`)", mc.Strategy, conn.GetType(), mc.TemplatePath()) } return @@ -3083,23 +3139,23 @@ func GetOptimizeTableStatements(conn Connection, table *Table, newColumns iop.Co return } for i, key := range pKey { - pKey[i] = conn.Self().Quote(key) + pKey[i] = conn.Template().Quote(key) } // add new column with new type ddlParts = append(ddlParts, g.R( conn.GetTemplateValue("core.add_column"), "table", table.FullName(), - "column", conn.Self().Quote(colNameTemp), + "column", conn.Template().Quote(colNameTemp), "type",
col.DbType, )) // update set to cast old values oldColCasted := conn.Self().CastColumnForSelect(oldCols[index], col) - if oldColCasted == conn.Self().Quote(col.Name) { + if oldColCasted == conn.Template().Quote(col.Name) { oldColCasted = g.R( conn.GetTemplateValue("function.cast_as"), - "field", conn.Self().Quote(col.Name), + "field", conn.Template().Quote(col.Name), "type", col.DbType, ) } @@ -3116,7 +3172,7 @@ func GetOptimizeTableStatements(conn Connection, table *Table, newColumns iop.Co "table", table.FullName(), "set_fields", g.R( "{temp_column} = {old_column_casted}", - "temp_column", conn.Self().Quote(colNameTemp), + "temp_column", conn.Template().Quote(colNameTemp), "old_column_casted", oldColCasted, ), "fields", strings.Join(fields, ", "), @@ -3128,13 +3184,13 @@ func GetOptimizeTableStatements(conn Connection, table *Table, newColumns iop.Co ddlParts = append(ddlParts, g.R( conn.GetTemplateValue("core.drop_column"), "table", table.FullName(), - "column", conn.Self().Quote(col.Name), + "column", conn.Template().Quote(col.Name), )) // rename new column to old name tableName := table.FullName() - oldColName := conn.Self().Quote(colNameTemp) - newColName := conn.Self().Quote(col.Name) + oldColName := conn.Template().Quote(colNameTemp) + newColName := conn.Template().Quote(col.Name) if conn.Self().GetType().IsSQLServer() { tableName = conn.Unquote(table.FullName()) @@ -3259,8 +3315,8 @@ func (conn *BaseConn) CompareChecksums(tableName string, columns iop.Columns) (e expr = "0" } colName := fieldsMap[strings.ToLower(col.Name)] - expr = g.R(expr, "field", conn.Self().Quote(cast.ToString(colName))) - exprs = append(exprs, g.F("sum(%s) as %s", expr, conn.Self().Quote(cast.ToString(colName)))) + expr = g.R(expr, "field", conn.Template().Quote(cast.ToString(colName))) + exprs = append(exprs, g.F("sum(%s) as %s", expr, conn.Template().Quote(cast.ToString(colName)))) exprMap[strings.ToLower(col.Name)] = g.F("sum(%s)", expr) } @@ -3464,7 +3520,7 @@ func (conn *BaseConn) AddMissingColumns(table Table, newCols iop.Columns) (ok bo sql := g.R( conn.Template().Core["add_column"], "table", table.FullName(), - "column", conn.Self().Quote(col.Name), + "column", conn.Template().Quote(col.Name), "type", nativeType, ) diff --git a/core/dbio/database/database_fabric.go b/core/dbio/database/database_fabric.go index f46791c9e..cd75e0113 100644 --- a/core/dbio/database/database_fabric.go +++ b/core/dbio/database/database_fabric.go @@ -113,8 +113,8 @@ func (conn *MsFabricConn) makeABFSClient() (fs filesys.FileSysClient, err error) return fs, nil } -// getOneLakePath generates a OneLake path for temporary staging -func (conn *MsFabricConn) getOneLakePath(tableFName string) string { +// getStagingPath generates a staging path for file uploads (uses DFS endpoint) +func (conn *MsFabricConn) getStagingPath(tableFName string) string { endpoint := conn.GetProp("abfs_endpoint") filesystem := conn.GetProp("abfs_filesystem") parent := conn.GetProp("abfs_parent") @@ -131,6 +131,20 @@ func (conn *MsFabricConn) getOneLakePath(tableFName string) string { return fmt.Sprintf("%s/%s/%s/%s", basePath, tempCloudStorageFolder, cleanTableName, cast.ToString(g.Now())) } +// getCopyIntoPath converts a staging path to the appropriate endpoint for COPY INTO +// Per Microsoft docs, the .blob endpoint yields best performance for COPY INTO +// See: https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql +func (conn *MsFabricConn) getCopyIntoPath(stagingPath string) string { + // Check if user explicitly set a 
copy_into_endpoint override + if copyIntoEndpoint := conn.GetProp("copy_into_endpoint"); copyIntoEndpoint != "" { + // Replace the endpoint in the path with the user-specified one + endpoint := conn.GetProp("abfs_endpoint") + return strings.Replace(stagingPath, endpoint, copyIntoEndpoint, 1) + } + + return stagingPath +} + // CopyFromOneLake uses the COPY INTO command to load data from OneLake func (conn *MsFabricConn) CopyFromOneLake(tableFName, oneLakePath string, columns iop.Columns, fileFormat dbio.FileType) (err error) { // Prepare target columns @@ -202,8 +216,8 @@ func (conn *MsFabricConn) BulkImportFlow(tableFName string, df *iop.Dataflow) (c settingMppBulkImportFlow(conn, iop.GzipCompressorType) - // Get OneLake path - oneLakePath := conn.getOneLakePath(tableFName) + // Get staging path (for ABFS uploads - uses DFS endpoint) + stagingPath := conn.getStagingPath(tableFName) // Create ABFS client abfsFs, err := conn.makeABFSClient() @@ -212,15 +226,15 @@ func (conn *MsFabricConn) BulkImportFlow(tableFName string, df *iop.Dataflow) (c } // Delete any existing files at path - err = filesys.Delete(abfsFs, oneLakePath) + err = filesys.Delete(abfsFs, stagingPath) if err != nil { - return df.Count(), g.Error(err, "Could not delete existing files: "+oneLakePath) + return df.Count(), g.Error(err, "Could not delete existing files: "+stagingPath) } // Set up cleanup df.Defer(func() { if !cast.ToBool(os.Getenv("SLING_KEEP_TEMP")) { - filesys.Delete(abfsFs, oneLakePath) + filesys.Delete(abfsFs, stagingPath) } }) @@ -241,21 +255,24 @@ func (conn *MsFabricConn) BulkImportFlow(tableFName string, df *iop.Dataflow) (c // Fabric COPY INTO treats empty fields as NULL (no NULL_IF parameter available) abfsFs.SetProp("null_as", ``) abfsFs.SetProp("compression", `gzip`) - bw, err = filesys.WriteDataflow(abfsFs, df, oneLakePath) + bw, err = filesys.WriteDataflow(abfsFs, df, stagingPath) case dbio.FileTypeParquet: if env.UseDuckDbCompute() { - bw, err = filesys.WriteDataflowViaDuckDB(abfsFs, df, oneLakePath) + bw, err = filesys.WriteDataflowViaDuckDB(abfsFs, df, stagingPath) } else { - bw, err = filesys.WriteDataflow(abfsFs, df, oneLakePath) + bw, err = filesys.WriteDataflow(abfsFs, df, stagingPath) } } if err != nil { return df.Count(), g.Error(err, "Error writing to OneLake") } - g.Debug("total written: %s to %s", humanize.Bytes(cast.ToUint64(bw)), oneLakePath) + g.Debug("total written: %s to %s", humanize.Bytes(cast.ToUint64(bw)), stagingPath) + + // Get COPY INTO path (may convert DFS to Blob endpoint for better compatibility) + copyIntoPath := conn.getCopyIntoPath(stagingPath) // Execute COPY INTO - err = conn.CopyFromOneLake(tableFName, oneLakePath, df.Columns, fileFormat) + err = conn.CopyFromOneLake(tableFName, copyIntoPath, df.Columns, fileFormat) if err != nil { return df.Count(), g.Error(err, "Error copying into Fabric from OneLake") } diff --git a/core/dbio/database/database_mysql.go b/core/dbio/database/database_mysql.go index 55327e24a..b385747ac 100755 --- a/core/dbio/database/database_mysql.go +++ b/core/dbio/database/database_mysql.go @@ -13,6 +13,7 @@ import ( "cloud.google.com/go/cloudsqlconn" cloudsqlmysql "cloud.google.com/go/cloudsqlconn/mysql/mysql" "github.com/go-sql-driver/mysql" + "github.com/google/uuid" "github.com/jmoiron/sqlx" "github.com/slingdata-io/sling-cli/core/dbio" "github.com/spf13/cast" @@ -25,9 +26,10 @@ import ( // MySQLConn is a MySQL or MariaDB connection type MySQLConn struct { BaseConn - URL string - isCloudSQL bool - cloudSQLCleanup func() + URL string + 
isCloudSQL bool + localInfileEnabled bool + cloudSQLCleanup func() } // Init initiates the object @@ -45,9 +47,11 @@ func (conn *MySQLConn) Init() error { // the LoadDataOutFile needs special circumstances conn.BaseConn.SetProp("allow_bulk_export", "false") - // InsertBatchStream is faster than LoadDataInFile - if conn.BaseConn.GetProp("allow_bulk_import") == "" { - conn.BaseConn.SetProp("allow_bulk_import", "false") + // Enable allowAllFiles for LOAD DATA LOCAL INFILE via RegisterReaderHandler + // This is required for the go-sql-driver to use Reader:: syntax + if conn.BaseConn.GetProp("allow_all_files") == "" && + conn.BaseConn.GetProp("allowAllFiles") == "" { + conn.BaseConn.SetProp("allow_all_files", "true") } instance := Connection(conn) @@ -56,6 +60,24 @@ func (conn *MySQLConn) Init() error { return conn.BaseConn.Init() } +// checkLocalInfileEnabled checks if the MySQL server allows LOCAL INFILE +// and caches the result in the struct field for subsequent calls +func (conn *MySQLConn) checkLocalInfileEnabled() { + // Query the server using raw DB to avoid datastream context issues + var varName, varValue string + err := conn.db.QueryRow("SHOW GLOBAL VARIABLES LIKE 'local_infile'").Scan(&varName, &varValue) + if err != nil { + g.Debug("could not check local_infile variable: %v", err) + return + } + + conn.localInfileEnabled = strings.ToLower(varValue) == "on" || varValue == "1" + + if conn.localInfileEnabled { + g.Debug("local_infile is enabled on MySQL server") + } +} + // GetURL returns the processed URL func (conn *MySQLConn) GetURL(newURL ...string) string { connURL := conn.BaseConn.URL @@ -142,7 +164,15 @@ func (conn *MySQLConn) Connect(timeOut ...int) (err error) { mysql.RegisterTLSConfig(conn.GetProp("sling_conn_id"), tlsConfig) } - return conn.BaseConn.Connect(timeOut...) + err = conn.BaseConn.Connect(timeOut...) + if err != nil { + return err + } + + // Check and cache local_infile setting after connect + conn.checkLocalInfileEnabled() + + return nil } // connectCloudSQL establishes a connection to Google Cloud SQL MySQL using IAM authentication @@ -285,6 +315,9 @@ func (conn *MySQLConn) connectCloudSQL(timeOut ...int) error { conn.postConnect() + // Check and cache local_infile setting after connect + conn.checkLocalInfileEnabled() + return nil } @@ -396,21 +429,13 @@ func (conn *MySQLConn) BulkExportStream(table Table) (ds *iop.Datastream, err er // BulkImportStream bulk import stream func (conn *MySQLConn) BulkImportStream(tableFName string, ds *iop.Datastream) (count uint64, err error) { - + // Check ADBC first if conn.UseADBC() { conn.Commit() return conn.adbc.BulkImportStream(tableFName, ds) } - _, err = exec.LookPath("mysql") - if err != nil { - g.Trace("mysql not found in path. 
Using cursor...") - return conn.BaseConn.InsertBatchStream(tableFName, ds) - } else if conn.GetProp("allow_bulk_import") != "true" { - return conn.BaseConn.InsertBatchStream(tableFName, ds) - } - - // needs to get columns to shape stream + // Get columns to shape stream columns, err := conn.GetColumns(tableFName) if err != nil { err = g.Error(err, "could not get column list") @@ -423,7 +448,29 @@ func (conn *MySQLConn) BulkImportStream(tableFName string, ds *iop.Datastream) ( return } - return conn.LoadDataInFile(tableFName, ds) + // Check server capability (cached from connect) + // Note: LoadDataLocal keeps the connection busy, so we can't use it when + // adjust_column_type is enabled (it requires queries during load) + useBulk := conn.GetProp("use_bulk") != "false" + adjustColumnType := cast.ToBool(conn.GetProp("adjust_column_type")) + + if conn.localInfileEnabled && useBulk && !adjustColumnType { + // Use native Go driver - no external binary needed + return conn.LoadDataLocal(tableFName, ds) + } + + // Log why we're not using LoadDataLocal + if !conn.localInfileEnabled { + g.Debug("local_infile is disabled on server, using fallback") + } else if !useBulk { + g.Debug("use_bulk is false, using InsertBatchStream") + } else if adjustColumnType { + g.Debug("adjust_column_type enabled, using InsertBatchStream to allow concurrent queries") + } + + // Final fallback: InsertBatchStream + g.Trace("using InsertBatchStream as fallback") + return conn.BaseConn.InsertBatchStream(tableFName, ds) } // LoadDataOutFile Bulk Export @@ -481,51 +528,35 @@ func (conn *MySQLConn) LoadDataOutFile(ctx *g.Context, sql string) (stdOutReader return stdOutReader, err } -// LoadDataInFile Bulk Import -func (conn *MySQLConn) LoadDataInFile(tableFName string, ds *iop.Datastream) (count uint64, err error) { - var stderr bytes.Buffer - - connURL := conn.URL - if su := conn.GetProp("ssh_url"); su != "" { - connURL = su // use ssh url if specified - } - - url, err := dburl.Parse(connURL) - if err != nil { - err = g.Error(err, "Error dburl.Parse(conn.URL)") - return - } - - password, _ := url.User.Password() - host := strings.ReplaceAll(url.Host, ":"+url.Port(), "") - database := strings.ReplaceAll(url.Path, "/", "") - - loadQuery := g.R(`LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE {table} FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '"' IGNORE 1 LINES;`, "table", tableFName) - proc := exec.Command( - "mysql", - "--local-infile=1", - "-h", host, - "-P", url.Port(), - "-u", url.User.Username(), - "-p"+password, - database, - "-e", loadQuery, +// LoadDataLocal uses go-sql-driver/mysql's RegisterReaderHandler for LOAD DATA LOCAL INFILE +// This allows streaming data directly to MySQL without requiring the external mysql binary +func (conn *MySQLConn) LoadDataLocal(tableFName string, ds *iop.Datastream) (count uint64, err error) { + // Generate unique handler name to avoid conflicts in concurrent operations + handlerName := "sling_" + uuid.New().String() + + // Register the reader handler for streaming CSV with header + // The MySQL LOAD DATA template uses IGNORE 1 LINES, so we need the header + // BoolAsInt is required because MySQL's LOAD DATA doesn't convert true/false to 1/0 + cfg := iop.LoaderStreamConfig(true) + cfg.BoolAsInt = true + mysql.RegisterReaderHandler(handlerName, func() io.Reader { + return ds.NewCsvReader(cfg) + }) + defer mysql.DeregisterReaderHandler(handlerName) + + // Get the template and build the query + tmpl := conn.GetTemplateValue("core.load_data_local_reader") + loadQuery := 
g.R(tmpl, + "handler_name", handlerName, + "table", tableFName, ) - proc.Stderr = &stderr - proc.Stdin = ds.NewCsvReader(iop.DefaultStreamConfig()) + g.Trace("LoadDataLocal query: %s", loadQuery) - err = proc.Run() + // Execute the LOAD DATA statement + _, err = conn.Exec(loadQuery) if err != nil { - cmdStr := strings.ReplaceAll(strings.Join(proc.Args, " "), password, "****") - err = g.Error( - err, - fmt.Sprintf( - "MySQL Import Command -> %s\nMySQL Import Error -> %s", - cmdStr, stderr.String(), - ), - ) - return ds.Count, err + return 0, g.Error(err, "LoadDataLocal failed for table %s", tableFName) } return ds.Count, nil diff --git a/core/dbio/database/database_oracle.go b/core/dbio/database/database_oracle.go index a5085ad16..8c383b0bd 100755 --- a/core/dbio/database/database_oracle.go +++ b/core/dbio/database/database_oracle.go @@ -599,6 +599,11 @@ func (conn *OracleConn) CastColumnForSelect(srcCol iop.Column, tgtCol iop.Column tgtCol.DbPrecision = lo.Ternary(tgtCol.DbPrecision == 0, 4000, tgtCol.DbPrecision) switch { + case srcDbType == "xmltype": + // XMLTYPE columns cause the go-ora driver to hang when reading directly. + // Cast to CLOB to extract XML content as text. + // See: https://github.com/sijms/go-ora/issues/562 + selectStr = g.F("(%s).getclobval()", qName) case srcDbType != "clob" && tgtDbType == "clob": selectStr = g.F("to_clob(%s)", qName) case srcDbType == "clob" && tgtCol.IsString() && tgtDbType != "clob": diff --git a/core/dbio/database/database_prometheus.go b/core/dbio/database/database_prometheus.go index 08a08d460..61f4f8c88 100644 --- a/core/dbio/database/database_prometheus.go +++ b/core/dbio/database/database_prometheus.go @@ -767,6 +767,9 @@ func (conn *PrometheusConn) StreamRowsChunked(queryContext *g.Context, query str } ds.SetConfig(props) + // Start the bytes-written processor to prevent blocking on bwRows channel + ds.StartBwProcessor() + // Process in chunks go func() { defer ds.Close() diff --git a/core/dbio/database/schemata.go b/core/dbio/database/schemata.go index 538293c9f..4a5670fa7 100644 --- a/core/dbio/database/schemata.go +++ b/core/dbio/database/schemata.go @@ -5,6 +5,7 @@ import ( "encoding/json" "runtime/debug" "strings" + "sync" "unicode" "github.com/flarco/g" @@ -274,6 +275,15 @@ func (t *Table) Select(Opts ...SelectOptions) (sql string) { if f == "*" || strings.Contains(f, "(") { return f } + + // Parse for "field as alias" syntax + original, alias, _, _ := iop.ParseSelectExpr(f) + if alias != "" { + // Generate: "original_col" AS "alias_name" + origQuoted := q + strings.ReplaceAll(original, q, "") + q + aliasQuoted := q + strings.ReplaceAll(alias, q, "") + q + return origQuoted + " as " + aliasQuoted + } return q + strings.ReplaceAll(f, q, "") + q }) @@ -284,9 +294,51 @@ func (t *Table) Select(Opts ...SelectOptions) (sql string) { fieldsStr := lo.Ternary(len(fields) > 0, strings.Join(fields, ", "), "*") - // auto convert to json if needed + // auto convert complex types as needed { switch t.Dialect { + case dbio.TypeDbOracle: + // XMLTYPE columns cause the go-ora driver to hang when reading directly. + // Cast to CLOB to extract XML content as text. 
+ // See: https://github.com/sijms/go-ora/issues/562 + var xmlTypeCols iop.Columns + for _, col := range t.Columns { + if strings.EqualFold(col.DbType, "xmltype") { + xmlTypeCols = append(xmlTypeCols, col) + } + } + + if len(xmlTypeCols) > 0 { + if len(fields) == 0 || (len(fields) == 1 && fields[0] == "*") { + // Need to explicitly list all columns with XMLTYPE casted + fieldExprs := []string{} + for _, col := range t.Columns { + colQ := t.Dialect.Quote(col.Name) + if xmlTypeCols.GetColumn(col.Name) != nil { + // Cast XMLTYPE to CLOB using getclobval() + expr := g.F("(%s).getclobval() as %s", colQ, colQ) + fieldExprs = append(fieldExprs, expr) + } else { + fieldExprs = append(fieldExprs, colQ) + } + } + fieldsStr = strings.Join(fieldExprs, ", ") + } else { + fieldExprs := []string{} + for _, field := range opts.Fields { + field = strings.TrimSpace(field) + colQ := t.Dialect.Quote(field) + if xmlTypeCols.GetColumn(field) != nil { + expr := g.F("(%s).getclobval() as %s", colQ, colQ) + fieldExprs = append(fieldExprs, expr) + } else { + fieldExprs = append(fieldExprs, colQ) + } + } + fieldsStr = strings.Join(fieldExprs, ", ") + } + } + case dbio.TypeDbBigQuery: var toJsonCols iop.Columns @@ -963,6 +1015,7 @@ func GetTablesSchemata(conn Connection, tableNames ...string) (schemata Schemata // GetSchemataAll obtains the schemata for all databases detected func GetSchemataAll(conn Connection) (schemata Schemata, err error) { schemata = Schemata{Databases: map[string]Database{}} + var mu sync.Mutex connInfo := conn.Info() @@ -997,13 +1050,14 @@ func GetSchemataAll(conn Connection) (schemata Schemata, err error) { } // pull down schemata - newSchemata, err := newConn.GetSchemata("", "") + newSchemata, err := newConn.GetSchemata(SchemataLevelColumn, "") if err != nil { g.Warn("could not obtain schemata for database: %s. 
%s", dbName, err) return } - // merge all schematas + // merge all schematas with mutex protection + mu.Lock() for name, database := range newSchemata.Databases { g.Debug( " collected %d columns, in %d tables/views from database %s", @@ -1013,6 +1067,7 @@ func GetSchemataAll(conn Connection) (schemata Schemata, err error) { ) schemata.Databases[name] = database } + mu.Unlock() } // loop an connect to each @@ -1032,10 +1087,44 @@ func (t *Table) AddPrimaryKeyToDDL(ddl string, columns iop.Columns) (string, err if pkCols := columns.GetKeys(iop.PrimaryKey); len(pkCols) > 0 { ddl = strings.TrimSpace(ddl) - // add pk right before the last parenthesis - lastParen := strings.LastIndex(ddl, ")") - if lastParen == -1 { - return ddl, g.Error("could not find last parenthesis") + // Find the closing parenthesis of the column definitions + // We need to find the first balanced closing paren that matches the opening + // paren of the CREATE TABLE column list, not just the last paren in the DDL + // This handles cases like: CREATE TABLE t (col1 int) WITH (data_compression=page) + + // Find "CREATE TABLE" pattern to locate start of statement + createTableIdx := strings.Index(strings.ToUpper(ddl), "CREATE TABLE") + if createTableIdx == -1 { + return ddl, g.Error("could not find CREATE TABLE in DDL") + } + + // Find the opening paren after CREATE TABLE (this is the column list) + openParen := strings.Index(ddl[createTableIdx:], "(") + if openParen == -1 { + return ddl, g.Error("could not find opening parenthesis for column list") + } + openParen += createTableIdx + + // Find the matching closing paren by counting balanced parens + depth := 1 + closeParen := -1 + for i := openParen + 1; i < len(ddl); i++ { + switch ddl[i] { + case '(': + depth++ + case ')': + depth-- + if depth == 0 { + closeParen = i + } + } + if closeParen != -1 { + break + } + } + + if closeParen == -1 { + return ddl, g.Error("could not find closing parenthesis for column list") } prefix := "primary key" @@ -1045,7 +1134,7 @@ func (t *Table) AddPrimaryKeyToDDL(ddl string, columns iop.Columns) (string, err } quotedNames := t.Dialect.QuoteNames(pkCols.Names()...) 
- ddl = ddl[:lastParen] + g.F(", %s (%s)", prefix, strings.Join(quotedNames, ", ")) + ddl[lastParen:] + ddl = ddl[:closeParen] + g.F(", %s (%s)", prefix, strings.Join(quotedNames, ", ")) + ddl[closeParen:] } return ddl, nil diff --git a/core/dbio/database/schemata_test.go b/core/dbio/database/schemata_test.go index fef055aaf..b4a24fca5 100644 --- a/core/dbio/database/schemata_test.go +++ b/core/dbio/database/schemata_test.go @@ -6,6 +6,9 @@ import ( "github.com/flarco/g" "github.com/slingdata-io/sling-cli/core/dbio" + "github.com/slingdata-io/sling-cli/core/dbio/iop" + "github.com/slingdata-io/sling-cli/core/env" + "github.com/spf13/cast" "github.com/stretchr/testify/assert" ) @@ -297,6 +300,127 @@ func TestParseSQLMultiStatements(t *testing.T) { } } +func TestGetSchemataAll(t *testing.T) { + ef := env.LoadSlingEnvFile() + + url := cast.ToStringMap(ef.Connections["POSTGRES"])["url"] + if url == nil { + t.Skip("POSTGRES env var not set") + } + + conn, err := NewConn(cast.ToString(url)) + if !assert.NoError(t, err) { + return + } + defer conn.Close() + + schemata, err := GetSchemataAll(conn) + if !assert.NoError(t, err) { + return + } + + // Count all tables from all databases + tableCount := 0 + for _, db := range schemata.Databases { + tableCount += len(db.Tables()) + } + + assert.Greater(t, tableCount, 0, "expected at least one table across all databases") +} + +func TestAddPrimaryKeyToDDL(t *testing.T) { + // Verifies that primary key is placed correctly in column definitions + // when table_ddl contains WITH clause or other suffixes after ({col_types}) + + type testCase struct { + name string + dialect dbio.Type + ddl string + pkCols []string + expected string + } + + cases := []testCase{ + { + name: "simple DDL without WITH clause", + dialect: dbio.TypeDbSQLServer, + ddl: `create table "dbo"."test" ("col1" nvarchar(10), "col2" nvarchar(6))`, + pkCols: []string{"col1", "col2"}, + expected: `create table "dbo"."test" ("col1" nvarchar(10), "col2" nvarchar(6), primary key ("col1", "col2"))`, + }, + { + name: "DDL with WITH clause (GitHub issue #694)", + dialect: dbio.TypeDbSQLServer, + ddl: `create table "dbo"."test" ("col1" nvarchar(10), "col2" nvarchar(6)) WITH (data_compression=page)`, + pkCols: []string{"col1", "col2"}, + expected: `create table "dbo"."test" ("col1" nvarchar(10), "col2" nvarchar(6), primary key ("col1", "col2")) WITH (data_compression=page)`, + }, + { + name: "DDL with multiple WITH options", + dialect: dbio.TypeDbSQLServer, + ddl: `create table "dbo"."test" ("col1" int, "col2" int) WITH (PAD_INDEX = ON, FILLFACTOR = 90)`, + pkCols: []string{"col1"}, + expected: `create table "dbo"."test" ("col1" int, "col2" int, primary key ("col1")) WITH (PAD_INDEX = ON, FILLFACTOR = 90)`, + }, + { + name: "DDL with nested parentheses in column type", + dialect: dbio.TypeDbSQLServer, + ddl: `create table "dbo"."test" ("col1" decimal(10,2), "col2" varchar(100)) WITH (LOCK_ESCALATION = TABLE)`, + pkCols: []string{"col1"}, + expected: `create table "dbo"."test" ("col1" decimal(10,2), "col2" varchar(100), primary key ("col1")) WITH (LOCK_ESCALATION = TABLE)`, + }, + { + name: "Postgres DDL without suffix", + dialect: dbio.TypeDbPostgres, + ddl: `create table if not exists "public"."test" ("col1" integer, "col2" text)`, + pkCols: []string{"col1"}, + expected: `create table if not exists "public"."test" ("col1" integer, "col2" text, primary key ("col1"))`, + }, + { + name: "Postgres DDL with PARTITION BY clause", + dialect: dbio.TypeDbPostgres, + ddl: `create table if not exists 
"public"."test" ("col1" integer, "col2" date) PARTITION BY RANGE (col2)`, + pkCols: []string{"col1"}, + expected: `create table if not exists "public"."test" ("col1" integer, "col2" date, primary key ("col1")) PARTITION BY RANGE (col2)`, + }, + { + name: "Oracle DDL with named constraint", + dialect: dbio.TypeDbOracle, + ddl: `create table "SCHEMA"."TEST" ("COL1" NUMBER, "COL2" VARCHAR2(100))`, + pkCols: []string{"COL1"}, + expected: `create table "SCHEMA"."TEST" ("COL1" NUMBER, "COL2" VARCHAR2(100), constraint test_pkey primary key ("COL1"))`, + }, + { + name: "no primary key columns", + dialect: dbio.TypeDbSQLServer, + ddl: `create table "dbo"."test" ("col1" int, "col2" int) WITH (FILLFACTOR = 90)`, + pkCols: []string{}, + expected: `create table "dbo"."test" ("col1" int, "col2" int) WITH (FILLFACTOR = 90)`, + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + table := &Table{ + Name: "test", + Dialect: c.dialect, + } + + // Create columns with primary key flag + var cols iop.Columns + for _, name := range c.pkCols { + col := iop.Column{Name: name} + col.SetMetadata(iop.PrimaryKey.MetadataKey(), "true") + cols = append(cols, col) + } + + result, err := table.AddPrimaryKeyToDDL(c.ddl, cols) + assert.NoError(t, err) + assert.Equal(t, c.expected, result) + }) + } +} + func TestTrimSQLComments(t *testing.T) { type testCase struct { name string diff --git a/core/dbio/dbio_types.go b/core/dbio/dbio_types.go index 95d47477c..38d793f97 100644 --- a/core/dbio/dbio_types.go +++ b/core/dbio/dbio_types.go @@ -10,6 +10,7 @@ import ( "github.com/flarco/g" "github.com/slingdata-io/sling-cli/core/env" + "github.com/spf13/cast" "gopkg.in/yaml.v2" ) @@ -200,6 +201,10 @@ func (t Type) DefPort() int { // DBNameUpperCase returns true is upper case is default func (t Type) DBNameUpperCase() bool { + tp, _ := t.Template() + if val := tp.Value("variable.column_upper"); val != "" { + return cast.ToBool(val) + } return g.In(t, TypeDbOracle, TypeDbSnowflake, TypeDbExasol) } @@ -283,6 +288,7 @@ func (t Type) NameLong() string { TypeDbDatabricks: "DB - Databricks", TypeDbExasol: "DB - Exasol", TypeDbD1: "DB - D1", + Type("db2"): "DB - DB2", TypeDbSQLite: "DB - SQLite", TypeDbDuckDb: "DB - DuckDB", TypeDbDuckLake: "DB - DuckLake", @@ -333,6 +339,7 @@ func (t Type) Name() string { TypeDbDatabricks: "Databricks", TypeDbExasol: "Exasol", TypeDbD1: "D1", + Type("db2"): "DB2", TypeDbSQLite: "SQLite", TypeDbDuckDb: "DuckDB", TypeDbDuckLake: "DuckLake", @@ -539,7 +546,9 @@ func (tp Template) Quote(field string) string { // always normalize if case is uniform. Why would you quote and not normalize? 
if !HasVariedCase(field) && Normalize { - if tp.Type.DBNameUpperCase() { + if val := tp.Value("variable.column_upper"); val != "" && cast.ToBool(val) { + field = strings.ToUpper(field) + } else if tp.Type.DBNameUpperCase() { field = strings.ToUpper(field) } else { field = strings.ToLower(field) @@ -549,6 +558,7 @@ func (tp Template) Quote(field string) string { field = tp.Type.Unquote(field) return q + field + q } + func (tp Template) QuoteNames(names ...string) (newNames []string) { newNames = make([]string, len(names)) for i := range names { diff --git a/core/dbio/filesys/fs.go b/core/dbio/filesys/fs.go index d9f33807b..4dc649224 100755 --- a/core/dbio/filesys/fs.go +++ b/core/dbio/filesys/fs.go @@ -275,6 +275,21 @@ func NormalizeURI(fs FileSysClient, uri string) string { return fs.Prefix("/") + path } return fs.Prefix("/") + strings.TrimLeft(strings.TrimPrefix(uri, fs.Prefix()), "/") + case dbio.TypeFileS3, dbio.TypeFileGoogle: + // For S3/GCS, if URI already has the scheme prefix (e.g., s3://bucket/path), + // return it as-is to allow accessing different buckets with the same credentials. + // This enables multi-bucket access with a single connection. + scheme := fs.FsType().String() + "://" + if strings.HasPrefix(uri, scheme) { + // Ensure there's a trailing slash after the bucket name if no path is specified. + // This is required for ParseURL to correctly extract an empty path. + // e.g., "s3://bucket" -> "s3://bucket/" + if !strings.Contains(strings.TrimPrefix(uri, scheme), "/") { + return uri + "/" + } + return uri + } + fallthrough default: return fs.Prefix("/") + strings.TrimLeft(strings.TrimPrefix(uri, fs.Prefix()), "/") } @@ -485,6 +500,7 @@ func (fs *BaseFileSysClient) GetDatastream(uri string, cfg ...iop.FileStreamConf ds = iop.NewDatastreamContext(fs.Context().Ctx, nil) ds.SafeInference = true + ds.SchemaOnly = Cfg.SchemaOnly ds.SetMetadata(fs.GetProp("METADATA")) ds.Metadata.StreamURL.Value = uri ds.SetConfig(fs.Props()) @@ -635,16 +651,45 @@ func (fs *BaseFileSysClient) ReadDataflow(url string, cfg ...iop.FileStreamConfi if g.In(Cfg.Format, dbio.FileTypeIceberg, dbio.FileTypeDelta) || Cfg.SQL != "" { nodes = FileNodes{FileNode{URI: url}} } else if prefixes := Cfg.FileSelect; len(prefixes) > 0 { - rootPath := GetDeepestPartitionParent(url) - g.Trace("listing path: %s", rootPath) - nodes, err = fs.Self().ListRecursive(rootPath) - if err != nil { - err = g.Error(err, "Error getting paths") - return + // Check if any FileSelect entries are full URIs with scheme prefix. + // If so, they may reference different buckets (multi-bucket access). + fullURIPrefixes := []string{} + relativePrefixes := []string{} + + for _, prefix := range prefixes { + if strings.Contains(prefix, "://") { + fullURIPrefixes = append(fullURIPrefixes, prefix) + } else { + relativePrefixes = append(relativePrefixes, prefix) + } } - // select only prefixes - nodes = nodes.SelectWithPrefix(prefixes...) + // Handle full URI prefixes (may be from different buckets) + if len(fullURIPrefixes) > 0 { + for _, uri := range fullURIPrefixes { + g.Trace("listing path (full URI): %s", uri) + uriNodes, err := fs.Self().ListRecursive(uri) + if err != nil { + err = g.Error(err, "Error getting paths for %s", uri) + return df, err + } + nodes = append(nodes, uriNodes...) 
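// Note: a file_select entry given as a full URI (for example s3://another-bucket/prefix/,
// a hypothetical bucket) is listed directly here, which is what lets a single S3/GCS
// connection read across buckets; NormalizeURI above returns such URIs unchanged instead
// of re-prefixing them with the connection's default bucket.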
+ } + } + + // Handle relative prefixes (original behavior) + if len(relativePrefixes) > 0 { + rootPath := GetDeepestPartitionParent(url) + g.Trace("listing path: %s", rootPath) + pathNodes, err := fs.Self().ListRecursive(rootPath) + if err != nil { + err = g.Error(err, "Error getting paths") + return df, err + } + // select only prefixes + pathNodes = pathNodes.SelectWithPrefix(relativePrefixes...) + nodes = append(nodes, pathNodes...) + } } else { g.Trace("listing path: %s", url) nodes, err = fs.Self().ListRecursive(url) @@ -1246,6 +1291,7 @@ func GetDataflowViaDuckDB(fs FileSysClient, uri string, nodes FileNodes, cfg iop ds := iop.NewDatastreamContext(fs.Context().Ctx, nil) ds.SafeInference = true + ds.SchemaOnly = cfg.SchemaOnly ds.SetMetadata(fs.GetProp("METADATA")) ds.Metadata.StreamURL.Value = uri ds.SetConfig(fs.Props()) @@ -1611,6 +1657,7 @@ func MergeReaders(fs FileSysClient, fileType dbio.FileType, nodes FileNodes, cfg url := fs.GetProp("url") ds = iop.NewDatastreamContext(fs.Context().Ctx, nil) ds.SafeInference = true + ds.SchemaOnly = cfg.SchemaOnly ds.SetMetadata(fs.GetProp("METADATA")) ds.Metadata.StreamURL.Value = url ds.SetConfig(fs.Client().Props()) diff --git a/core/dbio/filesys/fs_google.go b/core/dbio/filesys/fs_google.go index 1cbcb43e8..1d6c7dffc 100644 --- a/core/dbio/filesys/fs_google.go +++ b/core/dbio/filesys/fs_google.go @@ -59,8 +59,10 @@ func (fs *GoogleFileSysClient) GetPath(uri string) (path string, err error) { return } - if fs.bucket != host { - err = g.Error("URL bucket differs from connection bucket. %s != %s", host, fs.bucket) + // If URI specifies a different bucket, update fs.bucket to use it. + // This allows multi-bucket access with a single connection. + if fs.bucket != host && host != "" { + fs.bucket = host } return path, err diff --git a/core/dbio/filesys/fs_local.go b/core/dbio/filesys/fs_local.go index 299ae9b09..c8b62e8ec 100644 --- a/core/dbio/filesys/fs_local.go +++ b/core/dbio/filesys/fs_local.go @@ -120,6 +120,7 @@ func (fs *LocalFileSysClient) GetDatastream(uri string, cfg ...iop.FileStreamCon ds = iop.NewDatastreamContext(fs.Context().Ctx, nil) ds.SafeInference = true + ds.SchemaOnly = Cfg.SchemaOnly ds.SetMetadata(fs.GetProp("METADATA")) ds.Metadata.StreamURL.Value = path ds.SetConfig(fs.Props()) diff --git a/core/dbio/filesys/fs_s3.go b/core/dbio/filesys/fs_s3.go index 4690c7da1..580ae711c 100644 --- a/core/dbio/filesys/fs_s3.go +++ b/core/dbio/filesys/fs_s3.go @@ -98,13 +98,35 @@ func (fs *S3FileSysClient) GetPath(uri string) (path string, err error) { return } - if fs.bucket != host { - err = g.Error("URL bucket differs from connection bucket. %s != %s", host, fs.bucket) + // If URI specifies a different bucket, update fs.bucket to use it. + // This allows multi-bucket access with a single connection. + if fs.bucket != host && host != "" { + fs.bucket = host } return path, err } +// getBucketAndPath extracts the bucket and key from a URI without mutating fs.bucket. +// This is safe for concurrent use when reading from multiple buckets. 
+func (fs *S3FileSysClient) getBucketAndPath(uri string) (bucket, path string, err error) {
+	// normalize, in case url is provided without prefix
+	uri = NormalizeURI(fs, uri)
+
+	host, path, err := ParseURL(uri)
+	if err != nil {
+		return
+	}
+
+	// Use the bucket from the URI if specified, otherwise fall back to connection's bucket
+	bucket = host
+	if bucket == "" {
+		bucket = fs.bucket
+	}
+
+	return bucket, path, err
+}
+
 const defaultRegion = "us-east-1"
 
 type fakeWriterAt struct {
@@ -255,43 +277,49 @@ func (fs *S3FileSysClient) Connect() (err error) {
 // getSession returns the aws config and sets the region based on the bucket
 func (fs *S3FileSysClient) getConfig() aws.Config {
+	return fs.getConfigForBucket(fs.bucket)
+}
+
+// getConfigForBucket returns the aws config with the region set for the specified bucket.
+// This is safe for concurrent use when reading from multiple buckets.
+func (fs *S3FileSysClient) getConfigForBucket(bucket string) aws.Config {
 	fs.mux.Lock()
 	defer fs.mux.Unlock()
 	endpoint := fs.GetProp("ENDPOINT")
 	region := fs.GetProp("REGION")
-	if fs.bucket == "" {
+	if bucket == "" {
 		return fs.awsConfig
 	} else if region != "" {
-		fs.RegionMap[fs.bucket] = region
+		fs.RegionMap[bucket] = region
 	} else if strings.HasSuffix(endpoint, ".digitaloceanspaces.com") {
 		region := strings.TrimSuffix(endpoint, ".digitaloceanspaces.com")
 		region = strings.TrimPrefix(region, "https://")
-		fs.RegionMap[fs.bucket] = region
+		fs.RegionMap[bucket] = region
 	} else if strings.HasSuffix(endpoint, ".cloudflarestorage.com") {
-		fs.RegionMap[fs.bucket] = "auto"
-	} else if endpoint == "" && fs.RegionMap[fs.bucket] == "" {
+		fs.RegionMap[bucket] = "auto"
+	} else if endpoint == "" && fs.RegionMap[bucket] == "" {
 		s3Client := s3.NewFromConfig(fs.awsConfig)
-		region, err := manager.GetBucketRegion(fs.Context().Ctx, s3Client, fs.bucket, func(o *s3.Options) {
+		region, err := manager.GetBucketRegion(fs.Context().Ctx, s3Client, bucket, func(o *s3.Options) {
 			o.Region = defaultRegion
 		})
 		if err != nil {
 			var apiErr smithy.APIError
 			if errors.As(err, &apiErr) && apiErr.ErrorCode() == "NotFound" {
-				g.Debug("unable to find bucket %s's region not found", fs.bucket)
-				g.Debug("Region not found for " + fs.bucket)
+				g.Debug("unable to find bucket %s's region not found", bucket)
+				g.Debug("Region not found for " + bucket)
			} else {
-				g.Debug(g.Error(err, "Error getting Region for "+fs.bucket).Error())
+				g.Debug(g.Error(err, "Error getting Region for "+bucket).Error())
 			}
 		} else {
-			fs.RegionMap[fs.bucket] = region
+			fs.RegionMap[bucket] = region
 		}
 	}
 	// Create a copy of the config with the appropriate region
 	configCopy := fs.awsConfig.Copy()
-	if fs.RegionMap[fs.bucket] != "" {
-		configCopy.Region = fs.RegionMap[fs.bucket]
+	if fs.RegionMap[bucket] != "" {
+		configCopy.Region = fs.RegionMap[bucket]
 	} else {
 		configCopy.Region = defaultRegion
 	}
@@ -394,16 +422,19 @@ func (r *S3ReaderWrapper) closeOnce() error {
 // path should specify the full path with scheme:
 // `s3://my_bucket/key/to/file.txt` or `s3://my_bucket/key/to/directory`
 func (fs *S3FileSysClient) GetReader(uri string) (reader io.Reader, err error) {
-	key, err := fs.GetPath(uri)
+	// Use getBucketAndPath to extract bucket and key without mutating fs.bucket.
+	// This is safe for concurrent use when reading from multiple buckets.
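// For illustration, with a single S3 connection both of the following now resolve against
// the bucket named in the URI (region resolved via getConfigForBucket); bucket-a and
// bucket-b are hypothetical names:
//   fs.GetReader("s3://bucket-a/path/file.csv")
//   fs.GetReader("s3://bucket-b/other/file.csv")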
+ bucket, key, err := fs.getBucketAndPath(uri) if err != nil { return } - svc := s3.NewFromConfig(fs.getConfig()) + // Get config for the specific bucket (handles region lookup) + svc := s3.NewFromConfig(fs.getConfigForBucket(bucket)) // Use GetObject directly for streaming result, err := svc.GetObject(fs.Context().Ctx, &s3.GetObjectInput{ - Bucket: aws.String(fs.bucket), + Bucket: aws.String(bucket), Key: aws.String(key), }) if err != nil { diff --git a/core/dbio/filesys/fs_test.go b/core/dbio/filesys/fs_test.go index 244698ee5..005aff5a4 100755 --- a/core/dbio/filesys/fs_test.go +++ b/core/dbio/filesys/fs_test.go @@ -702,7 +702,7 @@ func TestFileSysDOSpaces(t *testing.T) { "ENDPOINT=nyc3.digitaloceanspaces.com", "ACCESS_KEY_ID="+os.Getenv("DOS_ACCESS_KEY_ID"), "SECRET_ACCESS_KEY="+os.Getenv("DOS_SECRET_ACCESS_KEY"), - "METADATA="+g.Marshal(iop.Metadata{LoadedAt: iop.KeyValue{"loaded_at", time.Now().Unix()}, StreamURL: iop.KeyValue{"url", ""}}), + "METADATA="+g.Marshal(iop.Metadata{SyncedAt: iop.KeyValue{"loaded_at", time.Now().Unix()}, StreamURL: iop.KeyValue{"url", ""}}), ) assert.NoError(t, err) diff --git a/core/dbio/iop/README.md b/core/dbio/iop/README.md deleted file mode 100755 index 175ffcc31..000000000 --- a/core/dbio/iop/README.md +++ /dev/null @@ -1,2 +0,0 @@ - -## Input-Process-Output (ipo) \ No newline at end of file diff --git a/core/dbio/iop/datastream.go b/core/dbio/iop/datastream.go index cee8f507d..9a1ec7c7b 100644 --- a/core/dbio/iop/datastream.go +++ b/core/dbio/iop/datastream.go @@ -52,6 +52,7 @@ type Datastream struct { Bytes atomic.Uint64 Sp *StreamProcessor SafeInference bool + SchemaOnly bool NoDebug bool Inferred bool deferFuncs []func() @@ -87,6 +88,7 @@ type FileStreamConfig struct { IncrementalValue string `json:"incremental_value"` FileSelect []string `json:"file_select"` // a list of files to include. DuckDBFilename bool `json:"duckdb_filename"` // stream URL + SchemaOnly bool `json:"schema_only"` Props map[string]string `json:"props"` } @@ -118,7 +120,8 @@ type KeyValue struct { type Metadata struct { StreamURL KeyValue `json:"stream_url"` - LoadedAt KeyValue `json:"loaded_at"` + SyncedAt KeyValue `json:"synced_at"` + SyncedOp KeyValue `json:"synced_op"` RowNum KeyValue `json:"row_num"` RowID KeyValue `json:"row_id"` ExecID KeyValue `json:"exec_id"` @@ -261,6 +264,13 @@ func (ds *Datastream) processBwRows() { } } +// StartBwProcessor starts the bytes-written processor goroutine. +// This should be called when creating a datastream that pushes rows +// directly without calling Start() (e.g., chunked streaming). 
+func (ds *Datastream) StartBwProcessor() { + go ds.processBwRows() +} + // SetReady sets the ds.ready func (ds *Datastream) SetReady() { if !ds.Ready { @@ -804,29 +814,47 @@ skipBuffer: return name } - if ds.Metadata.LoadedAt.Key != "" && ds.Metadata.LoadedAt.Value != nil { - ds.Metadata.LoadedAt.Key = ensureName(ds.Metadata.LoadedAt.Key) + if ds.Metadata.SyncedAt.Key != "" && ds.Metadata.SyncedAt.Value != nil { + ds.Metadata.SyncedAt.Key = ensureName(ds.Metadata.SyncedAt.Key) // handle timestamp value isTimestamp := false - if tVal, err := cast.ToTimeE(ds.Metadata.LoadedAt.Value); err == nil { + if tVal, err := cast.ToTimeE(ds.Metadata.SyncedAt.Value); err == nil { isTimestamp = true - ds.Metadata.LoadedAt.Value = tVal + ds.Metadata.SyncedAt.Value = tVal } else { - ds.Metadata.LoadedAt.Value = cast.ToInt64(ds.Metadata.LoadedAt.Value) + ds.Metadata.SyncedAt.Value = cast.ToInt64(ds.Metadata.SyncedAt.Value) } col := Column{ - Name: ds.Metadata.LoadedAt.Key, + Name: ds.Metadata.SyncedAt.Key, Type: lo.Ternary(isTimestamp, TimestampzType, IntegerType), Position: len(ds.Columns) + 1, - Description: "Sling.Metadata.LoadedAt", - Metadata: map[string]string{"sling_metadata": "loaded_at"}, + Description: "Sling.Metadata.SyncedAt", + Metadata: map[string]string{"sling_metadata": "synced_at"}, + Sourced: true, + } + ds.Columns = append(ds.Columns, col) + metaValuesMap[col.Position-1] = func(it *Iterator) any { + return ds.Metadata.SyncedAt.Value + } + } + + if ds.Metadata.SyncedOp.Key != "" && ds.Metadata.SyncedOp.Value != nil { + ds.Metadata.SyncedOp.Key = ensureName(ds.Metadata.SyncedOp.Key) + + col := Column{ + Name: ds.Metadata.SyncedOp.Key, + Type: StringType, + DbPrecision: 4, + Position: len(ds.Columns) + 1, + Description: "Sling.Metadata.SyncedOp", + Metadata: map[string]string{"sling_metadata": "synced_op"}, Sourced: true, } ds.Columns = append(ds.Columns, col) metaValuesMap[col.Position-1] = func(it *Iterator) any { - return ds.Metadata.LoadedAt.Value + return ds.Metadata.SyncedOp.Value } } @@ -956,6 +984,10 @@ skipBuffer: loop: for ds.it.next() { + if ds.SchemaOnly { + break // don't push any rows + } + schemaChgLoop: for { // reprocess row if needed (to expand it as needed) diff --git a/core/dbio/iop/datatype.go b/core/dbio/iop/datatype.go index 10f3281d7..b8ee0f76f 100755 --- a/core/dbio/iop/datatype.go +++ b/core/dbio/iop/datatype.go @@ -1626,7 +1626,7 @@ func NewSelector(selectExprs []string, casing ColumnCasing) *Selector { continue } - field, newName, isExclude, _ := parseSelectExpr(expr) + field, newName, isExclude, _ := ParseSelectExpr(expr) fieldLower := strings.ToLower(field) if isExclude { @@ -1694,7 +1694,7 @@ func (s *Selector) compute(name, nameLower string) string { // 4. Check glob exclusions for _, pattern := range s.excludeGlobs { - if matchesSelectGlob(nameLower, pattern) { + if MatchesSelectGlob(nameLower, pattern) { return "" } } @@ -1711,7 +1711,7 @@ func (s *Selector) compute(name, nameLower string) string { // 7. 
Check glob inclusions for _, pattern := range s.includeGlobs { - if matchesSelectGlob(nameLower, pattern) { + if MatchesSelectGlob(nameLower, pattern) { return name } } @@ -1774,7 +1774,7 @@ func applySelectAllMode(fields []string, fieldMap map[string]int, selectExprs [] continue } - field, newName, isExclude, err := parseSelectExpr(expr) + field, newName, isExclude, err := ParseSelectExpr(expr) if err != nil { return nil, err } @@ -1784,7 +1784,7 @@ func applySelectAllMode(fields []string, fieldMap map[string]int, selectExprs [] if strings.Contains(field, "*") { // Glob exclusion for i, f := range fields { - if matchesSelectGlob(strings.ToLower(f), strings.ToLower(field)) { + if MatchesSelectGlob(strings.ToLower(f), strings.ToLower(field)) { excluded[i] = true } } @@ -1831,7 +1831,7 @@ func applySelectExplicitMode(fields []string, fieldMap map[string]int, selectExp for _, expr := range selectExprs { expr = strings.TrimSpace(expr) - field, newName, isExclude, err := parseSelectExpr(expr) + field, newName, isExclude, err := ParseSelectExpr(expr) if err != nil { return nil, err } @@ -1847,7 +1847,7 @@ func applySelectExplicitMode(fields []string, fieldMap map[string]int, selectExp if added[i] { continue } - if matchesSelectGlob(strings.ToLower(f), strings.ToLower(field)) { + if MatchesSelectGlob(strings.ToLower(f), strings.ToLower(field)) { newFields = append(newFields, f) added[i] = true } @@ -1873,7 +1873,7 @@ func applySelectExplicitMode(fields []string, fieldMap map[string]int, selectExp return newFields, nil } -// parseSelectExpr parses a single select expression +// ParseSelectExpr parses a single select expression // Returns: (fieldName, newName, isExclusion, error) // Examples: // @@ -1882,7 +1882,7 @@ func applySelectExplicitMode(fields []string, fieldMap map[string]int, selectExp // "field as new" -> ("field", "new", false, nil) // "prefix*" -> ("prefix*", "", false, nil) // "-*suffix" -> ("*suffix", "", true, nil) -func parseSelectExpr(expr string) (field string, newName string, exclude bool, err error) { +func ParseSelectExpr(expr string) (field string, newName string, exclude bool, err error) { expr = strings.TrimSpace(expr) // Check for exclusion prefix @@ -1908,9 +1908,9 @@ func parseSelectExpr(expr string) (field string, newName string, exclude bool, e return field, "", exclude, nil } -// matchesSelectGlob checks if name matches a simple glob pattern (prefix* or *suffix) +// MatchesSelectGlob checks if name matches a simple glob pattern (prefix* or *suffix) // Both name and pattern should be lowercase for case-insensitive matching -func matchesSelectGlob(name, pattern string) bool { +func MatchesSelectGlob(name, pattern string) bool { if !strings.Contains(pattern, "*") { return name == pattern } diff --git a/core/dbio/iop/duckdb.go b/core/dbio/iop/duckdb.go index 927adea80..de9520f34 100644 --- a/core/dbio/iop/duckdb.go +++ b/core/dbio/iop/duckdb.go @@ -1426,13 +1426,17 @@ func (duck *DuckDb) MakeScanQuery(format dbio.FileType, uri string, fsc FileStre } // reserved word to use for timestamp comparison (when listing) - const slingLoadedAtColumn = "_sling_loaded_at" - if fsc.IncrementalKey != "" && fsc.IncrementalKey != slingLoadedAtColumn && + if fsc.IncrementalKey != "" && fsc.IncrementalKey != env.ReservedFields.LoadedAt && fsc.IncrementalValue != "" { incrementalWhereCond = g.F("%s > %s", dbio.TypeDbDuckDb.Quote(fsc.IncrementalKey), fsc.IncrementalValue) where = g.F("where %s", incrementalWhereCond) } + // we need only the types + if fsc.SchemaOnly { + where = "where 1=0" 
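// With the 1=0 predicate the scan returns zero rows while DuckDB still infers column
// names and types from the files; the SchemaOnly flag arrives here via FileStreamConfig
// (see datastream.go above) and appears to be what the new definition-only mode in
// core/sling/config.go relies on to create the target definition without moving data.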
+ } + if format == dbio.FileTypeNone { g.Warn("duck.MakeScanQuery: format is empty, cannot determine stream_scanner") } diff --git a/core/dbio/iop/stream_processor.go b/core/dbio/iop/stream_processor.go index 4f7288d75..9ab6decac 100644 --- a/core/dbio/iop/stream_processor.go +++ b/core/dbio/iop/stream_processor.go @@ -1111,6 +1111,10 @@ func (sp *StreamProcessor) CastToStringE(val any) (valString string, err error) if err != nil { return "", g.Error(err, "could not cast to string: %#v", v) } + case time.Time: + valString = v.Format(time.RFC3339Nano) + case *time.Time: + valString = v.Format(time.RFC3339Nano) case chJSON: // Clickhouse JSON / Variant or any with MarshalJSON() var sBytes []byte sBytes, err = v.MarshalJSON() @@ -1582,6 +1586,7 @@ func (sp *StreamProcessor) CastRow(row []any, columns Columns) []any { for i, val := range row { col := &columns[i] row[i] = sp.CastVal(i, val, col) + // g.Warn("%d | col %s | nVal => %#v", sp.N, col.Name, row[i]) if row[i] != nil && row[i] != "" { sp.colStats[i].LastVal = row[i] } diff --git a/core/dbio/templates/azuredwh.yaml b/core/dbio/templates/azuredwh.yaml index 43398def0..a143928fa 100755 --- a/core/dbio/templates/azuredwh.yaml +++ b/core/dbio/templates/azuredwh.yaml @@ -9,8 +9,8 @@ core: update {table} as t1 set {set_fields2} from (select * from {temp_table}) as t2 where {pk_fields_equal} - insert: insert into {table} ({cols}) values ({values}) - insert_temp: insert into {table} ({cols}) select {cols} from {temp_table} + insert: insert into {table} ({fields}) values ({values}) + insert_temp: insert into {table} ({fields}) select {cols} from {temp_table} insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing update_temp: | diff --git a/core/dbio/templates/azuresql.yaml b/core/dbio/templates/azuresql.yaml index caf8b5493..9f4461667 100755 --- a/core/dbio/templates/azuresql.yaml +++ b/core/dbio/templates/azuresql.yaml @@ -9,8 +9,8 @@ core: update {table} as t1 set {set_fields2} from (select * from {temp_table}) as t2 where {pk_fields_equal} - insert: insert into {table} ({cols}) values ({values}) - insert_temp: insert into {table} ({cols}) select {cols} from {temp_table} + insert: insert into {table} ({fields}) values ({values}) + insert_temp: insert into {table} ({fields}) select {cols} from {temp_table} insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing update_temp: | diff --git a/core/dbio/templates/db2.yaml b/core/dbio/templates/db2.yaml new file mode 100644 index 000000000..1afccabc6 --- /dev/null +++ b/core/dbio/templates/db2.yaml @@ -0,0 +1,416 @@ +core: + drop_table: | + BEGIN + DECLARE CONTINUE HANDLER FOR SQLSTATE '42704' BEGIN END; + EXECUTE IMMEDIATE 'DROP TABLE {table}'; + END; + drop_view: | + BEGIN + DECLARE CONTINUE HANDLER FOR SQLSTATE '42704' BEGIN END; + EXECUTE IMMEDIATE 'DROP VIEW {view}'; + END; + drop_index: drop index {schema}.{index} + create_table: create table {table} ({col_types}) + create_index: create index {index} on {table} ({cols}) + create_unique_index: create unique index {index} on {table} ({cols}) + insert: insert into {table} ({fields}) values ({values}) + insert_temp: insert into {table} ({fields}) select {cols} from {temp_table} + sample: select {fields} from {table} 
TABLESAMPLE SYSTEM (50) fetch first {n} rows only + rename_table: rename table {table} to {new_table} + modify_column: alter table {table} alter column {column} set data type {type} + use_database: set current schema {schema} + delete_where_not_exist: | + delete from {target_table} + where {where} + and {unique_id} not in ( + select {unique_id} + from {temp_table} + ) + update_where_not_exist: | + update {target_table} + set {set_fields} + where {where} + and {unique_id} not in ( + select {unique_id} + from {temp_table} + ) + merge_update_insert: | + MERGE INTO {tgt_table} tgt + USING {src_table} src + ON ({src_tgt_pk_equal}) + WHEN MATCHED THEN + UPDATE SET {set_fields} + WHEN NOT MATCHED THEN + INSERT ({insert_fields}) VALUES ({src_insert_fields}) + merge_delete_insert: | + DELETE FROM {tgt_table} + WHERE EXISTS ( + SELECT 1 FROM {src_table} src + WHERE {src_tgt_pk_equal} + ); + INSERT INTO {tgt_table} ({insert_fields}) + SELECT {src_fields} FROM {src_table} + db2import: | + IMPORT FROM '{file}' OF DEL + MODIFIED BY COLDEL, CODEPAGE=1208 + METHOD P (1, 2, 3) + MESSAGES '{message_file}' + INSERT INTO {table} ({columns}) + +metadata: + current_database: + select current server from sysibm.sysdummy1 + + databases: | + select distinct dbname as name + from syscat.tables + order by dbname + + schemas: | + select schemaname as schema_name + from syscat.schemata + order by schemaname + + tables: | + select tabschema as schema_name, tabname as table_name, 'false' as is_view + from syscat.tables + where type = 'T' + {{if .schema -}} and tabschema = '{schema}' {{- end}} + order by tabschema, tabname + + views: | + select viewschema as schema_name, viewname as table_name, 'true' as is_view + from syscat.views + {{if .schema -}} where viewschema = '{schema}' {{- end}} + order by viewschema, viewname + + columns: | + select + colname as column_name, + typename as data_type, + case + when typename in ('DECIMAL', 'NUMERIC') then length + when typename in ('FLOAT', 'REAL', 'DOUBLE') then length + else null + end as precision, + case + when typename in ('DECIMAL', 'NUMERIC') then scale + else null + end as scale + from syscat.columns + where tabschema = '{schema}' + and tabname = '{table}' + order by colno + + primary_keys: | + select + constname as pk_name, + colseq as position, + colname as column_name + from syscat.keycoluse + where tabschema = '{schema}' + and tabname = '{table}' + and constname in ( + select constname + from syscat.tabconst + where tabschema = '{schema}' + and tabname = '{table}' + and type = 'P' + ) + order by colseq + + indexes: | + select + indname as index_name, + colname as column_name + from syscat.indexcoluse + where indschema = '{schema}' + and tabname = '{table}' + order by indname, colseq + + columns_full: | + select + tabschema as schema_name, + tabname as table_name, + colname as column_name, + typename as data_type, + colno as position + from syscat.columns + where 1=1 + {{if .schema -}} and tabschema = '{schema}' {{- end}} + {{if .table -}} and tabname = '{table}' {{- end}} + order by tabschema, tabname, colno + + schemata: | + select + c.tabschema as schema_name, + c.tabname as table_name, + case + when t.type = 'T' then 'false' + else 'true' + end as is_view, + c.colname as column_name, + c.typename as data_type, + c.colno as position + from syscat.columns c + left join syscat.tables t + on t.tabschema = c.tabschema + and t.tabname = c.tabname + where 1=1 + {{if .schema -}} and c.tabschema = '{schema}' {{- end}} + {{if .tables -}} and c.tabname in ({tables}) {{- 
end}} + order by c.tabschema, c.tabname, c.colno + + row_count_estimates: | + select + tabschema as schema_name, + tabname as table_name, + card as count + from syscat.tables + where type = 'T' + {{if .schema -}} and tabschema = '{schema}' {{- end}} + {{if .table -}} and tabname = '{table}' {{- end}} + order by card desc + + ddl_table: | + select 'CREATE TABLE "' || tabschema || '"."' || tabname || '" (' || + listagg( + '"' || colname || '" ' || typename || + case + when typename in ('VARCHAR', 'CHAR', 'GRAPHIC', 'VARGRAPHIC') then '(' || length || ')' + when typename in ('DECIMAL', 'NUMERIC') then '(' || length || ',' || scale || ')' + else '' + end || + case when nulls = 'N' then ' NOT NULL' else '' end, + ', ' + ) within group (order by colno) || ')' as ddl + from syscat.columns + where tabschema = '{schema}' + and tabname = '{table}' + group by tabschema, tabname + + ddl_view: | + select text as ddl + from syscat.views + where viewschema = '{schema}' + and viewname = '{table}' + + sessions: | + select + agent_id as sid, + application_handle, + application_name, + authid as username, + client_applname as program + from table(mon_get_connection(cast(null as bigint), -1)) + where application_name <> '' + + session_terminate: | + call admin_cmd('force application (' || {agent_id} || ')') + +analysis: + field_chars: | + select + '{schema}' as schema_nm, + '{table}' as table_nm, + '{field}' as field, + sum(case when locate(chr(10), {field}) > 0 then 1 else 0 end) as cnt_nline, + sum(case when locate(chr(9), {field}) > 0 then 1 else 0 end) as cnt_tab, + sum(case when locate(',', {field}) > 0 then 1 else 0 end) as cnt_comma, + sum(case when locate('"', {field}) > 0 then 1 else 0 end) as cnt_dquote, + min(length({field})) as f_min_len, + max(length({field})) as f_max_len + from "{schema}"."{table}" + + field_stat_len: | + select + '{schema}' as schema_nm, + '{table}' as table_nm, + '{field}' as field, + count(*) as tot_cnt, + min(length(varchar({field}))) as f_min_len, + max(length(varchar({field}))) as f_max_len + from "{schema}"."{table}" + + field_stat_deep: | + select + '{schema}' as schema_nm, + '{table}' as table_nm, + '{field}' as field, + count(*) as tot_cnt, + count({field}) as f_cnt, + count(*) - count({field}) as f_null_cnt, + round(100.0 * (count(*) - count({field})) / count(*), 1) as f_null_prct, + count(distinct {field}) as f_dstct_cnt, + round(100.0 * count(distinct {field}) / count(*), 1) as f_dstct_prct, + count(*) - count(distinct {field}) as f_dup_cnt, + cast(min({field}) as varchar(1000)) as f_min, + cast(max({field}) as varchar(1000)) as f_max, + min(length(varchar({field}))) as f_min_len, + max(length(varchar({field}))) as f_max_len + from "{schema}"."{table}" + + distro_field: | + with t1 as ( + select + '{field}' as field, + {field}, + count(*) as cnt + from "{schema}"."{table}" + group by {field} + order by count(*) desc + fetch first 1000 rows only + ), + t2 as ( + select + '{field}' as field, + count(*) as ttl_cnt + from "{schema}"."{table}" + ) + select + '{table}' as table_nm, + t1.field, + {field} as value, + cnt, + round(100.0 * cnt / ttl_cnt, 2) as prct + from t1 + join t2 on t1.field = t2.field + order by cnt desc + + distro_field_group: | + with t1 as ( + select + '{field}' as field, + {group_expr} as group_exp, + {field}, + count(*) as cnt + from "{schema}"."{table}" + group by {field}, {group_expr} + order by count(*) desc + fetch first 1000 rows only + ), + t2 as ( + select + '{field}' as field, + count(*) as ttl_cnt + from "{schema}"."{table}" + ) + select 
+ '{table}' as table_nm, + t1.field, + t1.group_exp, + {field} as value, + cnt, + round(100.0 * cnt / ttl_cnt, 2) as prct + from t1 + join t2 on t1.field = t2.field + order by cnt desc + + distro_field_date: | + with t1 as ( + select + '{field}' as field, + year({field}) as year, + month({field}) as month, + day({field}) as day, + count(*) as cnt + from "{schema}"."{table}" + group by year({field}), month({field}), day({field}) + order by year({field}), month({field}), day({field}) + ), + t2 as ( + select '{field}' as field, count(*) as ttl_cnt + from "{schema}"."{table}" + ) + select + '{schema}' as schema_nm, + '{table}' as table_nm, + t1.field, + t1.year, + t1.month, + t1.day, + cnt, + round(100.0 * cnt / ttl_cnt, 2) as prct + from t1 + join t2 on t1.field = t2.field + order by t1.year, t1.month, t1.day + +function: + truncate_f: trunc({field}, 0) + truncate_datef: date({field}) + string_type: varchar(32672) + cast_to_string: 'varchar({field})' + cast_to_text: 'varchar({field}, 32672)' + date_to_int: days({field}) - days('1970-01-01') + number_to_int: int({field}) + sleep: call dbms_lock.sleep({seconds}) + checksum_integer: cast(abs({field}) as bigint) + checksum_bigint: cast(abs({field}) as decimal(31,0)) + checksum_decimal: cast(abs({field}) as decimal(31,0)) + checksum_date: cast((days({field}) - days('1970-01-01')) as bigint) * 86400 + checksum_datetime: cast((days({field}) - days('1970-01-01')) as bigint) * 86400 + cast(midnight_seconds({field}) as bigint) + checksum_string: cast(length({field}) as bigint) + checksum_boolean: cast(length(varchar({field})) as bigint) + checksum_json: cast(length(varchar({field})) as bigint) + now: current timestamp + concat: concat({fields}) + +variable: + tmp_folder: /tmp + bind_string: "?" + error_filter_table_exists: undefined + error_ignore_drop_table: undefined + error_ignore_drop_view: undefined + max_string_type: varchar(32672) + max_string_length: 32672 + max_column_length: 128 + column_upper: true + +native_type_map: + smallint: smallint + integer: integer + int: integer + bigint: bigint + decimal: decimal + numeric: decimal + dec: decimal + real: float + float: float + double: float + double precision: float + decfloat: float + char: string + character: string + varchar: string + character varying: string + clob: text + blob: binary + varbinary: binary + binary: binary + date: date + time: time + timestamp: timestamp + boolean: bool + xml: text + graphic: string + vargraphic: string + dbclob: string + +general_type_map: + bigint: bigint + binary: varbinary(32672) + bool: boolean + date: date + datetime: timestamp + decimal: "decimal(,)" + integer: integer + json: clob + smallint: smallint + string: "varchar()" + text: clob + timestamp: timestamp + timestampz: timestamp + float: double + time: time + timez: time + uuid: char(36) \ No newline at end of file diff --git a/core/dbio/templates/exasol.yaml b/core/dbio/templates/exasol.yaml index 86b7dfe98..0c6a08ecd 100644 --- a/core/dbio/templates/exasol.yaml +++ b/core/dbio/templates/exasol.yaml @@ -191,7 +191,7 @@ bulk: function: add_months: add_months({field}, {num}) cast: cast({field} as {type}) - concat: concat({strings}) + concat: concat({fields}) date_diff_days: days_between({date1}, {date2}) date_diff_seconds: seconds_between({date1}, {date2}) date_parse_format: to_timestamp({string}, {format}) @@ -208,6 +208,7 @@ function: uuid: sys_guid() variable: + column_upper: true bool_as: string duplicates_group_by: false handle_null_compare: true diff --git a/core/dbio/templates/fabric.yaml 
b/core/dbio/templates/fabric.yaml index 00e2e7c86..224622874 100755 --- a/core/dbio/templates/fabric.yaml +++ b/core/dbio/templates/fabric.yaml @@ -18,8 +18,8 @@ core: incremental_select_limit: select top {limit} {fields} from {table} where ({incremental_where_cond}){where_and} order by {update_key} asc incremental_select_limit_offset: select top {limit} * from ( select {fields} from {table} where ({incremental_where_cond}){where_and} order by {update_key} asc offset {offset} rows) as t incremental_select: select {fields} from {table} where ({incremental_where_cond}){where_and} - insert: insert into {table} ({cols}) values ({values}) - insert_temp: insert into {table} ({cols}) select {cols} from {temp_table} + insert: insert into {table} ({fields}) values ({values}) + insert_temp: insert into {table} ({fields}) select {cols} from {temp_table} insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing update_temp: | diff --git a/core/dbio/templates/firebird.yaml b/core/dbio/templates/firebird.yaml new file mode 100644 index 000000000..4e33cefd6 --- /dev/null +++ b/core/dbio/templates/firebird.yaml @@ -0,0 +1,435 @@ +core: + drop_table: | + execute block as + begin + if (exists(select 1 from rdb$relations where (rdb$relation_name) = ('{table}'))) then + execute statement 'drop table {table}'; + end + drop_view: | + execute block as + begin + if (exists(select 1 from rdb$relations where rdb$relation_name = ('{view}'))) then + execute statement 'drop view {view}'; + end + drop_index: | + execute block as + begin + if (exists(select 1 from rdb$indices where rdb$index_name = ('{index}'))) then + execute statement 'drop index {index}'; + end + create_table: create table {table} ({col_types}) + create_index: create index {index} on {table} ({cols}) + create_unique_index: create unique index {index} on {table} ({cols}) + replace: | + update or insert into {table} ({fields}) + values ({values}) + matching ({pk_fields}) + replace_temp: | + merge into {table} t1 + using {temp_table} t2 + on ({pk_fields_equal}) + when matched then + update set {set_fields2} + when not matched then + insert ({names}) values ({names2}) + insert: insert into {table} ({fields}) values ({values}) + insert_temp: insert into {table} ({fields}) select {cols} from {temp_table} + insert_ignore: | + execute block as + begin + insert into {table} ({fields}) values ({values}); + when any do + begin + -- ignore duplicate key errors + end + end + insert_ignore_temp: | + merge into {table} t1 + using {temp_table} t2 + on ({pk_fields_equal}) + when not matched then + insert ({names}) values ({names2}) + update_temp: | + merge into {table} t1 + using {temp_table} t2 + on ({pk_fields_equal}) + when matched then + update set {set_fields2} + sample: select first {n} {fields} from {table} + rename_table: alter table {table} rename to {new_table} + modify_column: alter table {table} alter column {column} type {type} + use_database: -- firebird doesn't support USE DATABASE + +metadata: + + current_database: | + select MON$DATABASE_NAME as name + from MON$DATABASE + + databases: | + select MON$DATABASE_NAME as name + from MON$DATABASE + + schemas: | + select 'main' as schema_name + from RDB$DATABASE + + tables: | + select + 'main' as schema_name, + trim(r.RDB$RELATION_NAME) as table_name, + 'false' as is_view + from RDB$RELATIONS r + where r.RDB$SYSTEM_FLAG = 0 + and r.RDB$VIEW_BLR is 
null + order by r.RDB$RELATION_NAME + + views: | + select + 'main' as schema_name, + trim(r.RDB$RELATION_NAME) as table_name, + 'true' as is_view + from RDB$RELATIONS r + where r.RDB$SYSTEM_FLAG = 0 + and r.RDB$VIEW_BLR is not null + order by r.RDB$RELATION_NAME + + columns: | + select + trim(rf.RDB$FIELD_NAME) as column_name, + case f.RDB$FIELD_TYPE + when 7 then 'SMALLINT' + when 8 then 'INTEGER' + when 10 then 'FLOAT' + when 12 then 'DATE' + when 13 then 'TIME' + when 14 then 'CHAR(' || f.RDB$FIELD_LENGTH || ')' + when 16 then + case f.RDB$FIELD_SUB_TYPE + when 0 then 'BIGINT' + when 1 then 'NUMERIC(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')' + when 2 then 'DECIMAL(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')' + end + when 27 then 'DOUBLE PRECISION' + when 35 then 'TIMESTAMP' + when 37 then 'VARCHAR(' || f.RDB$FIELD_LENGTH || ')' + when 40 then 'CSTRING(' || f.RDB$FIELD_LENGTH || ')' + when 261 then 'BLOB' + else 'UNKNOWN' + end as data_type, + f.RDB$FIELD_PRECISION as "precision", + abs(f.RDB$FIELD_SCALE) as scale + from RDB$RELATION_FIELDS rf + join RDB$FIELDS f on rf.RDB$FIELD_SOURCE = f.RDB$FIELD_NAME + where trim(rf.RDB$RELATION_NAME) = '{table}' + order by rf.RDB$FIELD_POSITION + + primary_keys: | + select + trim(rc.RDB$CONSTRAINT_NAME) as pk_name, + sg.RDB$FIELD_POSITION + 1 as position, + trim(sg.RDB$FIELD_NAME) as column_name + from RDB$RELATION_CONSTRAINTS rc + join RDB$INDEX_SEGMENTS sg on rc.RDB$INDEX_NAME = sg.RDB$INDEX_NAME + where rc.RDB$CONSTRAINT_TYPE = 'PRIMARY KEY' + and (trim(rc.RDB$RELATION_NAME)) = ('{table}') + order by sg.RDB$FIELD_POSITION + + indexes: | + select + trim(i.RDB$INDEX_NAME) as index_name, + trim(sg.RDB$FIELD_NAME) as column_name + from RDB$INDICES i + join RDB$INDEX_SEGMENTS sg on i.RDB$INDEX_NAME = sg.RDB$INDEX_NAME + where i.RDB$SYSTEM_FLAG = 0 + and (trim(i.RDB$RELATION_NAME)) = ('{table}') + order by i.RDB$INDEX_NAME, sg.RDB$FIELD_POSITION + + columns_full: | + select + 'main' as schema_name, + trim(rf.RDB$RELATION_NAME) as table_name, + trim(rf.RDB$FIELD_NAME) as column_name, + case f.RDB$FIELD_TYPE + when 7 then 'SMALLINT' + when 8 then 'INTEGER' + when 10 then 'FLOAT' + when 12 then 'DATE' + when 13 then 'TIME' + when 14 then 'CHAR(' || f.RDB$FIELD_LENGTH || ')' + when 16 then + case f.RDB$FIELD_SUB_TYPE + when 0 then 'BIGINT' + when 1 then 'NUMERIC(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')' + when 2 then 'DECIMAL(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')' + end + when 27 then 'DOUBLE PRECISION' + when 35 then 'TIMESTAMP' + when 37 then 'VARCHAR(' || f.RDB$FIELD_LENGTH || ')' + when 40 then 'CSTRING(' || f.RDB$FIELD_LENGTH || ')' + when 261 then 'BLOB' + else 'UNKNOWN' + end as data_type, + rf.RDB$FIELD_POSITION + 1 as position + from RDB$RELATION_FIELDS rf + join RDB$FIELDS f on rf.RDB$FIELD_SOURCE = f.RDB$FIELD_NAME + join RDB$RELATIONS r on rf.RDB$RELATION_NAME = r.RDB$RELATION_NAME + where r.RDB$SYSTEM_FLAG = 0 + {{if .table -}} and (trim(rf.RDB$RELATION_NAME)) = ('{table}') {{- end}} + order by rf.RDB$RELATION_NAME, rf.RDB$FIELD_POSITION + + schemata: | + select + 'main' as schema_name, + trim(rf.RDB$RELATION_NAME) as table_name, + case + when r.RDB$VIEW_BLR is null then false + else true + end as is_view, + trim(rf.RDB$FIELD_NAME) as column_name, + case f.RDB$FIELD_TYPE + when 7 then 'SMALLINT' + when 8 then 'INTEGER' + when 10 then 'FLOAT' + when 12 then 'DATE' + when 13 then 'TIME' + when 14 then 'CHAR(' || f.RDB$FIELD_LENGTH || ')' + when 16 
then + case f.RDB$FIELD_SUB_TYPE + when 0 then 'BIGINT' + when 1 then 'NUMERIC(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')' + when 2 then 'DECIMAL(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')' + end + when 27 then 'DOUBLE PRECISION' + when 35 then 'TIMESTAMP' + when 37 then 'VARCHAR(' || f.RDB$FIELD_LENGTH || ')' + when 40 then 'CSTRING(' || f.RDB$FIELD_LENGTH || ')' + when 261 then 'BLOB' + else 'UNKNOWN' + end as data_type, + rf.RDB$FIELD_POSITION + 1 as position + from RDB$RELATION_FIELDS rf + join RDB$FIELDS f on rf.RDB$FIELD_SOURCE = f.RDB$FIELD_NAME + join RDB$RELATIONS r on rf.RDB$RELATION_NAME = r.RDB$RELATION_NAME + where r.RDB$SYSTEM_FLAG = 0 + {{if .tables -}} and trim(rf.RDB$RELATION_NAME) in ({tables}) {{- end}} + order by rf.RDB$RELATION_NAME, rf.RDB$FIELD_POSITION + + row_count_estimates: | + select + 'main' as schema_name, + trim(r.RDB$RELATION_NAME) as table_name, + 0 as count + from RDB$RELATIONS r + where r.RDB$SYSTEM_FLAG = 0 + and r.RDB$VIEW_BLR is null + {{if .table -}} and trim(r.RDB$RELATION_NAME) = '{table}' {{- end}} + + ddl_table: | + execute block + returns (ddl varchar(8000)) + as + declare variable field_name varchar(31); + declare variable field_type varchar(100); + declare variable field_null varchar(10); + declare variable first_field boolean = true; + begin + ddl = 'CREATE TABLE {table} ('; + + for select + trim(rf.RDB$FIELD_NAME), + case f.RDB$FIELD_TYPE + when 7 then 'SMALLINT' + when 8 then 'INTEGER' + when 10 then 'FLOAT' + when 12 then 'DATE' + when 13 then 'TIME' + when 14 then 'CHAR(' || f.RDB$FIELD_LENGTH || ')' + when 16 then + case f.RDB$FIELD_SUB_TYPE + when 0 then 'BIGINT' + when 1 then 'NUMERIC(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')' + when 2 then 'DECIMAL(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')' + end + when 27 then 'DOUBLE PRECISION' + when 35 then 'TIMESTAMP' + when 37 then 'VARCHAR(' || f.RDB$FIELD_LENGTH || ')' + when 40 then 'CSTRING(' || f.RDB$FIELD_LENGTH || ')' + when 261 then 'BLOB' + else 'UNKNOWN' + end, + case when rf.RDB$NULL_FLAG = 1 then 'NOT NULL' else '' end + from RDB$RELATION_FIELDS rf + join RDB$FIELDS f on rf.RDB$FIELD_SOURCE = f.RDB$FIELD_NAME + where (trim(rf.RDB$RELATION_NAME)) = ('{table}') + order by rf.RDB$FIELD_POSITION + into :field_name, :field_type, :field_null + do + begin + if (not first_field) then + ddl = ddl || ', '; + ddl = ddl || field_name || ' ' || field_type || ' ' || field_null; + first_field = false; + end + + ddl = ddl || ')'; + suspend; + end + + ddl_view: | + select RDB$VIEW_SOURCE as ddl + from RDB$RELATIONS + where (trim(RDB$RELATION_NAME)) = ('{table}') + + sessions: | + select + a.MON$ATTACHMENT_ID as pid, + a.MON$USER as username, + a.MON$REMOTE_ADDRESS as client_addr, + a.MON$STATE as state, + s.MON$SQL_TEXT as query + from MON$ATTACHMENTS a + left join MON$STATEMENTS s on a.MON$ATTACHMENT_ID = s.MON$ATTACHMENT_ID + where a.MON$ATTACHMENT_ID <> current_connection + + session_terminate: delete from MON$ATTACHMENTS where MON$ATTACHMENT_ID = {pid} + +analysis: + field_chars: | + select + 'main' as schema_nm, + '{table}' as table_nm, + '{field}' as field, + sum(case when {field} containing ascii_char(10) then 1 else 0 end) as cnt_nline, + sum(case when {field} containing ascii_char(9) then 1 else 0 end) as cnt_tab, + sum(case when {field} containing ',' then 1 else 0 end) as cnt_comma, + sum(case when {field} containing '"' then 1 else 0 end) as cnt_dquote, + min(char_length({field})) as f_min_len, + 
max(char_length({field})) as f_max_len + from {table} + + field_stat_len: | + select + 'main' as schema_nm, + '{table}' as table_nm, + '{field}' as field, + count(*) as tot_cnt, + min(char_length(cast({field} as varchar(8000)))) as f_min_len, + max(char_length(cast({field} as varchar(8000)))) as f_max_len + from {table} + + field_stat_deep: | + select + 'main' as schema_nm, + '{table}' as table_nm, + '{field}' as field, + count(*) as tot_cnt, + count({field}) as f_cnt, + count(*) - count({field}) as f_null_cnt, + cast(100.0 * (count(*) - count({field})) / count(*) as numeric(5,1)) as f_null_prct, + count(distinct {field}) as f_dstct_cnt, + cast(100.0 * count(distinct {field}) / count(*) as numeric(5,1)) as f_dstct_prct, + count(*) - count(distinct {field}) as f_dup_cnt, + cast(min({field}) as varchar(255)) as f_min, + cast(max({field}) as varchar(255)) as f_max, + min(char_length(cast({field} as varchar(8000)))) as f_min_len, + max(char_length(cast({field} as varchar(8000)))) as f_max_len + from {table} + + distro_field: | + select first 1000 + '{table}' as table_nm, + '{field}' as field, + {field} as value, + count(*) as cnt, + cast(100.0 * count(*) / (select count(*) from {table}) as numeric(5,2)) as prct + from {table} + group by {field} + order by count(*) desc + + distro_field_group: | + select first 1000 + '{table}' as table_nm, + '{field}' as field, + {group_expr} as group_exp, + {field} as value, + count(*) as cnt, + cast(100.0 * count(*) / (select count(*) from {table}) as numeric(5,2)) as prct + from {table} + group by {field}, {group_expr} + order by count(*) desc + + distro_field_date: | + select + 'main' as schema_nm, + '{table}' as table_nm, + '{field}' as field, + extract(year from {field}) as year, + extract(month from {field}) as month, + extract(day from {field}) as day, + count(*) as cnt, + cast(100.0 * count(*) / (select count(*) from {table}) as numeric(5,2)) as prct + from {table} + group by 1, 2, 3, 4, 5, 6 + order by 4, 5, 6 + +function: + truncate_f: cast({field} as integer) + truncate_datef: cast({field} as date) + string_type: varchar(8000) + cast_to_string: 'cast({field} as varchar(8000))' + cast_to_text: 'cast({field} as varchar(8000))' + date_to_int: datediff(day, date '1900-01-01', {field}) + number_to_int: cast({field} as integer) + sleep: -- firebird doesn't have a sleep function + checksum_datetime: cast(datediff(second, timestamp '1970-01-01 00:00:00', {field}) * 1000000 as bigint) + checksum_string: char_length({field}) + checksum_boolean: char_length(cast({field} as varchar(10))) + checksum_json: char_length(replace({field}, ' ', '')) + now: current_timestamp + +variable: + tmp_folder: /tmp + bind_string: '?' 
+ error_filter_table_exists: already exists + max_string_type: varchar(8000) + max_string_length: 8000 + max_column_length: 31 + +native_type_map: + smallint: smallint + short: smallint + long: integer + integer: integer + bigint: bigint + float: float + double precision: float + numeric: decimal + decimal: decimal + date: date + time: time + timestamp: timestamp + char: string + varchar: text + blob: text + boolean: bool + varying: text + +general_type_map: + bigint: bigint + binary: blob + bool: boolean + date: date + datetime: timestamp + decimal: "decimal(,)" + integer: integer + json: varchar(8000) + smallint: smallint + string: "varchar()" + text: varchar(8000) + timestamp: timestamp + timestampz: timestamp with local time zone + float: double precision + time: time + timez: time + uuid: varchar(36) \ No newline at end of file diff --git a/core/dbio/templates/mariadb.yaml b/core/dbio/templates/mariadb.yaml index cdf61e684..d783be6f1 100644 --- a/core/dbio/templates/mariadb.yaml +++ b/core/dbio/templates/mariadb.yaml @@ -9,6 +9,15 @@ core: alter_columns: alter table {table} modify {col_ddl} modify_column: '{column} {type}' + load_data_local_reader: | + LOAD DATA LOCAL INFILE 'Reader::{handler_name}' + INTO TABLE {table} + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + ESCAPED BY '\\' + LINES TERMINATED BY '\n' + IGNORE 1 LINES + metadata: current_database: select database() as name from dual diff --git a/core/dbio/templates/mysql.yaml b/core/dbio/templates/mysql.yaml index e1e81ed3a..3ee0cf3a8 100755 --- a/core/dbio/templates/mysql.yaml +++ b/core/dbio/templates/mysql.yaml @@ -9,6 +9,15 @@ core: alter_columns: alter table {table} modify {col_ddl} modify_column: '{column} {type}' + load_data_local_reader: | + LOAD DATA LOCAL INFILE 'Reader::{handler_name}' + INTO TABLE {table} + FIELDS TERMINATED BY ',' + OPTIONALLY ENCLOSED BY '"' + ESCAPED BY '\\' + LINES TERMINATED BY '\n' + IGNORE 1 LINES + metadata: current_database: select database() as name from dual diff --git a/core/dbio/templates/postgres.yaml b/core/dbio/templates/postgres.yaml index ff2a99214..7c3b9668d 100755 --- a/core/dbio/templates/postgres.yaml +++ b/core/dbio/templates/postgres.yaml @@ -14,8 +14,8 @@ core: update {table} as t1 set {set_fields2} from (select * from {temp_table}) as t2 where {pk_fields_equal} - insert: insert into {table} ({cols}) values ({values}) - insert_temp: insert into {table} ({cols}) select {cols} from {temp_table} + insert: insert into {table} ({fields}) values ({values}) + insert_temp: insert into {table} ({fields}) select {cols} from {temp_table} insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing update_temp: | diff --git a/core/dbio/templates/redshift.yaml b/core/dbio/templates/redshift.yaml index 3ae76dd54..76ec0be8a 100755 --- a/core/dbio/templates/redshift.yaml +++ b/core/dbio/templates/redshift.yaml @@ -26,7 +26,7 @@ core: # optimize_column: | # alter table {table} rename to {table_old}; # create table {table} ( {col_ddl} ); - # insert into {table} ({cols}) + # insert into {table} ({fields}) # select {cols} # from {table_old}; # drop table {table_old}; diff --git a/core/dbio/templates/sqlserver.yaml b/core/dbio/templates/sqlserver.yaml index 62d883e54..bbee555bc 100755 --- a/core/dbio/templates/sqlserver.yaml +++ b/core/dbio/templates/sqlserver.yaml @@ -21,8 +21,8 @@ core: incremental_select_limit: select top 
{limit} {fields} from {table} where ({incremental_where_cond}){where_and} order by {update_key} asc incremental_select_limit_offset: select top {limit} * from ( select {fields} from {table} where ({incremental_where_cond}){where_and} order by {update_key} asc offset {offset} rows) as t incremental_select: select {fields} from {table} where ({incremental_where_cond}){where_and} - insert: insert into {table} ({cols}) values ({values}) - insert_temp: insert into {table} ({cols}) select {cols} from {temp_table} + insert: insert into {table} ({fields}) values ({values}) + insert_temp: insert into {table} ({fields}) select {cols} from {temp_table} insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing update_temp: | diff --git a/core/dbio/templates/trino.yaml b/core/dbio/templates/trino.yaml index 704d16b7b..f02fb8260 100755 --- a/core/dbio/templates/trino.yaml +++ b/core/dbio/templates/trino.yaml @@ -11,13 +11,13 @@ core: update {table} as t1 set {set_fields2} from (select * from {temp_table}) as t2 where {pk_fields_equal} - insert: insert into {table} ({cols}) values ({values}) + insert: insert into {table} ({fields}) values ({values}) limit: select {fields} from {table} offset {offset} limit {limit} limit_sql: | select * from ( {sql} ) as t offset {offset} limit {limit} - insert_temp: insert into {table} ({cols}) select {cols} from {temp_table} + insert_temp: insert into {table} ({fields}) select {cols} from {temp_table} insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing update_temp: | diff --git a/core/env/env.go b/core/env/env.go index b81c2bad7..0621f41d7 100755 --- a/core/env/env.go +++ b/core/env/env.go @@ -19,6 +19,7 @@ import ( ) var ( + Marker = "Sling CLI | https://slingdata.io" HomeDir = os.Getenv("SLING_HOME_DIR") HomeDirEnvFile = "" Env = &EnvFile{} @@ -47,6 +48,26 @@ var ( return path.Join(RuntimeFolder(), g.F("%s.json", name)) } setupOtel = func() {} + + ReservedFields = struct { + LoadedAt string + SyncedAt string + SyncedOp string + DeletedAt string + StreamURL string + RowNum string + RowID string + ExecID string + }{ + LoadedAt: "_sling_loaded_at", + SyncedAt: "_sling_synced_at", + SyncedOp: "_sling_synced_op", + DeletedAt: "_sling_deleted_at", + StreamURL: "_sling_stream_url", + RowNum: "_sling_row_num", + RowID: "_sling_row_id", + ExecID: "_sling_exec_id", + } ) const ( diff --git a/core/sling/config.go b/core/sling/config.go index 8bdd7d4f6..28088c989 100644 --- a/core/sling/config.go +++ b/core/sling/config.go @@ -15,6 +15,7 @@ import ( "github.com/slingdata-io/sling-cli/core/dbio/connection" "github.com/slingdata-io/sling-cli/core/dbio/database" "github.com/slingdata-io/sling-cli/core/dbio/filesys" + "github.com/slingdata-io/sling-cli/core/env" "github.com/spf13/cast" "github.com/flarco/g" @@ -41,6 +42,8 @@ const ( SnapshotMode Mode = "snapshot" // BackfillMode is to backfill BackfillMode Mode = "backfill" + // DefinitionOnlyMode is to create table/file definition without data + DefinitionOnlyMode Mode = "definition-only" ) var AllMode = []struct { @@ -52,6 +55,7 @@ var AllMode = []struct { {TruncateMode, "TruncateMode"}, {SnapshotMode, "SnapshotMode"}, {BackfillMode, "BackfillMode"}, + {DefinitionOnlyMode, "DefinitionOnlyMode"}, } // 
NewConfig return a config object from a YAML / JSON string @@ -158,7 +162,14 @@ func (cfg *Config) SetDefault() { cfg.Mode = FullRefreshMode } - if val := os.Getenv("SLING_LOADED_AT_COLUMN"); val != "" { + if val := os.Getenv("SLING_SYNCED_AT_COLUMN"); val != "" { + if cast.ToBool(val) { + cfg.MetadataSyncedAt = g.Bool(true) + env.ReservedFields.DeletedAt = env.ReservedFields.SyncedAt // deleted_at becomes synched_at + } else { + cfg.MetadataSyncedAt = g.Bool(false) + } + } else if val := os.Getenv("SLING_LOADED_AT_COLUMN"); val != "" { if cast.ToBool(val) || val == "unix" || val == "timestamp" { cfg.MetadataLoadedAt = g.Bool(true) } else { @@ -339,9 +350,9 @@ func (cfg *Config) DetermineType() (Type JobType, err error) { } } - validMode := g.In(cfg.Mode, FullRefreshMode, IncrementalMode, BackfillMode, SnapshotMode, TruncateMode) + validMode := g.In(cfg.Mode, FullRefreshMode, IncrementalMode, BackfillMode, SnapshotMode, TruncateMode, DefinitionOnlyMode) if !validMode { - err = g.Error("must specify valid mode: full-refresh, incremental, backfill, snapshot or truncate") + err = g.Error("must specify valid mode: full-refresh, incremental, backfill, snapshot, truncate, or definition-only") return } @@ -359,9 +370,11 @@ func (cfg *Config) DetermineType() (Type JobType, err error) { // OK, no need for update key } else if srcApiProvided { // OK, no need for update key/pk, API uses SLING_STATE for tracking - } else if srcFileProvided && cfg.Source.UpdateKey == slingLoadedAtColumn { + } else if srcFileProvided && cfg.Source.UpdateKey == env.ReservedFields.LoadedAt { // need to loaded_at column for file incremental cfg.MetadataLoadedAt = g.Bool(true) + } else if srcFileProvided && cfg.Source.UpdateKey == env.ReservedFields.SyncedAt { + cfg.MetadataSyncedAt = g.Bool(true) } else if cfg.Source.UpdateKey == "" && len(cfg.Source.PrimaryKey()) == 0 { err = g.Error("must specify value for 'update_key' and/or 'primary_key' for incremental mode. 
See docs for more details: https://docs.slingdata.io/sling-cli/run/configuration") if args := os.Getenv("SLING_CLI_ARGS"); strings.Contains(args, "-src-conn") || strings.Contains(args, "-tgt-conn") { @@ -386,6 +399,15 @@ func (cfg *Config) DetermineType() (Type JobType, err error) { } } else if cfg.Mode == SnapshotMode { cfg.MetadataLoadedAt = g.Bool(true) // needed for snapshot mode + } else if cfg.Mode == DefinitionOnlyMode { + // For file targets, only parquet and arrow formats are supported + if tgtFileProvided { + format := cfg.Target.ObjectFileFormat() + if !g.In(format, dbio.FileTypeParquet, dbio.FileTypeArrow) { + err = g.Error("definition-only mode for file targets only supports parquet or arrow formats, got: %s", format) + return + } + } } if srcDbProvided && tgtDbProvided { @@ -860,8 +882,10 @@ func (cfg *Config) FormatTargetObjectName() (err error) { cfg.Target.Object = strings.TrimSpace(renderedObject) if cfg.TgtConn.Type.IsDb() { + dbType := cfg.TgtConn.GetType() + // normalize casing of object names - table, err := database.ParseTableName(cfg.Target.Object, cfg.TgtConn.Type) + table, err := database.ParseTableName(cfg.Target.Object, dbType) if err != nil { return g.Error(err, "could not parse target table name") } else if table.IsQuery() { @@ -875,7 +899,7 @@ func (cfg *Config) FormatTargetObjectName() (err error) { if tgtOpts := cfg.Target.Options; tgtOpts != nil { tgtOpts.TableTmp = strings.TrimSpace(g.Rm(tgtOpts.TableTmp, m)) if tgtOpts.TableTmp != "" { - tableTmp, err := database.ParseTableName(tgtOpts.TableTmp, cfg.TgtConn.Type) + tableTmp, err := database.ParseTableName(tgtOpts.TableTmp, dbType) if err != nil { return g.Error(err, "could not parse temp table name") } else if tableTmp.Schema == "" { @@ -885,17 +909,17 @@ func (cfg *Config) FormatTargetObjectName() (err error) { } } - if cfg.TgtConn.Type.DBNameUpperCase() { + if dbType.DBNameUpperCase() { tableTmp.Name = strings.ToUpper(tableTmp.Name) } tgtOpts.TableTmp = tableTmp.FullName() - } else if g.In(cfg.TgtConn.Type, dbio.TypeDbDuckDb, dbio.TypeDbDuckLake) { + } else if g.In(dbType, dbio.TypeDbDuckDb, dbio.TypeDbDuckLake) { // for duckdb and ducklake, we'll use a temp table, which uses the 'main' schema - tableTmp := makeTempTableName(cfg.TgtConn.Type, table, "_sling_duckdb_tmp") + tableTmp := makeTempTableName(dbType, table, "_sling_duckdb_tmp") tableTmp.Schema = "main" tgtOpts.TableTmp = tableTmp.FullName() } else { - tableTmp := makeTempTableName(cfg.TgtConn.Type, table, "_tmp") + tableTmp := makeTempTableName(dbType, table, "_tmp") tgtOpts.TableTmp = tableTmp.FullName() } } @@ -944,8 +968,13 @@ func (cfg *Config) GetFormatMap() (m map[string]any, err error) { m["target_name"] = strings.ToLower(cfg.Target.Conn) } - if cfg.ReplicationStream != nil && cfg.ReplicationStream.ID != "" { - m["stream_run_id"] = cfg.ReplicationStream.ID + if cfg.ReplicationStream != nil { + if cfg.ReplicationStream.ID != "" { + m["stream_run_id"] = cfg.ReplicationStream.ID + } + if cfg.ReplicationStream.Description != "" { + m["stream_description"] = cfg.ReplicationStream.Description + } } if cfg.SrcConn.Type.IsDb() { @@ -1227,6 +1256,7 @@ type Config struct { IncrementalGTE bool `json:"incremental_gte,omitempty" yaml:"incremental_gte,omitempty"` MetadataLoadedAt *bool `json:"-" yaml:"-"` + MetadataSyncedAt *bool `json:"-" yaml:"-"` MetadataStreamURL bool `json:"-" yaml:"-"` MetadataRowNum bool `json:"-" yaml:"-"` MetadataRowID bool `json:"-" yaml:"-"` diff --git a/core/sling/task.go b/core/sling/task.go index 53411ea3a..2057e03e5 
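For reference, a simplified standalone sketch of the environment-variable precedence introduced in `Config.SetDefault`: `SLING_SYNCED_AT_COLUMN` (a plain boolean) takes priority over the legacy `SLING_LOADED_AT_COLUMN`, which additionally accepts `unix` and `timestamp`. Helper names here are illustrative only; the real code uses `cast.ToBool` and sets `cfg.MetadataSyncedAt` / `cfg.MetadataLoadedAt`:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// resolveMetadataColumns mirrors the precedence in Config.SetDefault:
// SLING_SYNCED_AT_COLUMN wins; otherwise SLING_LOADED_AT_COLUMN is consulted,
// where "unix" and "timestamp" also count as enabled.
func resolveMetadataColumns() (syncedAt, loadedAt *bool) {
	toBool := func(s string) bool { b, _ := strconv.ParseBool(s); return b }

	if val := os.Getenv("SLING_SYNCED_AT_COLUMN"); val != "" {
		b := toBool(val)
		return &b, nil
	}
	if val := os.Getenv("SLING_LOADED_AT_COLUMN"); val != "" {
		b := toBool(val) || val == "unix" || val == "timestamp"
		return nil, &b
	}
	return nil, nil
}

func main() {
	os.Setenv("SLING_SYNCED_AT_COLUMN", "true")
	syncedAt, loadedAt := resolveMetadataColumns()
	fmt.Println(syncedAt != nil && *syncedAt, loadedAt) // true <nil>
}
```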
100644 --- a/core/sling/task.go +++ b/core/sling/task.go @@ -341,29 +341,34 @@ func (t *TaskExecution) GetRate(secWindow int) (rowRate, byteRate int64) { } func (t *TaskExecution) setGetMetadata() (metadata iop.Metadata) { - if t.Config.MetadataLoadedAt != nil && *t.Config.MetadataLoadedAt { - metadata.LoadedAt.Key = slingLoadedAtColumn + if t.Config.MetadataSyncedAt != nil && *t.Config.MetadataSyncedAt { + metadata.SyncedAt.Key = env.ReservedFields.SyncedAt + metadata.SyncedAt.Value = *t.StartTime // only timestamp + metadata.SyncedOp.Key = env.ReservedFields.SyncedOp + metadata.SyncedOp.Value = "I" // default to insert operation + } else if t.Config.MetadataLoadedAt != nil && *t.Config.MetadataLoadedAt { + metadata.SyncedAt.Key = env.ReservedFields.LoadedAt if os.Getenv("SLING_LOADED_AT_COLUMN") == "timestamp" { - metadata.LoadedAt.Value = *t.StartTime + metadata.SyncedAt.Value = *t.StartTime } else { - metadata.LoadedAt.Value = t.StartTime.Unix() + metadata.SyncedAt.Value = t.StartTime.Unix() } } if t.Config.MetadataStreamURL { - metadata.StreamURL.Key = slingStreamURLColumn + metadata.StreamURL.Key = env.ReservedFields.StreamURL } if t.Config.MetadataRowID { - metadata.RowID.Key = slingRowIDColumn + metadata.RowID.Key = env.ReservedFields.RowID } if t.Config.MetadataExecID { - metadata.ExecID.Key = slingExecIDColumn + metadata.ExecID.Key = env.ReservedFields.ExecID metadata.ExecID.Value = t.ExecID } if t.Config.MetadataRowNum { - metadata.RowNum.Key = slingRowNumColumn + metadata.RowNum.Key = env.ReservedFields.RowNum } // StarRocks: add _sling_row_id column if there is no primary, @@ -385,8 +390,8 @@ func (t *TaskExecution) setGetMetadata() (metadata iop.Metadata) { } if addRowIDCol { - metadata.RowID.Key = slingRowIDColumn - t.Config.Target.Options.TableKeys[iop.HashKey] = []string{slingRowIDColumn} + metadata.RowID.Key = env.ReservedFields.RowID + t.Config.Target.Options.TableKeys[iop.HashKey] = []string{env.ReservedFields.RowID} } } @@ -539,7 +544,7 @@ func (t *TaskExecution) getSourceOptionsMap() (options map[string]any) { } // set target type for column casing, name length validation - options["target_type"] = string(t.Config.TgtConn.Type) + options["target_type"] = string(t.Config.TgtConn.GetType()) return } @@ -568,7 +573,7 @@ func (t *TaskExecution) getTargetOptionsMap() (options map[string]any) { } // set target type for column casing, name length validation - options["target_type"] = string(t.Config.TgtConn.Type) + options["target_type"] = string(t.Config.TgtConn.GetType()) // set to delete file/folder options["delete_file"] = true diff --git a/core/sling/task_run.go b/core/sling/task_run.go index e9ab204f4..99d061197 100644 --- a/core/sling/task_run.go +++ b/core/sling/task_run.go @@ -25,13 +25,7 @@ import ( ) var ( - start time.Time - slingLoadedAtColumn = "_sling_loaded_at" - slingDeletedAtColumn = "_sling_deleted_at" - slingStreamURLColumn = "_sling_stream_url" - slingRowNumColumn = "_sling_row_num" - slingRowIDColumn = "_sling_row_id" - slingExecIDColumn = "_sling_exec_id" + start time.Time ) var deleteMissing func(*TaskExecution, database.Connection, database.Connection) error = func(_ *TaskExecution, _, _ database.Connection) error { @@ -483,7 +477,7 @@ func (t *TaskExecution) runFileToDB() (err error) { t.Context.Map.Set("incremental_value", t.Config.IncrementalValStr) } else if t.isIncrementalWithUpdateKey() && !t.Config.IsIncrementalWithRange() { if t.Config.Source.UpdateKey == "." 
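A condensed sketch of the new branch in `setGetMetadata`: when synced-at metadata is enabled it takes precedence, emitting `_sling_synced_at` (timestamp only) plus `_sling_synced_op` defaulting to `"I"`; otherwise the legacy `_sling_loaded_at` is emitted as a unix epoch unless `SLING_LOADED_AT_COLUMN=timestamp`. The `metadata` struct below is a stand-in for `iop.Metadata`:

```go
package main

import (
	"fmt"
	"os"
	"time"
)

// keyValue and metadata are stand-ins for the iop.Metadata fields used in
// setGetMetadata.
type keyValue struct {
	Key   string
	Value any
}

type metadata struct {
	SyncedAt keyValue
	SyncedOp keyValue
}

func buildMetadata(useSyncedAt, useLoadedAt bool, start time.Time) (m metadata) {
	switch {
	case useSyncedAt:
		m.SyncedAt = keyValue{"_sling_synced_at", start} // timestamp only
		m.SyncedOp = keyValue{"_sling_synced_op", "I"}   // default to insert op
	case useLoadedAt:
		if os.Getenv("SLING_LOADED_AT_COLUMN") == "timestamp" {
			m.SyncedAt = keyValue{"_sling_loaded_at", start}
		} else {
			m.SyncedAt = keyValue{"_sling_loaded_at", start.Unix()}
		}
	}
	return m
}

func main() {
	m := buildMetadata(true, false, time.Now())
	fmt.Println(m.SyncedAt.Key, m.SyncedOp.Value) // _sling_synced_at I
}
```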
{ - t.Config.Source.UpdateKey = slingLoadedAtColumn + t.Config.Source.UpdateKey = env.ReservedFields.LoadedAt } t.SetProgress("getting checkpoint value (%s)", t.Config.Source.UpdateKey) diff --git a/core/sling/task_run_read.go b/core/sling/task_run_read.go index 5995e0c32..c38afaf6e 100644 --- a/core/sling/task_run_read.go +++ b/core/sling/task_run_read.go @@ -12,6 +12,7 @@ import ( "github.com/slingdata-io/sling-cli/core/dbio/database" "github.com/slingdata-io/sling-cli/core/dbio/filesys" "github.com/slingdata-io/sling-cli/core/dbio/iop" + "github.com/slingdata-io/sling-cli/core/env" "github.com/spf13/cast" ) @@ -45,9 +46,19 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df if len(cfg.Source.Select) > 0 { selectFields = lo.Map(cfg.Source.Select, func(f string, i int) string { - // lookup column name - col := sTable.Columns.GetColumn(srcConn.Unquote(f)) + // Parse the expression to extract original column name + original, alias, isExclude, _ := iop.ParseSelectExpr(f) + + if isExclude { + return f // Pass through exclusion as-is for later handling + } + + // Lookup the original column (for case correction) + col := sTable.Columns.GetColumn(srcConn.Unquote(original)) if col != nil { + if alias != "" { + return col.Name + " as " + alias + } return col.Name } return f @@ -63,9 +74,12 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df } includedCols := lo.Filter(sTable.Columns, func(c iop.Column, i int) bool { + colNameLower := strings.ToLower(c.Name) for _, exField := range excluded { exField = srcConn.Unquote(strings.TrimPrefix(exField, "-")) - if strings.EqualFold(c.Name, exField) { + exFieldLower := strings.ToLower(exField) + // Use glob matching to support patterns like "address_*" + if iop.MatchesSelectGlob(colNameLower, exFieldLower) { return false } } @@ -156,8 +170,12 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df ) sFields := lo.Map(selectFields, func(sf string, i int) string { - col := sTable.Columns.GetColumn(srcConn.Unquote(sf)) + original, alias, _, _ := iop.ParseSelectExpr(sf) + col := sTable.Columns.GetColumn(srcConn.Unquote(original)) if col != nil { + if alias != "" { + return srcConn.Quote(col.Name) + " as " + srcConn.Quote(alias) + } return srcConn.Quote(col.Name) // apply quotes if match } return sf @@ -207,8 +225,12 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df // if {fields} placeholder is used, replace it with selected fields to avoid double wrapping if strings.Contains(sTable.SQL, "{fields}") { sFields := lo.Map(selectFields, func(sf string, i int) string { - col := sTable.Columns.GetColumn(srcConn.Unquote(sf)) + original, alias, _, _ := iop.ParseSelectExpr(sf) + col := sTable.Columns.GetColumn(srcConn.Unquote(original)) if col != nil { + if alias != "" { + return srcConn.Quote(col.Name) + " as " + srcConn.Quote(alias) + } return srcConn.Quote(col.Name) // apply quotes if match } return sf @@ -218,6 +240,11 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df selectFields = []string{"*"} } + // For definition-only mode, inject WHERE 1=0 to avoid reading data + if cfg.Mode == DefinitionOnlyMode { + cfg.Source.Where = "1=0" + } + // construct select statement for selected fields or where condition if len(selectFields) > 1 || selectFields[0] != "*" || cfg.Source.Where != "" || cfg.Source.Limit() > 0 { if sTable.SQL != "" && !cfg.SrcConn.Type.IsNoSQL() && !strings.Contains(sTable.SQL, "{fields}") { @@ 
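The exclusion path in `ReadFromDB` now glob-matches excluded fields, so a selector like `-address_*` drops every matching column. A rough standalone sketch of that behavior, using stdlib `path.Match` as a stand-in for `iop.MatchesSelectGlob`:

```go
package main

import (
	"fmt"
	"path"
	"strings"
)

// filterColumns drops columns matched by "-" exclusion patterns, e.g.
// "-address_*". Stdlib path.Match stands in for iop.MatchesSelectGlob here.
func filterColumns(cols, selects []string) []string {
	var excluded []string
	for _, f := range selects {
		if strings.HasPrefix(f, "-") {
			excluded = append(excluded, strings.ToLower(strings.TrimPrefix(f, "-")))
		}
	}

	var kept []string
	for _, c := range cols {
		skip := false
		for _, pattern := range excluded {
			if ok, _ := path.Match(pattern, strings.ToLower(c)); ok {
				skip = true
				break
			}
		}
		if !skip {
			kept = append(kept, c)
		}
	}
	return kept
}

func main() {
	cols := []string{"id", "name", "address_line1", "address_city"}
	fmt.Println(filterColumns(cols, []string{"-address_*"})) // [id name]
}
```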
-273,7 +300,7 @@ func (t *TaskExecution) ReadFromFile(cfg *Config) (df *iop.Dataflow, err error) if t.Config.HasIncrementalVal() && !t.Config.IsFileStreamWithStateAndParts() { // file stream incremental mode - if t.Config.Source.UpdateKey == slingLoadedAtColumn { + if g.In(t.Config.Source.UpdateKey, env.ReservedFields.LoadedAt, env.ReservedFields.SyncedAt) { options["SLING_FS_TIMESTAMP"] = t.Config.IncrementalValStr g.Debug(`file stream using file_sys_timestamp=%#v and update_key=%s`, t.Config.IncrementalValStr, t.Config.Source.UpdateKey) } else { @@ -305,6 +332,12 @@ func (t *TaskExecution) ReadFromFile(cfg *Config) (df *iop.Dataflow, err error) IncrementalValue: cfg.IncrementalValStr, } + // limit when definition-only + if cfg.Mode == DefinitionOnlyMode { + fsCfg.SchemaOnly = true + fsCfg.Limit = iop.SampleSize + } + // format the uri if it has placeholders // determine uri if it has part fields, find first parent folder if t.Config.IsFileStreamWithStateAndParts() { @@ -418,6 +451,12 @@ func (t *TaskExecution) ReadFromApi(cfg *Config, srcConn *api.APIConnection) (df Range: g.PtrVal(t.Config.Source.Options.Range), DsConfigMap: t.getSourceOptionsMap(), } + + if cfg.Mode == DefinitionOnlyMode { + sCfg.SchemaOnly = true + sCfg.Limit = iop.SampleSize + } + df, err = srcConn.ReadDataflow(cfg.StreamName, sCfg) if err != nil { err = g.Error(err, "Could not ReadDataflow for %s", cfg.SrcConn.Type) diff --git a/core/sling/task_run_write.go b/core/sling/task_run_write.go index f348a50c6..665a12728 100644 --- a/core/sling/task_run_write.go +++ b/core/sling/task_run_write.go @@ -29,7 +29,8 @@ func (t *TaskExecution) WriteToFile(cfg *Config, df *iop.Dataflow) (cnt uint64, dateMap := iop.GetISO8601DateMap(time.Now()) cfg.TgtConn.Set(g.M("url", g.Rm(uri, dateMap))) - if len(df.Buffer) == 0 && !cast.ToBool(os.Getenv("SLING_ALLOW_EMPTY")) { + // Skip empty buffer check for definition-only mode (we intentionally have 0 rows) + if len(df.Buffer) == 0 && cfg.Mode != DefinitionOnlyMode && !cast.ToBool(os.Getenv("SLING_ALLOW_EMPTY")) { g.Warn("No data or records found in stream. Nothing to do. 
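A condensed sketch of the definition-only branch added to `WriteToDb`: drop any existing target table (a failure there is only logged), create the final table from the schema inferred from the sample buffer, and return before any rows are written. The interface and fake connection below are stand-ins for sling's `database.Connection`:

```go
package main

import "fmt"

// schemaWriter and fakeConn are stand-ins for sling's database.Connection.
type schemaWriter interface {
	DropTable(name string) error
	CreateTable(name string, columns []string) error
}

type fakeConn struct{ tables map[string][]string }

func (c *fakeConn) DropTable(name string) error {
	delete(c.tables, name)
	return nil
}

func (c *fakeConn) CreateTable(name string, columns []string) error {
	c.tables[name] = columns
	return nil
}

// writeDefinitionOnly captures the shape of the definition-only branch:
// drop any existing table, create the final table from the sampled columns,
// and return without inserting any rows.
func writeDefinitionOnly(conn schemaWriter, table string, columns []string) error {
	if err := conn.DropTable(table); err != nil {
		fmt.Println("could not drop existing table:", err)
	}
	return conn.CreateTable(table, columns)
}

func main() {
	conn := &fakeConn{tables: map[string][]string{}}
	_ = writeDefinitionOnly(conn, "public.users", []string{"id", "name", "_sling_synced_at"})
	fmt.Println(conn.tables) // map[public.users:[id name _sling_synced_at]]
}
```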
To allow Sling to create empty files, set SLING_ALLOW_EMPTY=TRUE") return } @@ -205,6 +206,8 @@ func (t *TaskExecution) WriteToDb(cfg *Config, df *iop.Dataflow, tgtConn databas if directInsert || writeDirectly { if g.In(cfg.Mode, IncrementalMode, BackfillMode) && len(cfg.Source.PrimaryKey()) > 0 { g.Warn("mode '%s' with a primary-key is not supported for direct write, falling back to using a temporary table.", cfg.Mode) + } else if cfg.Mode == DefinitionOnlyMode { + // continue as normal, since only definition } else { return t.writeToDbDirectly(cfg, df, tgtConn) } @@ -245,6 +248,34 @@ func (t *TaskExecution) WriteToDb(cfg *Config, df *iop.Dataflow, tgtConn databas return 0, err } + // Handle definition-only mode: create final table and exit without data + if cfg.Mode == DefinitionOnlyMode { + setStage("5 - prepare-final") + + // Set columns and keys on target table + targetTable.Columns = sampleData.Columns + if err := targetTable.SetKeys(cfg.Source.PrimaryKey(), cfg.Source.UpdateKey, cfg.Target.Options.TableKeys); err != nil { + err = g.Error(err, "could not set keys for "+targetTable.FullName()) + return 0, err + } + + // Drop existing table if it exists + if err := tgtConn.DropTable(targetTable.FullName()); err != nil { + g.Debug("could not drop existing table %s: %v", targetTable.FullName(), err) + } + + // Create the final table with inferred schema + if err := createTable(t, tgtConn, targetTable, sampleData, false); err != nil { + err = g.Error(err, "could not create table "+targetTable.FullName()) + return 0, err + } + + df.Close() + t.SetProgress("created table definition %s with %d columns", targetTable.FullName(), len(sampleData.Columns)) + setStage("6 - closing") + return 0, nil + } + // Set table keys tableTmp.Columns = sampleData.Columns if err := tableTmp.SetKeys(cfg.Source.PrimaryKey(), cfg.Source.UpdateKey, cfg.Target.Options.TableKeys); err != nil { diff --git a/core/sling/task_state.go b/core/sling/task_state.go index 0f75eda24..7e6d5cb0b 100644 --- a/core/sling/task_state.go +++ b/core/sling/task_state.go @@ -2,6 +2,7 @@ package sling import ( "os" + "path" "time" "github.com/flarco/g" @@ -106,7 +107,8 @@ func (dts *DateTimeState) Update() { type ExecutionState struct { ID string `json:"id"` - FilePath string `json:"string"` + FilePath string `json:"file_path"` + FileName string `json:"file_name"` TotalBytes uint64 `json:"total_bytes"` TotalRows uint64 `json:"total_rows"` Status StatusMap `json:"status"` @@ -161,6 +163,7 @@ type StreamState struct { FileExt string `json:"file_ext,omitempty"` FilePath string `json:"file_path,omitempty"` Name string `json:"name,omitempty"` + Description string `json:"description,omitempty"` Schema string `json:"schema,omitempty"` SchemaLower string `json:"schema_lower,omitempty"` SchemaUpper string `json:"schema_upper,omitempty"` @@ -168,6 +171,7 @@ type StreamState struct { TableLower string `json:"table_lower,omitempty"` TableUpper string `json:"table_upper,omitempty"` FullName string `json:"full_name,omitempty"` + ID string `json:"id,omitempty"` } type ObjectState struct { @@ -193,6 +197,7 @@ func (t *TaskExecution) StateSet() { } state.Execution.FilePath = t.Config.Env["SLING_CONFIG_PATH"] + state.Execution.FileName = path.Base(state.Execution.FilePath) fMap, _ := t.Config.GetFormatMap() @@ -210,11 +215,13 @@ func (t *TaskExecution) StateSet() { } } + run.ID = runID run.Stream.FileFolder = cast.ToString(fMap["stream_file_folder"]) run.Stream.FileName = cast.ToString(fMap["stream_file_name"]) run.Stream.FileExt = 
cast.ToString(fMap["stream_file_ext"]) run.Stream.FilePath = cast.ToString(fMap["stream_file_path"]) run.Stream.Name = cast.ToString(fMap["stream_name"]) + run.Stream.Description = cast.ToString(fMap["stream_description"]) run.Stream.FullName = cast.ToString(fMap["stream_full_name"]) run.Stream.Schema = cast.ToString(fMap["stream_schema"]) run.Stream.SchemaLower = cast.ToString(fMap["stream_schema_lower"]) @@ -222,6 +229,7 @@ func (t *TaskExecution) StateSet() { run.Stream.Table = cast.ToString(fMap["stream_table"]) run.Stream.TableLower = cast.ToString(fMap["stream_table_lower"]) run.Stream.TableUpper = cast.ToString(fMap["stream_table_upper"]) + run.Stream.ID = t.Config.StreamID() run.Object.Name = cast.ToString(fMap["object_name"]) run.Object.FullName = cast.ToString(fMap["object_full_name"]) diff --git a/core/version.go b/core/version.go index 121cb18dc..9c5fc4bec 100755 --- a/core/version.go +++ b/core/version.go @@ -22,7 +22,7 @@ var TelProps = g.M( func init() { // dev build version is in format => 1.2.2.dev/2024-08-20 parts := strings.Split(Version, "/") - if len(parts) != 2 { + if len(parts) != 2 || os.Getenv("SLING_AGENT_ID") != "" { return } diff --git a/go.mod b/go.mod index bc8bccbcd..afcde7928 100644 --- a/go.mod +++ b/go.mod @@ -76,7 +76,7 @@ require ( github.com/shirou/gopsutil/v3 v3.24.4 github.com/shopspring/decimal v1.4.0 github.com/sijms/go-ora/v2 v2.8.24 - github.com/slingdata-io/godbc v0.0.3 + github.com/slingdata-io/godbc v0.0.4 github.com/slingdata-io/sling v0.0.0-20240426022644-3c31b1eb088e github.com/snowflakedb/gosnowflake v1.17.1 github.com/spf13/cast v1.7.1 diff --git a/go.sum b/go.sum index 4b3ae28ce..2a2d0884a 100644 --- a/go.sum +++ b/go.sum @@ -341,6 +341,8 @@ github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/fergusstrange/embedded-postgres v1.31.0 h1:JmRxw2BcPRcU141nOEuGXbIU6jsh437cBB40rmftZSk= +github.com/fergusstrange/embedded-postgres v1.31.0/go.mod h1:w0YvnCgf19o6tskInrOOACtnqfVlOvluz3hlNLY7tRk= github.com/flarco/bigquery v0.0.9 h1:WfxO6XuuHZTJV+55Bq24FhdHYpmAOzgVk9xOcJpEecY= github.com/flarco/bigquery v0.0.9/go.mod h1:IpRSw4quaXxHjFyDSXUo7B6v+XcNF2pSmnNfeqXa/gM= github.com/flarco/databricks-sql-go v0.0.0-20250613120556-51f7c1f3b4ad h1:z5mgsXmNXsgskClg/s6zelILFihJTyK6x7+zX1jUgyU= @@ -944,8 +946,8 @@ github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966 h1:JIAuq3EE github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog= github.com/slingdata-io/arrow-adbc/go/adbc v0.0.0-20260105180115-72bb86fc9587 h1:IRiAvb/AGt2qii99UwF8QvhBAs8H+/Udu2GgVWFg3ys= github.com/slingdata-io/arrow-adbc/go/adbc v0.0.0-20260105180115-72bb86fc9587/go.mod h1:FxeVw2tiKoUgPdOOXt4pwi2BuuQ17Cop7vnJTQL/z2s= -github.com/slingdata-io/godbc v0.0.3 h1:yj+Z/E6Hud2a/XKNIxhx3qggHSHpt70WfesgkWn6rj8= -github.com/slingdata-io/godbc v0.0.3/go.mod h1:I7r2EZl10tyzeXMq4WLwGyLqD0ZlEtJXOsOSbAl+iLI= +github.com/slingdata-io/godbc v0.0.4 h1:W39fdWNpBms6PItrSEfocedK1SaGYnofkeXW/xRXsYA= +github.com/slingdata-io/godbc v0.0.4/go.mod h1:I7r2EZl10tyzeXMq4WLwGyLqD0ZlEtJXOsOSbAl+iLI= github.com/slingdata-io/pocketbase v0.22.136 h1:RtAvPvYdK0qm9EB1r8GzNeEfSiqDK+tV8jyxwbpKKBA= github.com/slingdata-io/pocketbase v0.22.136/go.mod 
h1:RYAdoMZtW+3OIgKqg+YhgWGIiwjtcBHGxRcVF2+1klA= github.com/snowflakedb/gosnowflake v1.17.1 h1:sBYExPDRv6hHF7fCqeXMT745L326Byw/cROxvCiEJzo= @@ -1069,6 +1071,8 @@ github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17 github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= +github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo= +github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos= github.com/xo/dburl v0.3.0 h1:KGkeJB/oQhY/DeeJoYl/1+pNE/JnF6ouAuA8nzpQEQ8= github.com/xo/dburl v0.3.0/go.mod h1:TM8VMBT+LWqC3MBOulZjb8FAthcvZq0t/qvDLwS6skU= github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778/go.mod h1:2MuV+tbUrU1zIOPMxZ5EncGwgmMJsa+9ucAQZXxsObs= diff --git a/justfile b/justfile index 17037057b..020e73911 100644 --- a/justfile +++ b/justfile @@ -30,16 +30,16 @@ test-replication-defaults: cd cmd/sling && go test -v -run 'TestReplicationDefaults' && cd - # Test file connections -test-connections-file arg1="TestSuiteFile": +test-connections-file arg1="TestSuiteFile" arg2="" arg3="": #!/usr/bin/env bash - echo "TESTING file connections {{arg1}}" - cd cmd/sling && go test -v -parallel 3 -run "{{arg1}}" && cd - + echo "TESTING file connections {{arg1}} {{arg2}}" + cd cmd/sling && go test -v -parallel 3 -run "{{arg1}}" -- "{{arg2}}" "{{arg3}}" && cd - # Test database connections -test-connections-database arg1="TestSuiteDatabase" arg2="": +test-connections-database arg1="TestSuiteDatabase" arg2="" arg3="": #!/usr/bin/env bash echo "TESTING database connections {{arg1}} {{arg2}}" - cd cmd/sling && SKIP_CLICKHOUSE=TRUE RUN_ALL=TRUE go test -v -parallel 4 -timeout 35m -run "{{arg1}}" -- "{{arg2}}" && cd - + cd cmd/sling && SKIP_CLICKHOUSE=TRUE RUN_ALL=TRUE go test -v -parallel 4 -timeout 35m -run "{{arg1}}" -- "{{arg2}}" "{{arg3}}" && cd - # Test core (sling core functionality) test-core: @@ -68,7 +68,7 @@ test-dbio-iop: test-dbio-database: #!/usr/bin/env bash echo "TESTING dbio database" - cd core/dbio/database && go test -v -run 'TestParseTableName|TestRegexMatch|TestParseColumnName|TestParseSQLMultiStatements|TestTrimSQLComments' && cd - + cd core/dbio/database && go test -v -run 'TestParseTableName|TestRegexMatch|TestParseColumnName|TestParseSQLMultiStatements|TestTrimSQLComments|TestAddPrimaryKeyToDDL' && cd - cd core/dbio/database && go test -run TestChunkByColumnRange && cd - # Test dbio filesys