diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index 0fdd46d27..a204f8b4e 100755
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -33,7 +33,7 @@ jobs:
- name: Set up GoLang
uses: actions/setup-go@v3
with:
- go-version: "1.24"
+ go-version: "1.25"
cache: false
- name: Load Secrets
@@ -88,7 +88,7 @@ jobs:
- name: Set up GoLang
uses: actions/setup-go@v3
with:
- go-version: "1.24"
+ go-version: "1.25"
cache: false
- name: Load Secrets
@@ -137,7 +137,7 @@ jobs:
- name: Set up GoLang
uses: actions/setup-go@v3
with:
- go-version: "1.24"
+ go-version: "1.25"
cache: false
- name: Load Secrets
@@ -199,7 +199,7 @@ jobs:
- name: Set up GoLang
uses: actions/setup-go@v3
with:
- go-version: "1.24"
+ go-version: "1.25"
cache: false
- name: Load Secrets
diff --git a/.github/workflows/test-docker-build.yml b/.github/workflows/test-docker-build.yml
index 62cb4a00b..95b59f1eb 100644
--- a/.github/workflows/test-docker-build.yml
+++ b/.github/workflows/test-docker-build.yml
@@ -27,7 +27,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v3
with:
- go-version: "1.24"
+ go-version: "1.25"
cache: false
- name: Load Secrets
@@ -65,7 +65,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v3
with:
- go-version: "1.24"
+ go-version: "1.25"
cache: true
- name: Load Secrets
@@ -115,7 +115,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v3
with:
- go-version: "1.24"
+ go-version: "1.25"
cache: false
- name: Load Secrets
@@ -158,7 +158,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v3
with:
- go-version: "1.24"
+ go-version: "1.25"
cache: false
- name: Load Secrets
diff --git a/cmd/sling/DEBUG.md b/cmd/sling/DEBUG.md
new file mode 100644
index 000000000..06d101dd9
--- /dev/null
+++ b/cmd/sling/DEBUG.md
@@ -0,0 +1,13 @@
+# Debug
+
+Follow these general steps to debug an issue:
+- The first step is to reproduce the issue, so build the binary first (as instructed below).
+- Fetch https://f.slingdata.io/llms.txt and read it to understand sling.
+- Create a temporary replication to run with `./sling run --debug -r`
+- Use `./sling conns test` to test connectivity and `./sling conns exec` to execute any necessary queries.
+- Confirm that the issue is happening. If the issue is not observed, STOP and mention this to the user.
+- If the issue is confirmed, make changes to the respective files in the repo, rebuild the binary, and re-run the temporary replication. Confirm that the issue is fixed; if not, continue to iterate.
+
+## Building the binary
+- cd into the `cmd/sling` directory (relative to the repo root)
+- run `go build .` to build the `sling` binary in that folder.
\ No newline at end of file
diff --git a/cmd/sling/Dockerfile b/cmd/sling/Dockerfile
index e518710d6..bb309bf8c 100755
--- a/cmd/sling/Dockerfile
+++ b/cmd/sling/Dockerfile
@@ -2,7 +2,7 @@ FROM ubuntu:jammy
RUN groupadd -r sling && useradd -r -g sling sling
-RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y unzip libaio1 postgresql-client wget curl gnupg2 && \
+RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y unzip libaio1 postgresql-client wget curl gnupg2 unixodbc odbcinst && \
apt-get clean && \
rm -rf /var/lib/apt/lists /var/cache/apt
diff --git a/cmd/sling/Dockerfile.arm64 b/cmd/sling/Dockerfile.arm64
index b4cab186b..6dd3c0386 100755
--- a/cmd/sling/Dockerfile.arm64
+++ b/cmd/sling/Dockerfile.arm64
@@ -2,7 +2,7 @@ FROM --platform=linux/arm64 ubuntu:jammy
RUN groupadd -r sling && useradd -r -g sling sling
-RUN apt update || true && DEBIAN_FRONTEND=noninteractive apt install -y libaio1 postgresql-client wget curl && apt clean && rm -rf /var/lib/apt/lists /var/cache/apt
+RUN apt update || true && DEBIAN_FRONTEND=noninteractive apt install -y libaio1 postgresql-client wget curl unixodbc odbcinst && apt clean && rm -rf /var/lib/apt/lists /var/cache/apt
# Install Oracle Instant Client
RUN cd /tmp && \
diff --git a/cmd/sling/sling_run.go b/cmd/sling/sling_run.go
index 04e87923f..715640621 100755
--- a/cmd/sling/sling_run.go
+++ b/cmd/sling/sling_run.go
@@ -248,11 +248,10 @@ runReplication:
defer connection.CloseAll()
if !env.IsThreadChild {
- text := "Sling CLI | https://slingdata.io"
if env.NoColor {
- g.Info(text)
+ g.Info(env.Marker)
} else {
- g.Info(env.CyanString(text))
+ g.Info(env.CyanString(env.Marker))
}
// check for update, and print note
diff --git a/cmd/sling/sling_test.go b/cmd/sling/sling_test.go
index 7beffcb5f..49f336050 100755
--- a/cmd/sling/sling_test.go
+++ b/cmd/sling/sling_test.go
@@ -113,6 +113,7 @@ var connMap = map[dbio.Type]connTest{
dbio.TypeFileGoogleDrive: {name: "google_drive"},
dbio.TypeFileFtp: {name: "ftp_test_url"},
dbio.TypeFileAzureABFS: {name: "fabric_lake"},
+ dbio.Type("db2"): {name: "db2", adjustCol: g.Bool(false)},
}
func init() {
@@ -782,7 +783,9 @@ func runOneTask(t *testing.T, file g.FileItem, connType dbio.Type) {
// skip those
if g.In(srcType, dbio.TypeDbMongoDB, dbio.TypeDbAzureTable) ||
g.In(tgtType, dbio.TypeDbMongoDB, dbio.TypeDbAzureTable) ||
- taskCfg.TgtConn.IsADBC() || taskCfg.SrcConn.IsADBC() {
+ taskCfg.TgtConn.IsADBC() || taskCfg.SrcConn.IsADBC() ||
+ taskCfg.TgtConn.Type == dbio.TypeDbODBC ||
+ taskCfg.SrcConn.Type == dbio.TypeDbODBC {
continue
}
@@ -1077,6 +1080,7 @@ func TestSuiteDatabaseExasol(t *testing.T) {
}
func TestSuiteDatabaseDatabricks(t *testing.T) {
+ t.Skip()
t.Parallel()
// test 06 => BAD_REQUEST: Parameterized query has too many parameters: 1812 parameters were given but the limit is 256.
@@ -1096,6 +1100,10 @@ func TestSuiteDatabaseIceberg(t *testing.T) {
// testSuite(t, dbio.TypeDbIceberg, "1-4,6-12")
}
+func TestSuiteDatabaseDB2(t *testing.T) {
+ testSuite(t, dbio.Type("db2"), "1-5,7+")
+}
+
func TestSuiteDatabaseSQLServer(t *testing.T) {
t.Parallel()
testSuite(t, dbio.TypeDbSQLServer)
@@ -1111,7 +1119,7 @@ func TestSuiteDatabaseSQLServer(t *testing.T) {
}
func TestSuiteDatabaseFabric(t *testing.T) {
- // t.Skip()
+ t.Skip()
t.Parallel()
testSuite(t, dbio.TypeDbFabric)
}
@@ -1743,7 +1751,7 @@ func TestSuiteFileFtp(t *testing.T) {
}
func TestSuiteFileAzureABFS(t *testing.T) {
- // t.Skip()
+ t.Skip()
t.Parallel()
testSuite(t, dbio.TypeFileAzureABFS)
}
diff --git a/cmd/sling/tests/replications/r.85.mssql_postgres_synced_at.yaml b/cmd/sling/tests/replications/r.85.mssql_postgres_synced_at.yaml
new file mode 100644
index 000000000..46eb23a50
--- /dev/null
+++ b/cmd/sling/tests/replications/r.85.mssql_postgres_synced_at.yaml
@@ -0,0 +1,222 @@
+source: MSSQL
+target: POSTGRES
+
+env:
+ SLING_SYNCED_AT_COLUMN: true
+
+hooks:
+ start:
+ # Create table 1 with 10 rows
+ - type: query
+ connection: '{source.name}'
+ query: |
+ IF OBJECT_ID('dbo.synced_at_test1', 'U') IS NOT NULL DROP TABLE dbo.synced_at_test1;
+ CREATE TABLE dbo.synced_at_test1 (
+ id INT PRIMARY KEY,
+ name NVARCHAR(100),
+ value DECIMAL(10,2),
+ modified_at DATETIME DEFAULT GETDATE()
+ );
+ INSERT INTO dbo.synced_at_test1 (id, name, value) VALUES
+ (1, 'Row 1', 100.00), (2, 'Row 2', 200.00), (3, 'Row 3', 300.00),
+ (4, 'Row 4', 400.00), (5, 'Row 5', 500.00), (6, 'Row 6', 600.00),
+ (7, 'Row 7', 700.00), (8, 'Row 8', 800.00), (9, 'Row 9', 90.00),
+ (10, 'Row 10', 1000.00);
+
+ # Create table 2 with 8 rows (IDs 1-8), IDs 9 and 10 will be missing for soft delete test
+ - type: query
+ connection: '{source.name}'
+ query: |
+ IF OBJECT_ID('dbo.synced_at_test2', 'U') IS NOT NULL DROP TABLE dbo.synced_at_test2;
+ CREATE TABLE dbo.synced_at_test2 (
+ id INT PRIMARY KEY,
+ name NVARCHAR(100),
+ value DECIMAL(10,2),
+ modified_at DATETIME DEFAULT GETDATE()
+ );
+ INSERT INTO dbo.synced_at_test2 (id, name, value) VALUES
+ (1, 'Item 1', 10.00), (2, 'Item 2', 20.00), (3, 'Item 3', 30.00),
+ (4, 'Item 4', 40.00), (5, 'Item 5', 50.00), (6, 'Item 6', 60.00),
+ (7, 'Item 7', 70.00), (8, 'Item 8', 80.00);
+
+ end:
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify _sling_synced_at EXISTS for table1
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(*) as col_exists
+ FROM information_schema.columns
+ WHERE table_schema = 'public'
+ AND table_name = 'synced_at_test1'
+ AND column_name = '_sling_synced_at'
+ into: synced_at_exists_t1
+
+ - type: check
+ check: int_parse(store.synced_at_exists_t1[0].col_exists) == 1
+ failure_message: "_sling_synced_at column missing in synced_at_test1"
+
+ - type: log
+ message: "SUCCESS: _sling_synced_at column exists in synced_at_test1"
+
+ # Verify _sling_deleted_at does NOT EXIST for table1
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(*) as col_exists
+ FROM information_schema.columns
+ WHERE table_schema = 'public'
+ AND table_name = 'synced_at_test1'
+ AND column_name = '_sling_deleted_at'
+ into: deleted_at_exists_t1
+
+ - type: check
+ check: int_parse(store.deleted_at_exists_t1[0].col_exists) == 0
+ failure_message: "_sling_deleted_at column should NOT exist in synced_at_test1"
+
+ - type: log
+ message: "SUCCESS: _sling_deleted_at column does NOT exist in synced_at_test1"
+
+ # Verify data type is timestamp
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT data_type
+ FROM information_schema.columns
+ WHERE table_schema = 'public'
+ AND table_name = 'synced_at_test1'
+ AND column_name = '_sling_synced_at'
+ into: synced_at_type
+
+ - type: check
+ check: contains(store.synced_at_type[0].data_type, "timestamp")
+ failure_message: "_sling_synced_at should be timestamp type, got {store.synced_at_type[0].data_type}"
+
+ # Verify row counts for table1
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as count FROM public.synced_at_test1
+ into: t1_count
+
+ - type: check
+ check: int_parse(store.t1_count[0].count) == 10
+ failure_message: "Expected 10 rows in synced_at_test1, got {store.t1_count[0].count}"
+
+ # Verify synced_at_test1 has different _sling_synced_at values (rows 9,10 should have later timestamp)
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(DISTINCT _sling_synced_at) as distinct_synced_at
+ FROM public.synced_at_test1
+ into: distinct_synced_at
+
+ - type: check
+ check: int_parse(store.distinct_synced_at[0].distinct_synced_at) == 2
+ failure_message: "Expected 2 distinct _sling_synced_at values (IDs 1-8 vs 9-10), got {store.distinct_synced_at[0].distinct_synced_at}"
+
+ - type: log
+ message: "SUCCESS: synced_at_test1 has 2 distinct _sling_synced_at values"
+
+ # Verify _sling_synced_op column EXISTS
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(*) as col_exists
+ FROM information_schema.columns
+ WHERE table_schema = 'public'
+ AND table_name = 'synced_at_test1'
+ AND column_name = '_sling_synced_op'
+ into: synced_op_exists
+
+ - type: check
+ check: int_parse(store.synced_op_exists[0].col_exists) == 1
+ failure_message: "_sling_synced_op column missing"
+
+ - type: log
+ message: "SUCCESS: _sling_synced_op column exists"
+
+ # Verify soft-deleted rows (IDs 9-10) have _sling_synced_op = 'D'
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(*) as deleted_count
+ FROM public.synced_at_test1
+ WHERE id IN (9, 10) AND _sling_synced_op = 'D'
+ into: deleted_op_count
+
+ - type: check
+ check: int_parse(store.deleted_op_count[0].deleted_count) == 2
+ failure_message: "Expected 2 rows with _sling_synced_op='D' for IDs 9,10, got {store.deleted_op_count[0].deleted_count}"
+
+ - type: log
+ message: "SUCCESS: IDs 9,10 have _sling_synced_op='D' (soft deleted)"
+
+ # Verify updated rows (IDs 1-8) have _sling_synced_op = 'U'
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(*) as updated_count
+ FROM public.synced_at_test1
+ WHERE id BETWEEN 1 AND 8 AND _sling_synced_op = 'U'
+ into: updated_op_count
+
+ - type: check
+ check: int_parse(store.updated_op_count[0].updated_count) == 8
+ failure_message: "Expected 8 rows with _sling_synced_op='U' for IDs 1-8, got {store.updated_op_count[0].updated_count}"
+
+ - type: log
+ message: "SUCCESS: IDs 1-8 have _sling_synced_op='U' (updated)"
+
+ # Cleanup
+ - type: query
+ connection: '{source.name}'
+ query: |
+ IF OBJECT_ID('dbo.synced_at_test1', 'U') IS NOT NULL DROP TABLE dbo.synced_at_test1;
+ IF OBJECT_ID('dbo.synced_at_test2', 'U') IS NOT NULL DROP TABLE dbo.synced_at_test2;
+
+ - type: query
+ connection: '{target.name}'
+ query: |
+ DROP TABLE IF EXISTS public.synced_at_test1 CASCADE;
+
+streams:
+ dbo.synced_at_test1:
+ object: public.synced_at_test1
+ mode: full-refresh
+ primary_key: [id]
+ target_options:
+ column_casing: lower
+
+ hooks:
+ post:
+ # Verify all rows have _sling_synced_op = 'I' after full-refresh insert
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(*) as insert_count
+ FROM public.synced_at_test1
+ WHERE _sling_synced_op = 'I'
+ into: insert_op_count
+
+ - type: check
+ check: int_parse(store.insert_op_count[0].insert_count) == 10
+ failure_message: "Expected 10 rows with _sling_synced_op='I' after full-refresh, got {store.insert_op_count[0].insert_count}"
+
+ - type: log
+ message: "SUCCESS: All 10 rows have _sling_synced_op='I' after full-refresh"
+
+ # Sleep 2 seconds to ensure different _sling_synced_at timestamps
+ - type: query
+ connection: '{target.name}'
+ query: SELECT pg_sleep(2)
+
+ dbo.synced_at_test2:
+ object: public.synced_at_test1
+ mode: incremental
+ primary_key: [id]
+ target_options:
+ delete_missing: soft
+ column_casing: lower
diff --git a/cmd/sling/tests/replications/r.86.record_key_casing.yaml b/cmd/sling/tests/replications/r.86.record_key_casing.yaml
new file mode 100644
index 000000000..edf5da4e0
--- /dev/null
+++ b/cmd/sling/tests/replications/r.86.record_key_casing.yaml
@@ -0,0 +1,77 @@
+# Test for mixed-case record key references in transform expressions
+
+source: mysql
+target: local
+
+hooks:
+ start:
+ # Create test table with mixed-case column names (MySQL uses backticks for identifiers)
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS mysql.test_record_key_casing;
+ CREATE TABLE mysql.test_record_key_casing (
+ id INTEGER,
+ `DateAdded` TIMESTAMP NULL,
+ `LastChanged` TIMESTAMP NULL,
+ `_sling_deleted_at` TIMESTAMP NULL,
+ value TEXT
+ );
+ INSERT INTO mysql.test_record_key_casing (id, `DateAdded`, `LastChanged`, `_sling_deleted_at`, value)
+ VALUES
+ (1, '2024-01-15 10:00:00', '2024-01-20 15:30:00', NULL, 'row1'),
+ (2, '2024-02-10 08:00:00', '2024-02-25 12:00:00', '2024-03-01 09:00:00', NULL),
+ (3, '2024-03-05 09:00:00', '2024-03-10 18:00:00', NULL, 'row3');
+
+ - type: command
+ command: mkdir -p '{env.output_dir}'
+
+ end:
+ # Check if errored, do not proceed
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify output parquet file exists and has data
+ - type: query
+ connection: duckdb
+ query: "SELECT * FROM read_parquet('{env.output_dir}/output.parquet')"
+ into: result
+
+ - type: log
+ message: |
+ Parquet output: {store.result}
+
+ # Verify true_changed_at column was computed correctly
+ - type: check
+ check: length(store.result) == 3
+ success_message: "SUCCESS: All 3 rows exported successfully with mixed-case column transform"
+
+ # Verify the computed column has correct values (should be the greatest of the three timestamps)
+ - type: check
+ check: store.result[0].true_changed_at != nil
+ success_message: "SUCCESS: true_changed_at column was computed correctly"
+
+ # Cleanup
+ - type: query
+ connection: '{source.name}'
+ query: DROP TABLE IF EXISTS mysql.test_record_key_casing;
+
+ - type: command
+ command: rm -rf '{env.output_dir}'
+
+streams:
+ mysql.test_record_key_casing:
+ object: "file://{output_dir}/output.parquet"
+ mode: full-refresh
+ target_options:
+ format: parquet
+ transforms:
+ # IMPORTANT: Record keys are normalized to lowercase internally
+ # Even though the column is named "DateAdded", use record.dateadded (lowercase)
+ # This reproduces the customer's exact expression pattern with _sling_deleted_at
+ - true_changed_at: >
+ greatest(record.dateadded, record.lastchanged, record._sling_deleted_at)
+
+env:
+ output_dir: temp/test_parquet
\ No newline at end of file
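
The lowercase record-key note in r.86 is the crux of that test: the transform references `record.dateadded` even though the source column is `DateAdded`. A minimal Go sketch of that assumption follows; the map construction here is purely illustrative and is not sling's actual transform code.

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Illustrative only: assume record keys are lowercased before the
	// transform expression is evaluated, so "DateAdded" is reachable
	// only as record.dateadded.
	columns := []string{"id", "DateAdded", "LastChanged", "_sling_deleted_at", "value"}
	row := []any{1, "2024-01-15 10:00:00", "2024-01-20 15:30:00", nil, "row1"}

	record := map[string]any{}
	for i, name := range columns {
		record[strings.ToLower(name)] = row[i]
	}

	fmt.Println(record["dateadded"]) // 2024-01-15 10:00:00
	_, ok := record["DateAdded"]
	fmt.Println(ok) // false: the mixed-case key does not exist
}
```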
diff --git a/cmd/sling/tests/replications/r.87.record_key_casing_bigquery.yaml b/cmd/sling/tests/replications/r.87.record_key_casing_bigquery.yaml
new file mode 100644
index 000000000..7d8449f73
--- /dev/null
+++ b/cmd/sling/tests/replications/r.87.record_key_casing_bigquery.yaml
@@ -0,0 +1,81 @@
+# Test for mixed-case record key references in transforms (MySQL to BigQuery)
+# Uses the same source table created by r.86 test
+
+source: mysql
+target: bigquery
+
+hooks:
+ start:
+ # Create test table with mixed-case column names (MySQL uses backticks for identifiers)
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS mysql.test_record_key_casing;
+ CREATE TABLE mysql.test_record_key_casing (
+ `id` int unsigned NOT NULL,
+ `DateAdded` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+ `LastChanged` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+ `_sling_deleted_at` TIMESTAMP NULL,
+ value TEXT
+ );
+ INSERT INTO mysql.test_record_key_casing (id, `DateAdded`, `LastChanged`, `_sling_deleted_at`, value)
+ VALUES
+ (1, '2024-01-15 10:00:00', '2024-01-20 15:30:00', NULL, 'row1'),
+ (2, '2024-02-10 08:00:00', '2024-02-25 12:00:00', '2024-03-01 09:00:00', NULL),
+ (3, '2024-03-05 09:00:00', '2024-03-10 18:00:00', NULL, 'row3');
+
+ end:
+ # Check if errored, do not proceed
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify BigQuery table has data
+ - type: query
+ connection: '{target.name}'
+ query: "SELECT * FROM public.test_record_key_casing ORDER BY id"
+ into: result
+
+ - type: log
+ message: |
+ BigQuery output: {store.result}
+
+ # Verify row count
+ - type: check
+ check: length(store.result) == 3
+ success_message: "SUCCESS: All 3 rows exported successfully with mixed-case column transform (BigQuery)"
+
+ # Verify the computed column has correct values
+ - type: check
+ check: store.result[0].true_changed_at != nil
+ success_message: "SUCCESS: true_changed_at column was computed correctly (BigQuery)"
+
+ # Cleanup BigQuery table
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS public.test_record_key_casing
+
+ # Cleanup MySQL table
+ - type: query
+ connection: '{source.name}'
+ query: DROP TABLE IF EXISTS mysql.test_record_key_casing
+
+streams:
+ mysql.test_record_key_casing:
+ object: public.test_record_key_casing
+ mode: full-refresh
+ target_options:
+ direct_insert: true
+
+ transforms:
+ # - true_changed_at: >
+ # greatest(
+ # date_parse(record.dateadded),
+ # date_parse(record.lastchanged),
+ # date_parse(record._sling_deleted_at)
+ # )
+ - true_changed_at: >
+ greatest(
+ date_parse(replace(record.dateadded, "\"", "")),
+ date_parse(replace(record.lastchanged, "\"", ""))
+ )
diff --git a/cmd/sling/tests/replications/r.88.table_ddl_with_clause.yaml b/cmd/sling/tests/replications/r.88.table_ddl_with_clause.yaml
new file mode 100644
index 000000000..a4c94270b
--- /dev/null
+++ b/cmd/sling/tests/replications/r.88.table_ddl_with_clause.yaml
@@ -0,0 +1,169 @@
+# Test for GitHub Issue #694
+# Verifies that custom table_ddl with a WITH clause works correctly.
+# The user scenario: specifying table_ddl with PRIMARY KEY and WITH clause.
+#
+# Before the fix, this would generate invalid SQL like:
+# CREATE TABLE t (col1 int, col2 int) WITH (FILLFACTOR=90, PRIMARY KEY (col1, col2))
+#
+# After the fix, the DDL is valid:
+# CREATE TABLE t (col1 int, col2 int, PRIMARY KEY (col1, col2)) WITH (FILLFACTOR=90)
+#
+# Note: This test uses user-defined PRIMARY KEY in table_ddl since sling doesn't
+# auto-add PKs for most databases (only StarRocks).
+
+source: mssql
+target: mssql
+
+defaults:
+ mode: full-refresh
+
+hooks:
+ start:
+ - type: query
+ connection: '{source.name}'
+ query: |
+ IF OBJECT_ID('dbo.test_ddl_pk_src_88', 'U') IS NOT NULL
+ DROP TABLE dbo.test_ddl_pk_src_88;
+ CREATE TABLE dbo.test_ddl_pk_src_88 (
+ col1 nvarchar(10),
+ col2 nvarchar(6)
+ );
+ INSERT INTO dbo.test_ddl_pk_src_88 (col1, col2) VALUES ('val1', 'val2');
+
+ - type: query
+ connection: '{target.name}'
+ query: |
+ IF OBJECT_ID('dbo.test_ddl_pk_tgt_88', 'U') IS NOT NULL
+ DROP TABLE dbo.test_ddl_pk_tgt_88;
+ IF OBJECT_ID('dbo.test_ddl_pk_tgt_88_with', 'U') IS NOT NULL
+ DROP TABLE dbo.test_ddl_pk_tgt_88_with;
+
+ end:
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify the table was created and has a primary key
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT
+ c.name AS column_name
+ FROM sys.indexes i
+ JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
+ JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
+ WHERE i.is_primary_key = 1
+ AND i.object_id = OBJECT_ID('dbo.test_ddl_pk_tgt_88')
+ ORDER BY ic.key_ordinal
+ into: pk_columns
+
+ - type: log
+ message: "Primary key columns: {store.pk_columns}"
+
+ # Verify both columns are in the primary key
+ - type: check
+ check: length(store.pk_columns) == 2
+ on_failure: error
+ message: "Expected 2 primary key columns"
+
+ - type: check
+ check: store.pk_columns[0].column_name == "col1"
+ on_failure: error
+ message: "First PK column should be col1"
+
+ - type: check
+ check: store.pk_columns[1].column_name == "col2"
+ on_failure: error
+ message: "Second PK column should be col2"
+
+ # Verify Stream 2: table with WITH clause has primary key
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT
+ c.name AS column_name
+ FROM sys.indexes i
+ JOIN sys.index_columns ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id
+ JOIN sys.columns c ON ic.object_id = c.object_id AND ic.column_id = c.column_id
+ WHERE i.is_primary_key = 1
+ AND i.object_id = OBJECT_ID('dbo.test_ddl_pk_tgt_88_with')
+ ORDER BY ic.key_ordinal
+ into: pk_columns_with
+
+ - type: log
+ message: "Primary key columns (WITH clause table): {store.pk_columns_with}"
+
+ - type: check
+ check: length(store.pk_columns_with) == 2
+ on_failure: error
+ message: "Expected 2 primary key columns in WITH clause table"
+
+ - type: check
+ check: store.pk_columns_with[0].column_name == "col1"
+ on_failure: error
+ message: "First PK column in WITH clause table should be col1"
+
+ - type: check
+ check: store.pk_columns_with[1].column_name == "col2"
+ on_failure: error
+ message: "Second PK column in WITH clause table should be col2"
+
+ # Verify Stream 2: table has DATA_COMPRESSION enabled
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT data_compression_desc
+ FROM sys.partitions
+ WHERE object_id = OBJECT_ID('dbo.test_ddl_pk_tgt_88_with')
+ AND index_id <= 1
+ into: compression_info
+
+ - type: log
+ message: "Compression info: {store.compression_info}"
+
+ - type: check
+ check: store.compression_info[0].data_compression_desc == "PAGE"
+ on_failure: error
+ message: "Table should have PAGE compression enabled"
+
+ # Cleanup
+ - type: query
+ connection: '{source.name}'
+ query: |
+ IF OBJECT_ID('dbo.test_ddl_pk_src_88', 'U') IS NOT NULL
+ DROP TABLE dbo.test_ddl_pk_src_88;
+
+ - type: query
+ connection: '{target.name}'
+ query: |
+ IF OBJECT_ID('dbo.test_ddl_pk_tgt_88', 'U') IS NOT NULL
+ DROP TABLE dbo.test_ddl_pk_tgt_88;
+ IF OBJECT_ID('dbo.test_ddl_pk_tgt_88_with', 'U') IS NOT NULL
+ DROP TABLE dbo.test_ddl_pk_tgt_88_with;
+
+streams:
+ # Stream 1: Test custom table_ddl with PRIMARY KEY constraint
+ dbo.test_ddl_pk_src_88:
+ object: dbo.test_ddl_pk_tgt_88
+ target_options:
+ # User-defined table_ddl with PRIMARY KEY constraint
+ table_ddl: |
+ CREATE TABLE {table} (
+ {col_types},
+ PRIMARY KEY ("col1", "col2")
+ )
+
+ # Stream 2: Test custom table_ddl with WITH clause (GitHub #694 scenario)
+ # This tests that table options after ({col_types}) don't break the DDL
+ stream_with_clause:
+ sql: select * from dbo.test_ddl_pk_src_88
+ object: dbo.test_ddl_pk_tgt_88_with
+ target_options:
+ # User-defined table_ddl with PRIMARY KEY and WITH clause
+ # Before fix: PK would incorrectly be placed inside WITH clause
+ # After fix: PK stays inside column definitions, WITH clause follows
+ table_ddl: |
+ CREATE TABLE {table} (
+ {col_types},
+ PRIMARY KEY ("col1", "col2")
+ ) WITH (DATA_COMPRESSION = PAGE)
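
The before/after DDL in the r.88 header captures the whole fix: an added PRIMARY KEY clause must land inside the column list, not after the trailing `WITH (...)` table options. The sketch below shows one way that placement can be done by matching the parenthesis that closes the column list; the function name and logic are a hedged illustration, not sling's actual implementation.

```go
package main

import (
	"fmt"
	"strings"
)

// addPrimaryKey inserts a PRIMARY KEY clause just before the parenthesis
// that closes the column list, leaving any trailing table options
// (e.g. WITH (FILLFACTOR=90)) untouched.
func addPrimaryKey(ddl, pkCols string) string {
	open := strings.Index(ddl, "(")
	if open == -1 {
		return ddl
	}
	depth := 0
	for i := open; i < len(ddl); i++ {
		switch ddl[i] {
		case '(':
			depth++
		case ')':
			depth--
			if depth == 0 { // closing paren of the column list
				return ddl[:i] + ", PRIMARY KEY (" + pkCols + ")" + ddl[i:]
			}
		}
	}
	return ddl
}

func main() {
	ddl := "CREATE TABLE t (col1 int, col2 int) WITH (FILLFACTOR=90)"
	fmt.Println(addPrimaryKey(ddl, "col1, col2"))
	// CREATE TABLE t (col1 int, col2 int, PRIMARY KEY (col1, col2)) WITH (FILLFACTOR=90)
}
```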
diff --git a/cmd/sling/tests/replications/r.89.definition_only_db.yaml b/cmd/sling/tests/replications/r.89.definition_only_db.yaml
new file mode 100644
index 000000000..02940dd01
--- /dev/null
+++ b/cmd/sling/tests/replications/r.89.definition_only_db.yaml
@@ -0,0 +1,107 @@
+source: postgres
+target: mssql
+
+defaults:
+ mode: definition-only
+
+hooks:
+ start:
+ # Create source table with various column types
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_definition_only_db;
+ CREATE TABLE public.test_definition_only_db (
+ id bigint,
+ name varchar(100),
+ created_at timestamp,
+ amount decimal(12,2),
+ is_active boolean
+ );
+ INSERT INTO public.test_definition_only_db VALUES
+ (1, 'test1', now(), 123.45, true),
+ (2, 'test2', now(), 456.78, false);
+
+ end:
+ # Check that execution succeeded
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify table was created in target
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES
+ WHERE TABLE_SCHEMA = 'dbo' AND TABLE_NAME = 'test_definition_only_db'
+ into: table_check
+
+ - type: check
+ check: length(store.table_check) == 1
+ failure_message: "Expected table to be created but it doesn't exist"
+
+ # Verify table has 0 rows (definition-only should not copy data)
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM dbo.test_definition_only_db
+ into: row_count
+
+ - type: check
+ check: int_parse(store.row_count[0].cnt) == 0
+ failure_message: "Expected 0 rows but found {store.row_count[0].cnt}"
+
+ # Get column information from INFORMATION_SCHEMA
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COLUMN_NAME, DATA_TYPE
+ FROM INFORMATION_SCHEMA.COLUMNS
+ WHERE TABLE_SCHEMA = 'dbo' AND TABLE_NAME = 'test_definition_only_db'
+ ORDER BY ORDINAL_POSITION
+ into: schema_info
+
+ - type: log
+ message: "Schema info: {store.schema_info}"
+
+ # Verify we have 5 columns
+ - type: check
+ check: length(store.schema_info) == 5
+ failure_message: "Expected 5 columns but found {length(store.schema_info)}"
+
+ # Verify column names and types
+ - type: check
+ check: store.schema_info[0].column_name == "id" && store.schema_info[0].data_type == "bigint"
+ failure_message: "Expected column 'id' with type bigint, got {store.schema_info[0].column_name} / {store.schema_info[0].data_type}"
+
+ - type: check
+ check: store.schema_info[1].column_name == "name" && contains(store.schema_info[1].data_type, "varchar")
+ failure_message: "Expected column 'name' with type varchar, got {store.schema_info[1].column_name} / {store.schema_info[1].data_type}"
+
+ - type: check
+ check: store.schema_info[2].column_name == "created_at" && contains(store.schema_info[2].data_type, "datetime")
+ failure_message: "Expected column 'created_at' with type datetime, got {store.schema_info[2].column_name} / {store.schema_info[2].data_type}"
+
+ - type: check
+ check: store.schema_info[3].column_name == "amount" && store.schema_info[3].data_type == "decimal"
+ failure_message: "Expected column 'amount' with type decimal, got {store.schema_info[3].column_name} / {store.schema_info[3].data_type}"
+
+ - type: check
+ check: store.schema_info[4].column_name == "is_active" && store.schema_info[4].data_type == "bit"
+ failure_message: "Expected column 'is_active' with type bit, got {store.schema_info[4].column_name} / {store.schema_info[4].data_type}"
+
+ - type: log
+ message: "SUCCESS: Table definition created with correct schema and 0 rows (definition-only mode)"
+
+ # Cleanup source
+ - type: query
+ connection: '{source.name}'
+ query: DROP TABLE IF EXISTS public.test_definition_only_db
+
+ # Cleanup target
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS dbo.test_definition_only_db
+
+streams:
+ public.test_definition_only_db:
+ object: dbo.test_definition_only_db
diff --git a/cmd/sling/tests/replications/r.90.definition_only_file.yaml b/cmd/sling/tests/replications/r.90.definition_only_file.yaml
new file mode 100644
index 000000000..aa4f87da8
--- /dev/null
+++ b/cmd/sling/tests/replications/r.90.definition_only_file.yaml
@@ -0,0 +1,108 @@
+source: postgres
+target: LOCAL
+
+defaults:
+ mode: definition-only
+
+env:
+ OUTPUT_PATH: temp/definition_only_test
+
+hooks:
+ start:
+ # Create source table with various column types
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_definition_only_file;
+ CREATE TABLE public.test_definition_only_file (
+ id bigint,
+ name varchar(100),
+ created_at timestamp,
+ amount decimal(12,2),
+ is_active boolean
+ );
+ INSERT INTO public.test_definition_only_file VALUES
+ (1, 'test1', now(), 123.45, true),
+ (2, 'test2', now(), 456.78, false),
+ (3, 'test3', now(), 789.01, true);
+
+ # Cleanup output file
+ - command: rm -rf temp/definition_only_test
+
+ end:
+ # Check that execution succeeded
+ - check: execution.status.error == 0
+ on_failure: break
+
+ # Verify output file exists by listing it
+ - type: list
+ id: file_list
+ location: 'local/{env.OUTPUT_PATH}'
+ only: files
+
+ - type: log
+ message: "Files found: {state.file_list}"
+
+ - type: check
+ check: length(state.file_list) >= 1
+ failure_message: "Expected output file to be created"
+
+ # Use DuckDB to describe the parquet file schema and verify column types
+ - type: query
+ connection: duckdb
+ query: DESCRIBE SELECT * FROM '{env.OUTPUT_PATH}/test_definition_only.parquet'
+ into: schema_info
+
+ - type: log
+ message: "Schema info: {store.schema_info}"
+
+ # Verify we have 5 columns
+ - type: check
+ check: length(store.schema_info) == 5
+ failure_message: "Expected 5 columns but found {length(store.schema_info)}"
+
+ # Verify column names and types
+ - type: check
+ check: store.schema_info[0].column_name == "id" && store.schema_info[0].column_type == "BIGINT"
+ failure_message: "Expected column 'id' with type BIGINT, got {store.schema_info[0].column_name} / {store.schema_info[0].column_type}"
+
+ - type: check
+ check: store.schema_info[1].column_name == "name" && contains(store.schema_info[1].column_type, "VARCHAR")
+ failure_message: "Expected column 'name' with type VARCHAR, got {store.schema_info[1].column_name} / {store.schema_info[1].column_type}"
+
+ - type: check
+ check: store.schema_info[2].column_name == "created_at" && contains(store.schema_info[2].column_type, "TIMESTAMP")
+ failure_message: "Expected column 'created_at' with type TIMESTAMP, got {store.schema_info[2].column_name} / {store.schema_info[2].column_type}"
+
+ - type: check
+ check: store.schema_info[3].column_name == "amount" && contains(store.schema_info[3].column_type, "DECIMAL")
+ failure_message: "Expected column 'amount' with type DECIMAL, got {store.schema_info[3].column_name} / {store.schema_info[3].column_type}"
+
+ - type: check
+ check: store.schema_info[4].column_name == "is_active" && store.schema_info[4].column_type == "BOOLEAN"
+ failure_message: "Expected column 'is_active' with type BOOLEAN, got {store.schema_info[4].column_name} / {store.schema_info[4].column_type}"
+
+ # Verify file has 0 rows (definition-only should not copy data)
+ - type: query
+ connection: duckdb
+ query: SELECT COUNT(*) as cnt FROM '{env.OUTPUT_PATH}/test_definition_only.parquet'
+ into: row_count
+
+ - type: check
+ check: int_parse(store.row_count[0].cnt) == 0
+ failure_message: "Expected 0 rows but found {store.row_count[0].cnt}"
+
+ - type: log
+ message: "SUCCESS: Parquet file definition created with correct schema (definition-only mode)"
+
+ # Cleanup source
+ - type: query
+ connection: '{source.name}'
+ query: DROP TABLE IF EXISTS public.test_definition_only_file
+
+ # Cleanup output file
+ - command: rm -rf temp/definition_only_test
+
+streams:
+ public.test_definition_only_file:
+ object: '{env.OUTPUT_PATH}/test_definition_only.parquet'
diff --git a/cmd/sling/tests/replications/r.91.definition_only_file_source.yaml b/cmd/sling/tests/replications/r.91.definition_only_file_source.yaml
new file mode 100644
index 000000000..cc28d007c
--- /dev/null
+++ b/cmd/sling/tests/replications/r.91.definition_only_file_source.yaml
@@ -0,0 +1,106 @@
+source: LOCAL
+target: postgres
+
+defaults:
+ mode: definition-only
+
+hooks:
+ start:
+ # Drop target table if exists
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS public.test_definition_only_file_source
+
+ end:
+ # Check that execution succeeded
+ - check: execution.status.error == 0
+ on_failure: break
+
+ # Verify table was created in target
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT table_name FROM information_schema.tables
+ WHERE table_schema = 'public' AND table_name = 'test_definition_only_file_source'
+ into: table_check
+
+ - type: check
+ check: length(store.table_check) == 1
+ failure_message: "Expected table to be created but it doesn't exist"
+
+ # Verify table has 0 rows (definition-only should not copy data)
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM public.test_definition_only_file_source
+ into: row_count
+
+ - type: check
+ check: int_parse(store.row_count[0].cnt) == 0
+ failure_message: "Expected 0 rows but found {store.row_count[0].cnt}"
+
+ # Get column information
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT column_name, data_type
+ FROM information_schema.columns
+ WHERE table_schema = 'public' AND table_name = 'test_definition_only_file_source'
+ ORDER BY ordinal_position
+ into: schema_info
+
+ - type: log
+ message: "Schema info: {store.schema_info}"
+
+ # Verify we have 10 columns (test1.parquet: id, first_name, last_name, email, target, create_dt, date, rating, code + _sling_loaded_at)
+ - type: check
+ check: length(store.schema_info) == 10
+ failure_message: "Expected 10 columns but found {length(store.schema_info)}"
+
+ # Verify column names and types
+ - type: check
+ check: store.schema_info[0].column_name == "id" && store.schema_info[0].data_type == "bigint"
+ failure_message: "Expected column 'id' with type bigint, got {store.schema_info[0].column_name} / {store.schema_info[0].data_type}"
+
+ - type: check
+ check: store.schema_info[1].column_name == "first_name" && store.schema_info[1].data_type == "text"
+ failure_message: "Expected column 'first_name' with type text, got {store.schema_info[1].column_name} / {store.schema_info[1].data_type}"
+
+ - type: check
+ check: store.schema_info[2].column_name == "last_name" && store.schema_info[2].data_type == "text"
+ failure_message: "Expected column 'last_name' with type text, got {store.schema_info[2].column_name} / {store.schema_info[2].data_type}"
+
+ - type: check
+ check: store.schema_info[3].column_name == "email" && store.schema_info[3].data_type == "text"
+ failure_message: "Expected column 'email' with type text, got {store.schema_info[3].column_name} / {store.schema_info[3].data_type}"
+
+ - type: check
+ check: store.schema_info[4].column_name == "target" && store.schema_info[4].data_type == "boolean"
+ failure_message: "Expected column 'target' with type boolean, got {store.schema_info[4].column_name} / {store.schema_info[4].data_type}"
+
+ - type: check
+ check: store.schema_info[5].column_name == "create_dt" && contains(store.schema_info[5].data_type, "timestamp")
+ failure_message: "Expected column 'create_dt' with type timestamp, got {store.schema_info[5].column_name} / {store.schema_info[5].data_type}"
+
+ - type: check
+ check: store.schema_info[6].column_name == "date" && contains(store.schema_info[6].data_type, "timestamp")
+ failure_message: "Expected column 'date' with type timestamp, got {store.schema_info[6].column_name} / {store.schema_info[6].data_type}"
+
+ - type: check
+ check: store.schema_info[7].column_name == "rating" && store.schema_info[7].data_type == "numeric"
+ failure_message: "Expected column 'rating' with type numeric, got {store.schema_info[7].column_name} / {store.schema_info[7].data_type}"
+
+ - type: check
+ check: store.schema_info[8].column_name == "code" && store.schema_info[8].data_type == "numeric"
+ failure_message: "Expected column 'code' with type numeric, got {store.schema_info[8].column_name} / {store.schema_info[8].data_type}"
+
+ - type: log
+ message: "SUCCESS: Table created from parquet file source with schema and 0 rows (definition-only mode)"
+
+ # Cleanup target
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS public.test_definition_only_file_source
+
+streams:
+ cmd/sling/tests/files/test1.parquet:
+ object: public.test_definition_only_file_source
diff --git a/cmd/sling/tests/replications/r.92.oracle_xmltype_bigquery.yaml b/cmd/sling/tests/replications/r.92.oracle_xmltype_bigquery.yaml
new file mode 100644
index 000000000..cf7f87d65
--- /dev/null
+++ b/cmd/sling/tests/replications/r.92.oracle_xmltype_bigquery.yaml
@@ -0,0 +1,125 @@
+# Test for Oracle XMLTYPE column transfer to BigQuery
+# Issue: Process hangs when Oracle table has XMLTYPE column
+# See: https://github.com/slingdata-io/sling-cli/issues/xxx
+# Debug log shows: "using text since type 'xmltype' not mapped for col 'XMLRECORD'"
+# The process then hangs for extended periods during BulkExportFlow
+
+source: ORACLE
+target: BIGQUERY
+
+defaults:
+ mode: full-refresh
+
+hooks:
+ start:
+ # Clean up any existing test table in Oracle
+ - type: query
+ connection: '{source.name}'
+ query: |
+ BEGIN
+ EXECUTE IMMEDIATE 'DROP TABLE ORACLE.TEST_XMLTYPE_TRANSFER PURGE';
+ EXCEPTION
+ WHEN OTHERS THEN
+ IF SQLCODE != -942 THEN
+ RAISE;
+ END IF;
+ END;
+
+ # Create test table with XMLTYPE column
+ - type: query
+ connection: '{source.name}'
+ query: |
+ CREATE TABLE ORACLE.TEST_XMLTYPE_TRANSFER (
+ id NUMBER PRIMARY KEY,
+ name VARCHAR2(100),
+ xmlrecord XMLTYPE,
+ created_at TIMESTAMP DEFAULT SYSTIMESTAMP
+ )
+
+ # Insert test data with XML content
+ - type: query
+ connection: '{source.name}'
+ query: |
+ INSERT INTO ORACLE.TEST_XMLTYPE_TRANSFER (id, name, xmlrecord)
+      VALUES (1, 'Record 1', XMLTYPE('<record><name>Test XML 1</name><value>100</value></record>'))
+
+ - type: query
+ connection: '{source.name}'
+ query: |
+ INSERT INTO ORACLE.TEST_XMLTYPE_TRANSFER (id, name, xmlrecord)
+      VALUES (2, 'Record 2', XMLTYPE('<record><name>Test XML 2</name><nested>Data</nested></record>'))
+
+ - type: query
+ connection: '{source.name}'
+ query: |
+ INSERT INTO ORACLE.TEST_XMLTYPE_TRANSFER (id, name, xmlrecord)
+ VALUES (3, 'Record 3', NULL)
+
+ # Clean up target table in BigQuery
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS `sling_test.test_xmltype_transfer`
+
+ end:
+ # If errored, do not proceed with verification
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify data was transferred
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT id, name, xmlrecord, created_at
+ FROM `sling_test.test_xmltype_transfer`
+ ORDER BY id
+ into: result
+
+ - type: log
+ message: |
+ Result from BigQuery:
+ { pretty_table(store.result) }
+
+ # Check row count (use length function)
+ - type: check
+ check: length(store.result) == 3
+ message: "Should have transferred 3 rows"
+
+ # Check that XMLTYPE was converted to text (use content from inner XML to avoid < parsing issues)
+ - type: check
+ check: contains(store.result[0].xmlrecord, "Test XML 1")
+ message: "XMLTYPE content should be preserved as text"
+
+ - type: check
+ check: contains(store.result[1].xmlrecord, "nested")
+ message: "Second XMLTYPE record should contain nested XML"
+
+ # Third record has NULL XML
+ - type: check
+ check: is_null(store.result[2].xmlrecord) || store.result[2].xmlrecord == ""
+ message: "NULL XMLTYPE should transfer as null/empty"
+
+ - type: log
+ message: "SUCCESS: Oracle XMLTYPE to BigQuery transfer completed without hanging!"
+
+ # Cleanup Oracle table
+ - type: query
+ connection: '{source.name}'
+ query: |
+ BEGIN
+ EXECUTE IMMEDIATE 'DROP TABLE ORACLE.TEST_XMLTYPE_TRANSFER PURGE';
+ EXCEPTION
+ WHEN OTHERS THEN
+ IF SQLCODE != -942 THEN
+ RAISE;
+ END IF;
+ END;
+
+ # Cleanup BigQuery table
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS `sling_test.test_xmltype_transfer`
+
+streams:
+ ORACLE.TEST_XMLTYPE_TRANSFER:
+ object: sling_test.test_xmltype_transfer
diff --git a/cmd/sling/tests/replications/r.93.mysql_load_data_local.yaml b/cmd/sling/tests/replications/r.93.mysql_load_data_local.yaml
new file mode 100644
index 000000000..715988fde
--- /dev/null
+++ b/cmd/sling/tests/replications/r.93.mysql_load_data_local.yaml
@@ -0,0 +1,54 @@
+# Test MySQL LoadDataLocal using RegisterReaderHandler pattern
+# This validates that LOAD DATA LOCAL INFILE works with the go-sql-driver's
+# native Reader:: handler pattern without requiring an external mysql binary.
+source: local
+target: mysql
+
+defaults:
+ mode: full-refresh
+
+hooks:
+ end:
+ # Check execution succeeded
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify data was loaded
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM mysql.test_load_local
+ into: result
+
+ - type: log
+ message: "Row count: {store.result[0].cnt}"
+
+ # Verify row count matches source file (18 rows in test1.1.csv)
+ - type: check
+ check: int_parse(store.result[0].cnt) == 18
+ failure_message: "Expected 18 rows but found {store.result[0].cnt}"
+
+ # Sample some data to verify correctness
+ - type: query
+ connection: '{target.name}'
+ query: SELECT id, first_name, last_name, email FROM mysql.test_load_local WHERE id = 1
+ into: sample_row
+
+ - type: log
+ message: "Sample row: {store.sample_row}"
+
+ - type: check
+ check: int_parse(store.sample_row[0].id) == 1
+ failure_message: "Expected id=1 but found {store.sample_row[0].id}"
+
+ - type: log
+ message: "SUCCESS: MySQL LoadDataLocal test passed"
+
+ # Cleanup target table
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS mysql.test_load_local
+
+streams:
+ file://cmd/sling/tests/files/test1.1.csv:
+ object: mysql.test_load_local
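
For context on the `Reader::` handler pattern the r.93 header mentions, this is roughly how go-sql-driver/mysql exposes it: a named `io.Reader` is registered with the driver and then referenced in the `LOAD DATA LOCAL INFILE` statement. The DSN, table name, and CSV payload below are placeholders, the MySQL server must allow local infile, and this sketches the driver API rather than sling's code.

```go
package main

import (
	"database/sql"
	"io"
	"log"
	"strings"

	"github.com/go-sql-driver/mysql"
)

func main() {
	db, err := sql.Open("mysql", "user:pass@tcp(127.0.0.1:3306)/mysql")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Register an in-memory reader under a name; no temp file or external
	// mysql binary is involved.
	mysql.RegisterReaderHandler("test_load_local", func() io.Reader {
		return strings.NewReader("1,Alice\n2,Bob\n")
	})
	defer mysql.DeregisterReaderHandler("test_load_local")

	// Reference the registered reader as 'Reader::<name>'.
	_, err = db.Exec(`LOAD DATA LOCAL INFILE 'Reader::test_load_local'
		INTO TABLE test_load_local
		FIELDS TERMINATED BY ','
		LINES TERMINATED BY '\n'
		(id, first_name)`)
	if err != nil {
		log.Fatal(err)
	}
}
```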
diff --git a/cmd/sling/tests/replications/r.94.mysql_load_data_local_nulls.yaml b/cmd/sling/tests/replications/r.94.mysql_load_data_local_nulls.yaml
new file mode 100644
index 000000000..e05abe323
--- /dev/null
+++ b/cmd/sling/tests/replications/r.94.mysql_load_data_local_nulls.yaml
@@ -0,0 +1,211 @@
+# Test MySQL LoadDataLocal NULL handling
+# Validates that NULL values are correctly transmitted via LOAD DATA LOCAL INFILE
+# Uses PostgreSQL as source with generate_data and manual NULL insertions
+source: postgres
+target: mysql
+
+defaults:
+ mode: full-refresh
+
+hooks:
+ start:
+ # Generate test data in PostgreSQL
+ - type: query
+ connection: '{source.name}'
+ operation: generate_data
+ params:
+ table: public.mysql_null_test
+ rows: 50
+ columns:
+ col_bigint: bigint
+ col_bool: bool
+ col_date: date
+ col_datetime: datetime
+ col_decimal: decimal
+ col_integer: integer
+ col_smallint: smallint
+ col_string: string
+ col_text: text
+ col_float: float
+
+ # Insert 3 rows with NULL values and special characters
+ - type: query
+ connection: '{source.name}'
+ query: |
+ INSERT INTO public.mysql_null_test
+ (col_bigint, col_bool, col_date, col_datetime, col_decimal, col_integer, col_smallint, col_string, col_text, col_float)
+ VALUES
+ (NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL),
+ (999999, NULL, NULL, '2024-01-15 10:30:00', NULL, 12345, NULL, NULL, 'text with value', NULL),
+ (888888, true, '2024-06-20', '2024-06-20 15:45:30', 123.456, 54321, 100, 'string with "double quotes" and ''single quotes''', 'text with tabs and
+ newlines and "quotes" here', 999.99);
+
+ end:
+ # Check execution succeeded
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify row count matches (50 generated + 3 manual = 53)
+ - type: query
+ connection: '{source.name}'
+ query: SELECT COUNT(*) as cnt FROM public.mysql_null_test
+ into: source_count
+
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM mysql.mysql_null_test
+ into: target_count
+
+ - type: log
+ message: "Source rows: {store.source_count[0].cnt}, Target rows: {store.target_count[0].cnt}"
+
+ - type: check
+ check: store.source_count[0].cnt == store.target_count[0].cnt
+ failure_message: "Row count mismatch: source={store.source_count[0].cnt}, target={store.target_count[0].cnt}"
+
+ # Count NULL values in source
+ - type: query
+ connection: '{source.name}'
+ query: |
+ SELECT
+ SUM(CASE WHEN col_string IS NULL THEN 1 ELSE 0 END) as null_string_cnt,
+ SUM(CASE WHEN col_integer IS NULL THEN 1 ELSE 0 END) as null_integer_cnt,
+ SUM(CASE WHEN col_decimal IS NULL THEN 1 ELSE 0 END) as null_decimal_cnt,
+ SUM(CASE WHEN col_bool IS NULL THEN 1 ELSE 0 END) as null_bool_cnt,
+ SUM(CASE WHEN col_text IS NULL THEN 1 ELSE 0 END) as null_text_cnt
+ FROM public.mysql_null_test
+ into: source_nulls
+
+ # Count NULL values in target
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT
+ SUM(CASE WHEN col_string IS NULL THEN 1 ELSE 0 END) as null_string_cnt,
+ SUM(CASE WHEN col_integer IS NULL THEN 1 ELSE 0 END) as null_integer_cnt,
+ SUM(CASE WHEN col_decimal IS NULL THEN 1 ELSE 0 END) as null_decimal_cnt,
+ SUM(CASE WHEN col_bool IS NULL THEN 1 ELSE 0 END) as null_bool_cnt,
+ SUM(CASE WHEN col_text IS NULL THEN 1 ELSE 0 END) as null_text_cnt
+ FROM mysql.mysql_null_test
+ into: target_nulls
+
+ - type: log
+ message: |
+ Source NULLs: string={store.source_nulls[0].null_string_cnt}, integer={store.source_nulls[0].null_integer_cnt}, decimal={store.source_nulls[0].null_decimal_cnt}
+ Target NULLs: string={store.target_nulls[0].null_string_cnt}, integer={store.target_nulls[0].null_integer_cnt}, decimal={store.target_nulls[0].null_decimal_cnt}
+
+ # Verify NULL counts match for each column type
+ - type: check
+ check: int_parse(store.source_nulls[0].null_string_cnt) == int_parse(store.target_nulls[0].null_string_cnt)
+ failure_message: "NULL string count mismatch: source={store.source_nulls[0].null_string_cnt}, target={store.target_nulls[0].null_string_cnt}"
+
+ - type: check
+ check: int_parse(store.source_nulls[0].null_integer_cnt) == int_parse(store.target_nulls[0].null_integer_cnt)
+ failure_message: "NULL integer count mismatch: source={store.source_nulls[0].null_integer_cnt}, target={store.target_nulls[0].null_integer_cnt}"
+
+ - type: check
+ check: int_parse(store.source_nulls[0].null_decimal_cnt) == int_parse(store.target_nulls[0].null_decimal_cnt)
+ failure_message: "NULL decimal count mismatch: source={store.source_nulls[0].null_decimal_cnt}, target={store.target_nulls[0].null_decimal_cnt}"
+
+ - type: check
+ check: int_parse(store.source_nulls[0].null_text_cnt) == int_parse(store.target_nulls[0].null_text_cnt)
+ failure_message: "NULL text count mismatch: source={store.source_nulls[0].null_text_cnt}, target={store.target_nulls[0].null_text_cnt}"
+
+ # Calculate checksums on source (PostgreSQL)
+ - type: query
+ connection: '{source.name}'
+ query: |
+ SELECT
+ SUM(COALESCE(col_bigint, 0)) as sum_bigint,
+ SUM(COALESCE(col_integer, 0)) as sum_integer,
+ SUM(COALESCE(col_smallint, 0)) as sum_smallint,
+ SUM(COALESCE(CAST(col_decimal AS NUMERIC), 0)) as sum_decimal,
+ SUM(COALESCE(LENGTH(col_string), 0)) as sum_string_len,
+ SUM(COALESCE(LENGTH(col_text), 0)) as sum_text_len
+ FROM public.mysql_null_test
+ into: source_checksum
+
+ # Calculate checksums on target (MySQL)
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT
+ SUM(COALESCE(col_bigint, 0)) as sum_bigint,
+ SUM(COALESCE(col_integer, 0)) as sum_integer,
+ SUM(COALESCE(col_smallint, 0)) as sum_smallint,
+ SUM(COALESCE(CAST(col_decimal AS DECIMAL(30,10)), 0)) as sum_decimal,
+ SUM(COALESCE(LENGTH(col_string), 0)) as sum_string_len,
+ SUM(COALESCE(LENGTH(col_text), 0)) as sum_text_len
+ FROM mysql.mysql_null_test
+ into: target_checksum
+
+ - type: log
+ message: |
+ Checksums:
+ Source: bigint={store.source_checksum[0].sum_bigint}, integer={store.source_checksum[0].sum_integer}, string_len={store.source_checksum[0].sum_string_len}
+ Target: bigint={store.target_checksum[0].sum_bigint}, integer={store.target_checksum[0].sum_integer}, string_len={store.target_checksum[0].sum_string_len}
+
+ # Verify checksums match
+ - type: check
+ check: int_parse(store.source_checksum[0].sum_bigint) == int_parse(store.target_checksum[0].sum_bigint)
+ failure_message: "Bigint checksum mismatch"
+
+ - type: check
+ check: int_parse(store.source_checksum[0].sum_integer) == int_parse(store.target_checksum[0].sum_integer)
+ failure_message: "Integer checksum mismatch"
+
+ - type: check
+ check: int_parse(store.source_checksum[0].sum_string_len) == int_parse(store.target_checksum[0].sum_string_len)
+ failure_message: "String length checksum mismatch"
+
+ - type: check
+ check: int_parse(store.source_checksum[0].sum_text_len) == int_parse(store.target_checksum[0].sum_text_len)
+ failure_message: "Text length checksum mismatch"
+
+ # Verify special characters row (col_bigint = 888888) transferred correctly
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT col_string, col_text
+ FROM mysql.mysql_null_test
+ WHERE col_bigint = 888888
+ into: special_row
+
+ - type: log
+ message: |
+ Special characters row:
+ col_string: {store.special_row[0].col_string}
+ col_text: {store.special_row[0].col_text}
+
+ - type: check
+ check: contains(store.special_row[0].col_string, "\"double quotes\"") && contains(store.special_row[0].col_string, "'single quotes'")
+ failure_message: "col_string should contain both double and single quotes"
+
+ - type: check
+ check: contains(store.special_row[0].col_text, "tabs and")
+ failure_message: "col_text should contain 'tabs and'"
+
+ - type: check
+ check: contains(store.special_row[0].col_text, "newlines and")
+ failure_message: "col_text should contain 'newlines and'"
+
+ - type: check
+ check: contains(store.special_row[0].col_text, "quotes")
+ failure_message: "col_text should contain quotes"
+
+ - type: log
+ message: "SUCCESS: MySQL LoadDataLocal NULL handling test passed"
+
+ # Cleanup
+ - type: query
+ connection: '{source.name}'
+ query: DROP TABLE IF EXISTS public.mysql_null_test
+
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS mysql.mysql_null_test
+
+streams:
+ public.mysql_null_test:
+ object: mysql.mysql_null_test
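
On the NULL-handling side that r.94 verifies, the relevant MySQL behavior is that `LOAD DATA` reads an unquoted `\N` field as SQL NULL (with the default `ESCAPED BY '\\'`), while an empty field stays an empty string; a real encoder also has to escape delimiters and newlines, which the special-characters row in r.94 exercises. The snippet below is a minimal sketch of the NULL-vs-empty distinction only, not sling's actual encoder.

```go
package main

import (
	"fmt"
	"strings"
)

// encodeField shows the NULL-vs-empty distinction LOAD DATA relies on:
// nil becomes the \N sentinel, everything else is rendered as text.
func encodeField(v any) string {
	if v == nil {
		return `\N`
	}
	return fmt.Sprintf("%v", v)
}

func main() {
	row := []any{nil, 999999, "", "text with value"}
	fields := make([]string, len(row))
	for i, v := range row {
		fields[i] = encodeField(v)
	}
	fmt.Println(strings.Join(fields, ",")) // \N,999999,,text with value
}
```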
diff --git a/cmd/sling/tests/replications/r.95.select_column_rename.yaml b/cmd/sling/tests/replications/r.95.select_column_rename.yaml
new file mode 100644
index 000000000..0f470d47e
--- /dev/null
+++ b/cmd/sling/tests/replications/r.95.select_column_rename.yaml
@@ -0,0 +1,322 @@
+source: postgres
+target: postgres
+
+defaults:
+ mode: full-refresh
+
+hooks:
+ start:
+ # Create main test table with many columns
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_select_main;
+ CREATE TABLE public.test_select_main (
+ id INT,
+ first_name VARCHAR(100),
+ last_name VARCHAR(100),
+ email VARCHAR(100),
+ phone VARCHAR(50),
+ address_line1 VARCHAR(200),
+ address_line2 VARCHAR(200),
+ city VARCHAR(100),
+ state VARCHAR(50),
+ zip_code VARCHAR(20),
+ country VARCHAR(100),
+ created_at TIMESTAMP,
+ updated_at TIMESTAMP,
+ is_active BOOLEAN,
+ score DECIMAL(10,2)
+ );
+ INSERT INTO public.test_select_main VALUES
+ (1, 'Alice', 'Smith', 'alice@example.com', '555-0101', '123 Main St', 'Apt 1', 'New York', 'NY', '10001', 'USA', '2024-01-01 10:00:00', '2024-01-15 12:00:00', true, 95.50),
+ (2, 'Bob', 'Jones', 'bob@example.com', '555-0102', '456 Oak Ave', NULL, 'Los Angeles', 'CA', '90001', 'USA', '2024-01-02 11:00:00', '2024-01-16 13:00:00', true, 87.25),
+ (3, 'Charlie', 'Brown', 'charlie@example.com', '555-0103', '789 Pine Rd', 'Suite 100', 'Chicago', 'IL', '60601', 'USA', '2024-01-03 12:00:00', '2024-01-17 14:00:00', false, 72.00);
+
+ # Create copy for exclusion test
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_select_main_exclude;
+ CREATE TABLE public.test_select_main_exclude AS SELECT * FROM public.test_select_main;
+
+ # Create copy for wildcard test
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_select_main_wildcard;
+ CREATE TABLE public.test_select_main_wildcard AS SELECT * FROM public.test_select_main;
+
+ # Create secondary test table for custom SQL tests
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_select_orders;
+ CREATE TABLE public.test_select_orders (
+ order_id INT,
+ customer_id INT,
+ order_date DATE,
+ ship_date DATE,
+ total_amount DECIMAL(12,2),
+ discount_amount DECIMAL(10,2),
+ tax_amount DECIMAL(10,2),
+ status VARCHAR(50)
+ );
+ INSERT INTO public.test_select_orders VALUES
+ (101, 1, '2024-01-10', '2024-01-12', 150.00, 10.00, 12.60, 'delivered'),
+ (102, 2, '2024-01-11', '2024-01-14', 250.00, 25.00, 20.25, 'delivered'),
+ (103, 3, '2024-01-12', NULL, 75.00, 0.00, 6.75, 'pending');
+
+ # Cleanup target tables
+ - type: query
+ connection: '{target.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_select_rename_basic;
+ DROP TABLE IF EXISTS public.test_select_exclude;
+ DROP TABLE IF EXISTS public.test_select_wildcard;
+ DROP TABLE IF EXISTS public.test_select_mixed;
+ DROP TABLE IF EXISTS public.test_select_sql_rename;
+ DROP TABLE IF EXISTS public.test_select_sql_fields;
+
+ end:
+ # Check for errors first
+ - check: execution.status.error == 0
+ on_failure: break
+
+ #
+ # Test 1: Basic column renaming (table stream)
+ #
+ - type: query
+ connection: '{target.name}'
+ query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_rename_basic' ORDER BY ordinal_position
+ into: basic_cols
+
+ - log: "Test 1 - Basic rename columns: {store.basic_cols}"
+
+ - check: store.basic_cols[0].column_name == "user_id"
+
+ - check: store.basic_cols[1].column_name == "full_name"
+
+ - check: store.basic_cols[2].column_name == "user_email"
+
+ - type: query
+ connection: '{target.name}'
+ query: SELECT * FROM public.test_select_rename_basic ORDER BY user_id
+ into: basic_data
+
+ - check: store.basic_data[0].full_name == "Alice"
+
+ - check: store.basic_data[1].user_email == "bob@example.com"
+
+ #
+ # Test 2: Exclusion with - prefix (table stream)
+ #
+ - type: query
+ connection: '{target.name}'
+ query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_exclude' ORDER BY ordinal_position
+ into: exclude_cols
+
+ - log: "Test 2 - Exclude columns: {store.exclude_cols}"
+
+ # Verify excluded columns are not present (address_line1, address_line2, city, state, zip_code, country)
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_exclude' AND column_name IN ('address_line1', 'address_line2', 'city', 'state', 'zip_code', 'country')
+ into: excluded_check
+
+ - check: int_parse(store.excluded_check[0].cnt) == 0
+
+ # Verify we have the expected remaining columns (id should exist)
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_exclude' AND column_name = 'id'
+ into: id_check
+
+ - check: int_parse(store.id_check[0].cnt) == 1
+
+ #
+ # Test 3: Wildcard exclusion with glob pattern (table stream)
+ #
+ - type: query
+ connection: '{target.name}'
+ query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_wildcard' ORDER BY ordinal_position
+ into: wildcard_cols
+
+ - log: "Test 3 - Wildcard exclude columns: {store.wildcard_cols}"
+
+ # Should NOT have address_line1, address_line2 (excluded by address_*)
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_wildcard' AND column_name LIKE 'address%'
+ into: wildcard_check
+
+ - check: int_parse(store.wildcard_check[0].cnt) == 0
+
+ #
+ # Test 4: Mixed - select specific columns with rename (using {fields})
+ #
+ - type: query
+ connection: '{target.name}'
+ query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_mixed' ORDER BY ordinal_position
+ into: mixed_cols
+
+ - log: "Test 4 - Mixed select columns: {store.mixed_cols}"
+
+ # Verify exactly 4 columns
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_mixed'
+ into: mixed_count
+
+ - check: int_parse(store.mixed_count[0].cnt) == 4
+
+ - check: store.mixed_cols[0].column_name == "user_id"
+
+ - check: store.mixed_cols[1].column_name == "name"
+
+ - check: store.mixed_cols[2].column_name == "contact_email"
+
+ - check: store.mixed_cols[3].column_name == "active"
+
+ - type: query
+ connection: '{target.name}'
+ query: SELECT * FROM public.test_select_mixed ORDER BY user_id
+ into: mixed_data
+
+ - check: store.mixed_data[0].name == "Alice"
+
+ - check: store.mixed_data[2].contact_email == "charlie@example.com"
+
+ #
+ # Test 5: Table stream with select rename (orders table)
+ #
+ - type: query
+ connection: '{target.name}'
+ query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_sql_rename' ORDER BY ordinal_position
+ into: sql_rename_cols
+
+ - log: "Test 5 - Table with rename columns: {store.sql_rename_cols}"
+
+ - check: store.sql_rename_cols[0].column_name == "order_number"
+
+ - check: store.sql_rename_cols[1].column_name == "customer"
+
+ - check: store.sql_rename_cols[2].column_name == "order_total"
+
+ - type: query
+ connection: '{target.name}'
+ query: SELECT * FROM public.test_select_sql_rename ORDER BY order_number
+ into: sql_rename_data
+
+ - check: int_parse(store.sql_rename_data[0].order_number) == 101
+
+ - check: float_parse(store.sql_rename_data[1].order_total) == 250.00
+
+ #
+ # Test 6: Custom SQL with {fields} placeholder and select rename
+ #
+ - type: query
+ connection: '{target.name}'
+ query: SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_sql_fields' ORDER BY ordinal_position
+ into: sql_fields_cols
+
+ - log: "Test 6 - SQL {fields} placeholder columns: {store.sql_fields_cols}"
+
+ # Verify exactly 3 columns
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as cnt FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'test_select_sql_fields'
+ into: sql_fields_count
+
+ - check: int_parse(store.sql_fields_count[0].cnt) == 3
+
+ - check: store.sql_fields_cols[0].column_name == "id"
+
+ - check: store.sql_fields_cols[1].column_name == "customer_name"
+
+ - check: store.sql_fields_cols[2].column_name == "customer_score"
+
+ - type: query
+ connection: '{target.name}'
+ query: SELECT * FROM public.test_select_sql_fields ORDER BY id
+ into: sql_fields_data
+
+ - check: store.sql_fields_data[0].customer_name == "Alice"
+
+ - check: float_parse(store.sql_fields_data[1].customer_score) == 87.25
+
+ # Cleanup
+ - type: query
+ connection: '{source.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_select_main;
+ DROP TABLE IF EXISTS public.test_select_main_exclude;
+ DROP TABLE IF EXISTS public.test_select_main_wildcard;
+ DROP TABLE IF EXISTS public.test_select_orders;
+
+ - type: query
+ connection: '{target.name}'
+ query: |
+ DROP TABLE IF EXISTS public.test_select_rename_basic;
+ DROP TABLE IF EXISTS public.test_select_exclude;
+ DROP TABLE IF EXISTS public.test_select_wildcard;
+ DROP TABLE IF EXISTS public.test_select_mixed;
+ DROP TABLE IF EXISTS public.test_select_sql_rename;
+ DROP TABLE IF EXISTS public.test_select_sql_fields;
+
+streams:
+ # Test 1: Basic column renaming (table stream - select works directly)
+ public.test_select_main:
+ object: public.test_select_rename_basic
+ select:
+ - 'id as user_id'
+ - 'first_name as full_name'
+ - 'email as user_email'
+
+ # Test 2: Exclusion - exclude all address-related columns (table stream)
+ # Note: When using exclusions, ALL select items must be exclusions (- prefix)
+ public.test_select_main_exclude:
+ object: public.test_select_exclude
+ select:
+ - '-address_line1'
+ - '-address_line2'
+ - '-city'
+ - '-state'
+ - '-zip_code'
+ - '-country'
+
+ # Test 3: Wildcard exclusion - exclude address_* columns (table stream)
+ public.test_select_main_wildcard:
+ object: public.test_select_wildcard
+ select:
+ - '-address_*'
+
+ # Test 4: Mixed - select specific columns with some renamed (using {fields})
+ test_select_mixed:
+ sql: SELECT {fields} FROM public.test_select_main
+ object: public.test_select_mixed
+ select:
+ - 'id as user_id'
+ - 'first_name as name'
+ - 'email as contact_email'
+ - 'is_active as active'
+
+ # Test 5: Table stream with select rename (orders table)
+ public.test_select_orders:
+ object: public.test_select_sql_rename
+ select:
+ - 'order_id as order_number'
+ - 'customer_id as customer'
+ - 'total_amount as order_total'
+
+ # Test 6: Custom SQL with {fields} placeholder and select rename
+ test_select_sql_fields:
+ sql: |
+ SELECT {fields}
+ FROM public.test_select_main
+ WHERE is_active = true
+ object: public.test_select_sql_fields
+ select:
+ - 'id'
+ - 'first_name as customer_name'
+ - 'score as customer_score'
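+
+# Illustrative sketch (not executed by this test): for a table stream like
+# public.test_select_main above, the rename entries are expected to translate
+# into a source query roughly of the form
+#   SELECT "id" as "user_id", "first_name" as "full_name", "email" as "user_email"
+#   FROM public.test_select_main
+# while exclusion entries ('-col' or '-prefix_*') simply drop the matching
+# columns from the selected field list.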
diff --git a/cmd/sling/tests/replications/r.96.s3_multi_bucket.yaml b/cmd/sling/tests/replications/r.96.s3_multi_bucket.yaml
new file mode 100644
index 000000000..8a901465e
--- /dev/null
+++ b/cmd/sling/tests/replications/r.96.s3_multi_bucket.yaml
@@ -0,0 +1,83 @@
+# Test S3 multi-bucket access with a single connection
+# This test validates that a single S3 connection (with valid AWS credentials)
+# can access files from multiple buckets by specifying full S3 URIs in stream names.
+
+source: aws_s3
+target: postgres
+
+defaults:
+ mode: full-refresh
+ target_options:
+ adjust_column_type: true
+
+hooks:
+ end:
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ - type: query
+ connection: '{target.name}'
+ query: select count(*) as cnt from public.s3_multi_bucket_test1
+ into: result1
+
+ - type: query
+ connection: '{target.name}'
+ query: select count(*) as cnt from public.s3_multi_bucket_test2
+ into: result2
+
+ - type: query
+ connection: '{target.name}'
+ query: select count(*) as cnt from public.s3_multi_bucket_test3
+ into: result3
+
+ - type: log
+ message: |
+ ✓ Stream 1 (test-bucket-west-345141): {store.result1[0].cnt} rows
+ ✓ Stream 2 (ocral-data-1): {store.result2[0].cnt} rows
+ ✓ Stream 3 (multi_bucket with files key): {store.result3[0].cnt} rows
+
+ - type: check
+ check: int_parse(store.result1[0].cnt) > 0
+ on_failure: abort
+ message: "FAIL: Stream 1 should have rows"
+
+ - type: check
+ check: int_parse(store.result2[0].cnt) > 0
+ on_failure: abort
+ message: "FAIL: Stream 2 should have rows"
+
+ - type: check
+ check: int_parse(store.result3[0].cnt) > 0
+ on_failure: abort
+ message: "FAIL: Stream 3 (files key) should have rows"
+
+ # Stream 3 should have rows from both files (the combined rows of streams 1 and 2)
+ - type: check
+ check: int_parse(store.result3[0].cnt) == int_parse(store.result1[0].cnt) + int_parse(store.result2[0].cnt)
+ on_failure: abort
+ message: "FAIL: Stream 3 should have combined rows from both buckets"
+
+ - type: log
+ message: "✅ SUCCESS: S3 multi-bucket access with single connection works correctly"
+
+ # Cleanup
+ - type: query
+ connection: '{target.name}'
+ query: |
+ DROP TABLE IF EXISTS public.s3_multi_bucket_test1;
+ DROP TABLE IF EXISTS public.s3_multi_bucket_test2;
+ DROP TABLE IF EXISTS public.s3_multi_bucket_test3;
+
+streams:
+ 's3://test-bucket-west-345141/test1.csv':
+ object: 'public.s3_multi_bucket_test1'
+
+ 's3://ocral-data-1/test1.csv':
+ object: 'public.s3_multi_bucket_test2'
+
+ multi_bucket:
+ files:
+ - s3://test-bucket-west-345141/test1.csv
+ - s3://ocral-data-1/test1.csv
+ object: 'public.s3_multi_bucket_test3'
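+
+# Illustrative sketch (not part of this test): the same cross-bucket access can be
+# tried ad hoc with run flags, assuming the usual flag names, e.g.:
+#   sling run -d --src-conn aws_s3 --src-stream 's3://ocral-data-1/test1.csv' \
+#     --tgt-conn postgres --tgt-object 'public.s3_multi_bucket_test2'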
diff --git a/cmd/sling/tests/replications/r.97.delete_missing_with_transforms.yaml b/cmd/sling/tests/replications/r.97.delete_missing_with_transforms.yaml
new file mode 100644
index 000000000..cd3587115
--- /dev/null
+++ b/cmd/sling/tests/replications/r.97.delete_missing_with_transforms.yaml
@@ -0,0 +1,146 @@
+# Test that transforms don't break delete_missing functionality
+# This tests the bug where transforms are applied during delete detection,
+# causing errors because delete detection only selects PK columns.
+
+source: MSSQL
+target: POSTGRES
+
+hooks:
+ start:
+ # Create source table with 10 rows
+ - type: query
+ connection: '{source.name}'
+ query: |
+ IF OBJECT_ID('dbo.delete_transform_test', 'U') IS NOT NULL DROP TABLE dbo.delete_transform_test;
+ CREATE TABLE dbo.delete_transform_test (
+ id INT PRIMARY KEY,
+ dateadded DATETIME DEFAULT GETDATE(),
+ lastchanged DATETIME DEFAULT GETDATE(),
+ value NVARCHAR(100)
+ );
+ INSERT INTO dbo.delete_transform_test (id, dateadded, lastchanged, value) VALUES
+ (1, '2024-01-15', '2024-01-20', 'row1'),
+ (2, '2024-02-10', '2024-02-25', 'row2'),
+ (3, '2024-03-05', '2024-03-10', 'row3'),
+ (4, '2024-04-01', '2024-04-05', 'row4'),
+ (5, '2024-05-01', '2024-05-15', 'row5'),
+ (6, '2024-06-01', '2024-06-10', 'row6'),
+ (7, '2024-07-01', '2024-07-20', 'row7'),
+ (8, '2024-08-01', '2024-08-25', 'row8'),
+ (9, '2024-09-01', '2024-09-30', 'row9'),
+ (10, '2024-10-01', '2024-10-15', 'row10');
+
+ # Create second source table with only 8 rows (IDs 1-8)
+ # IDs 9 and 10 will be soft-deleted when this syncs to target
+ - type: query
+ connection: '{source.name}'
+ query: |
+ IF OBJECT_ID('dbo.delete_transform_test2', 'U') IS NOT NULL DROP TABLE dbo.delete_transform_test2;
+ CREATE TABLE dbo.delete_transform_test2 (
+ id INT PRIMARY KEY,
+ dateadded DATETIME DEFAULT GETDATE(),
+ lastchanged DATETIME DEFAULT GETDATE(),
+ value NVARCHAR(100)
+ );
+ INSERT INTO dbo.delete_transform_test2 (id, dateadded, lastchanged, value) VALUES
+ (1, '2024-01-15', '2024-01-20', 'row1'),
+ (2, '2024-02-10', '2024-02-25', 'row2'),
+ (3, '2024-03-05', '2024-03-10', 'row3'),
+ (4, '2024-04-01', '2024-04-05', 'row4'),
+ (5, '2024-05-01', '2024-05-15', 'row5'),
+ (6, '2024-06-01', '2024-06-10', 'row6'),
+ (7, '2024-07-01', '2024-07-20', 'row7'),
+ (8, '2024-08-01', '2024-08-25', 'row8');
+
+ end:
+ - type: check
+ check: execution.status.error == 0
+ on_failure: break
+
+ # Verify row count (10 rows total, 2 soft-deleted)
+ - type: query
+ connection: '{target.name}'
+ query: SELECT COUNT(*) as count FROM public.delete_transform_test
+ into: total_count
+
+ - type: check
+ check: int_parse(store.total_count[0].count) == 10
+ failure_message: "Expected 10 total rows, got {store.total_count[0].count}"
+
+ # Verify computed transform column exists
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(*) as col_exists
+ FROM information_schema.columns
+ WHERE table_schema = 'public'
+ AND table_name = 'delete_transform_test'
+ AND column_name = 'computed_date'
+ into: computed_col_exists
+
+ - type: check
+ check: int_parse(store.computed_col_exists[0].col_exists) == 1
+ failure_message: "computed_date column should exist from transform"
+
+ - type: log
+ message: "SUCCESS: computed_date column exists from transform"
+
+ # Verify soft-deleted rows (IDs 9-10) have _sling_deleted_at set
+ - type: query
+ connection: '{target.name}'
+ query: |
+ SELECT COUNT(*) as deleted_count
+ FROM public.delete_transform_test
+ WHERE id IN (9, 10) AND _sling_deleted_at IS NOT NULL
+ into: deleted_count
+
+ - type: check
+ check: int_parse(store.deleted_count[0].deleted_count) == 2
+ failure_message: "Expected 2 soft-deleted rows (IDs 9,10), got {store.deleted_count[0].deleted_count}"
+
+ - type: log
+ message: "SUCCESS: IDs 9,10 are soft-deleted correctly"
+
+ # Cleanup
+ - type: query
+ connection: '{source.name}'
+ query: |
+ IF OBJECT_ID('dbo.delete_transform_test', 'U') IS NOT NULL DROP TABLE dbo.delete_transform_test;
+ IF OBJECT_ID('dbo.delete_transform_test2', 'U') IS NOT NULL DROP TABLE dbo.delete_transform_test2;
+
+ - type: query
+ connection: '{target.name}'
+ query: DROP TABLE IF EXISTS public.delete_transform_test CASCADE;
+
+streams:
+ # First stream: full-refresh to create table with all 10 rows and transform
+ dbo.delete_transform_test:
+ object: public.delete_transform_test
+ mode: full-refresh
+ primary_key: [id]
+ target_options:
+ column_casing: lower
+
+ transforms:
+ - computed_date: >
+ greatest(
+ date_parse(record.dateadded),
+ date_parse(record.lastchanged)
+ )
+
+ # Second stream: incremental with delete_missing to soft-delete rows 9,10
+ # This is where the bug manifests - transforms should NOT be applied during delete detection
+ dbo.delete_transform_test2:
+ object: public.delete_transform_test
+ mode: incremental
+ primary_key: [id]
+ target_options:
+ delete_missing: soft
+ column_casing: lower
+
+ transforms:
+ - computed_date: >
+ greatest(
+ date_parse(record.dateadded),
+ date_parse(record.lastchanged)
+ )
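+
+# Illustrative sketch (not executed here): during delete detection only the
+# primary-key columns are selected from the source, roughly
+#   SELECT id FROM dbo.delete_transform_test2
+# so the computed_date transform, which references record.dateadded and
+# record.lastchanged, must be skipped for that query; applying it there is the
+# failure this test guards against.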
diff --git a/cmd/sling/tests/suite.cli.yaml b/cmd/sling/tests/suite.cli.yaml
index 6c21c6e19..aaa342104 100644
--- a/cmd/sling/tests/suite.cli.yaml
+++ b/cmd/sling/tests/suite.cli.yaml
@@ -864,7 +864,7 @@
sling conns exec mssql file://cmd/sling/tests/replications/r.47.mssql_uniqueidentifier.sql
SLING_CLI_TOKEN='' sling run -d -r cmd/sling/tests/replications/r.47.mssql_uniqueidentifier.yaml
sling run -d -r cmd/sling/tests/replications/r.47.mssql_uniqueidentifier.postgres.yaml
- sling run -d -r cmd/sling/tests/replications/r.47.fabric_uniqueidentifier.yaml
+ # sling run -d -r cmd/sling/tests/replications/r.47.fabric_uniqueidentifier.yaml
output_contains:
- "execution succeeded"
- "DROP TABLE public.unique_identifier_test2"
@@ -1099,11 +1099,11 @@
- '✓ All 100,000 CSV records imported successfully into Postgres'
# Test MSSQL to Databricks with uppercase table name (Issue #664)
-- id: 128
- name: Test MSSQL to Databricks with uppercase table name (Issue #664)
- run: 'sling run -d -r cmd/sling/tests/replications/r.66.mssql_databricks_uppercase_table.yaml'
- output_contains:
- - 'execution succeeded'
+# - id: 128
+# name: Test MSSQL to Databricks with uppercase table name (Issue #664)
+# run: 'sling run -d -r cmd/sling/tests/replications/r.66.mssql_databricks_uppercase_table.yaml'
+# output_contains:
+# - 'execution succeeded'
# Test JSON camelCase to PostgreSQL with column_casing (should only create snake_case columns)
- id: 129
@@ -1381,6 +1381,41 @@
- 'SUCCESS: BIT correctly cast to VARCHAR for mariadb'
- 'SUCCESS: BIT correctly cast to BOOLEAN for mariadb'
+- id: 155
+ name: 'Test SLING_SYNCED_AT_COLUMN with soft delete (MSSQL to Postgres)'
+ run: |
+ sling run --debug -r cmd/sling/tests/replications/r.85.mssql_postgres_synced_at.yaml
+ output_contains:
+ - 'execution succeeded'
+ - "SUCCESS: All 10 rows have _sling_synced_op='I' after full-refresh"
+ - 'SUCCESS: _sling_synced_at column exists'
+ - 'SUCCESS: _sling_deleted_at column does NOT exist'
+ - 'SUCCESS: synced_at_test1 has 2 distinct _sling_synced_at values'
+ - 'SUCCESS: _sling_synced_op column exists'
+ - "SUCCESS: IDs 9,10 have _sling_synced_op='D' (soft deleted)"
+ - "SUCCESS: IDs 1-8 have _sling_synced_op='U' (updated)"
+
+- id: 156
+ name: Test mixed-case record key references in transforms (MySQL to local parquet)
+ run: 'sling run -d -r cmd/sling/tests/replications/r.86.record_key_casing.yaml'
+ streams: 1
+ rows: 3
+ output_contains:
+ - 'SUCCESS: All 3 rows exported successfully with mixed-case column transform'
+ - 'SUCCESS: true_changed_at column was computed correctly'
+ - 'execution succeeded'
+
+- id: 157
+ name: Test mixed-case record key references in transforms (MySQL to BigQuery)
+ run: 'sling run -d -r cmd/sling/tests/replications/r.87.record_key_casing_bigquery.yaml'
+ after: [156]
+ streams: 1
+ rows: 3
+ output_contains:
+ - 'SUCCESS: All 3 rows exported successfully with mixed-case column transform (BigQuery)'
+ - 'SUCCESS: true_changed_at column was computed correctly (BigQuery)'
+ - 'execution succeeded'
+
# # Test PostGIS (PostgreSQL) to GeoJSON as target with geometry column name "geom"
# - id: 150
# name: Test PostGIS (PostgreSQL) to GeoJSON as target
@@ -1402,3 +1437,113 @@
# - '"type":"FeatureCollection"'
# - '"geometry":{"type":"Point","coordinates":[9.09425263416477,53.4920035631827]}'
# - '"properties":{"id":1,"name":"Point 1"}'
+
+# GitHub Issue #694: table_ddl with user-defined PRIMARY KEY and WITH clause
+- id: 158
+ name: 'Test custom table_ddl with PRIMARY KEY and WITH clause (MSSQL)'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.88.table_ddl_with_clause.yaml'
+ streams: 2
+ rows: 2
+ output_contains:
+ - 'execution succeeded'
+ - 'Primary key columns:'
+ - 'Primary key columns (WITH clause table):'
+ - 'Compression info:'
+
+# GitHub Issue #678: definition-only mode
+- id: 159
+ name: 'Test definition-only mode creates table without data (Postgres to MSSQL)'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.89.definition_only_db.yaml'
+ streams: 1
+ rows: 0
+ output_contains:
+ - 'created table definition'
+ - 'execution succeeded'
+ - 'SUCCESS: Table definition created with correct schema and 0 rows'
+
+- id: 160
+ name: 'Test definition-only mode creates parquet file without data'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.90.definition_only_file.yaml'
+ streams: 1
+ rows: 0
+ output_contains:
+ - 'where 1=0'
+ - 'execution succeeded'
+ - 'SUCCESS: Parquet file definition created with correct schema'
+
+- id: 161
+ name: 'Test definition-only mode fails for CSV file target'
+ run: 'sling run --src-conn POSTGRES --src-stream "select 1 as a" --tgt-object file:///tmp/test_def_only.csv --mode definition-only'
+ err: true
+ output_contains:
+ - 'only supports parquet or arrow formats'
+
+- id: 162
+ name: 'Test definition-only mode from parquet file source to database'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.91.definition_only_file_source.yaml'
+ output_contains:
+ - 'execution succeeded'
+ - 'SUCCESS: Table created from parquet file source with schema and 0 rows'
+
+# Test Oracle XMLTYPE column transfer to BigQuery (hang issue)
+- id: 163
+ name: 'Test Oracle XMLTYPE column to BigQuery transfer (hang issue)'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.92.oracle_xmltype_bigquery.yaml'
+ streams: 1
+ rows: 3
+ output_contains:
+ - "using text since type 'xmltype' not mapped"
+ - 'execution succeeded'
+ - 'SUCCESS: Oracle XMLTYPE to BigQuery transfer completed without hanging!'
+
+# Test MySQL LoadDataLocal using RegisterReaderHandler pattern
+- id: 164
+ name: 'Test MySQL LoadDataLocal with native Go driver'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.93.mysql_load_data_local.yaml'
+ streams: 1
+ rows: 18
+ output_contains:
+ - 'execution succeeded'
+ - 'SUCCESS: MySQL LoadDataLocal test passed'
+
+# Test MySQL LoadDataLocal NULL handling with checksums
+- id: 165
+ name: 'Test MySQL LoadDataLocal NULL handling'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.94.mysql_load_data_local_nulls.yaml'
+ streams: 1
+ rows: 53
+ output_contains:
+ - 'execution succeeded'
+ - 'SUCCESS: MySQL LoadDataLocal NULL handling test passed'
+
+# Test column renaming via select option (comprehensive tests)
+- id: 166
+ name: 'Test column renaming via select (comprehensive)'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.95.select_column_rename.yaml'
+ streams: 6
+ output_contains:
+ - 'execution succeeded'
+ - 'Test 1 - Basic rename columns'
+ - 'Test 2 - Exclude columns'
+ - 'Test 3 - Wildcard exclude columns'
+ - 'Test 4 - Mixed select columns'
+ - 'Test 5 - Table with rename columns'
+ - 'Test 6 - SQL {fields} placeholder columns'
+
+# Test S3 multi-bucket access with single connection (different buckets in stream URIs and files key)
+- id: 167
+ name: 'Test S3 multi-bucket access with single connection'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.96.s3_multi_bucket.yaml'
+ streams: 3
+ output_contains:
+ - 'SUCCESS: S3 multi-bucket access with single connection works correctly'
+
+# Test delete_missing with transforms doesn't fail (transforms should be skipped during delete detection)
+- id: 168
+ name: 'Test delete_missing with transforms (transforms skipped during delete detection)'
+ run: 'sling run -d -r cmd/sling/tests/replications/r.97.delete_missing_with_transforms.yaml'
+ streams: 2
+ output_contains:
+ - 'execution succeeded'
+ - 'SUCCESS: computed_date column exists from transform'
+ - 'SUCCESS: IDs 9,10 are soft-deleted correctly'
\ No newline at end of file
diff --git a/core/dbio/api/api.go b/core/dbio/api/api.go
index 5a44ce85e..0b75e9ef5 100644
--- a/core/dbio/api/api.go
+++ b/core/dbio/api/api.go
@@ -222,6 +222,7 @@ type APIStreamConfig struct {
Mode string
Range string
DsConfigMap map[string]any // stream processor options
+ SchemaOnly bool
}
func (ac *APIConnection) ReadDataflow(endpointName string, sCfg APIStreamConfig) (df *iop.Dataflow, err error) {
diff --git a/core/dbio/connection/connection.go b/core/dbio/connection/connection.go
index 2beeef5f3..91fd1060c 100644
--- a/core/dbio/connection/connection.go
+++ b/core/dbio/connection/connection.go
@@ -173,11 +173,28 @@ func (c *Connection) Info() Info {
}
}
-func (c *Connection) Hash() string {
+// GetType returns the more accurate type, especially for ODBC databases
+// a bit gnarly...
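+// For example (hypothetical value), an ODBC connection whose `conn_template`
+// data key is set to "sqlserver" is reported and treated as SQL Server rather
+// than as generic ODBC.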
+func (c *Connection) GetType() dbio.Type {
+ if t := c.Data["conn_template"]; c.Type == dbio.TypeDbODBC && t != nil {
+ return dbio.Type(cast.ToString(t))
+ }
+ return c.Type
+}
+
+func (c *Connection) Hash(excludeKeys ...string) string {
+ excludeMap := map[string]bool{}
+ for _, key := range excludeKeys {
+ excludeMap[key] = true
+ }
+
parts := []string{c.Name, c.Type.Name()}
keys := lo.Keys(c.Data)
sort.Strings(keys)
for _, key := range keys {
+ if excludeMap[key] {
+ continue
+ }
value := g.F("%s=%s", key, g.Marshal(c.Data[key]))
parts = append(parts, value)
}
@@ -387,8 +404,11 @@ func (c *Connection) AsFileContext(ctx context.Context, options ...AsConnOptions
opt = options[0]
}
+ // exclude the url key so the cache Hash matches for file connections
+ cacheKey := c.Hash("url")
+
// default cache to true
- if cc, ok := connCache.Get(c.Hash()); ok && opt.UseCache {
+ if cc, ok := connCache.Get(cacheKey); ok && opt.UseCache {
if cc.File != nil {
return cc.File, nil
}
@@ -402,7 +422,9 @@ func (c *Connection) AsFileContext(ctx context.Context, options ...AsConnOptions
if err != nil {
return
}
- connCache.Set(c.Hash(), c) // cache
+
+ // set cache
+ connCache.Set(cacheKey, c)
if opt.Expire > 0 {
time.AfterFunc(time.Duration(opt.Expire)*time.Second, func() {
diff --git a/core/dbio/connection/connection_discover.go b/core/dbio/connection/connection_discover.go
index 4df56c29e..335df1f30 100644
--- a/core/dbio/connection/connection_discover.go
+++ b/core/dbio/connection/connection_discover.go
@@ -9,6 +9,7 @@ import (
"github.com/flarco/g"
"github.com/gobwas/glob"
"github.com/samber/lo"
+ "github.com/slingdata-io/sling-cli/core/dbio"
"github.com/slingdata-io/sling-cli/core/dbio/api"
"github.com/slingdata-io/sling-cli/core/dbio/database"
"github.com/slingdata-io/sling-cli/core/dbio/filesys"
@@ -22,7 +23,7 @@ func (c *Connection) Test() (ok bool, err error) {
switch {
case c.Type.IsDb():
- dbConn, err := c.AsDatabase(AsConnOptions{UseCache: false})
+ dbConn, err := c.AsDatabase(AsConnOptions{UseCache: c.GetType() == dbio.TypeDbDuckDb})
if err != nil {
return ok, g.Error(err, "could not initiate %s", c.Name)
}
@@ -213,7 +214,7 @@ func (c *Connection) Discover(opt *DiscoverOptions) (ok bool, nodes filesys.File
switch {
case c.Type.IsDb():
- dbConn, err := c.AsDatabase(AsConnOptions{UseCache: false})
+ dbConn, err := c.AsDatabase(AsConnOptions{UseCache: c.GetType() == dbio.TypeDbDuckDb})
if err != nil {
return ok, nodes, schemata, endpoints, g.Error(err, "could not initiate %s", c.Name)
}
diff --git a/core/dbio/connection/connection_local.go b/core/dbio/connection/connection_local.go
index 93e5e1498..e95769f2b 100644
--- a/core/dbio/connection/connection_local.go
+++ b/core/dbio/connection/connection_local.go
@@ -120,7 +120,7 @@ func GetLocalConns(options ...any) ConnEntries {
for _, conn := range dbtConns {
c := ConnEntry{
Name: strings.ToUpper(conn.Info().Name),
- Description: conn.Type.NameLong(),
+ Description: conn.GetType().NameLong(),
Source: "dbt profiles yaml",
Connection: conn,
}
@@ -140,7 +140,7 @@ func GetLocalConns(options ...any) ConnEntries {
for _, conn := range profileConns {
c := ConnEntry{
Name: strings.ToUpper(conn.Info().Name),
- Description: conn.Type.NameLong(),
+ Description: conn.GetType().NameLong(),
Source: name + " env yaml",
Connection: conn,
}
@@ -164,7 +164,7 @@ func GetLocalConns(options ...any) ConnEntries {
for _, conn := range profileConns {
c := ConnEntry{
Name: strings.ToUpper(conn.Info().Name),
- Description: conn.Type.NameLong(),
+ Description: conn.GetType().NameLong(),
Source: "env-var env yaml",
Connection: conn,
}
@@ -233,7 +233,7 @@ func GetLocalConns(options ...any) ConnEntries {
c := ConnEntry{
Name: conn.Info().Name,
- Description: conn.Type.NameLong(),
+ Description: conn.GetType().NameLong(),
Source: "env variable",
Connection: conn,
}
@@ -386,7 +386,7 @@ func (ec *EnvFileConns) ConnectionEntries() (entries ConnEntries, err error) {
for _, conn := range profileConns {
c := ConnEntry{
Name: strings.ToUpper(conn.Info().Name),
- Description: conn.Type.NameLong(),
+ Description: conn.GetType().NameLong(),
Source: ec.Name,
Connection: conn,
}
diff --git a/core/dbio/database/database.go b/core/dbio/database/database.go
index 88f1bb8a4..848a09be5 100755
--- a/core/dbio/database/database.go
+++ b/core/dbio/database/database.go
@@ -1941,8 +1941,9 @@ func (conn *BaseConn) DropView(viewNames ...string) (err error) {
sql := g.R(conn.template.Core["drop_view"], "view", viewName)
_, err = conn.Self().Exec(sql)
if err != nil {
- errIgnoreWord := conn.template.Variable["error_ignore_drop_view"]
- if !(errIgnoreWord != "" && strings.Contains(cast.ToString(err), errIgnoreWord)) {
+ errMsg := strings.ToLower(err.Error())
+ errIgnoreWord := strings.ToLower(conn.Template().Variable["error_ignore_drop_view"])
+ if !(errIgnoreWord != "" && strings.Contains(errMsg, errIgnoreWord)) {
return g.Error(err, "Error for "+sql)
}
g.Debug("view %s does not exist", viewName)
@@ -2228,7 +2229,7 @@ func (conn *BaseConn) GetAnalysis(analysisName string, values map[string]interfa
// CastColumnForSelect casts to the correct target column type
func (conn *BaseConn) CastColumnForSelect(srcCol iop.Column, tgtCol iop.Column) string {
- return conn.Self().Quote(srcCol.Name)
+ return conn.Template().Quote(srcCol.Name)
}
// CastColumnsForSelect cast the source columns into the target Column types
@@ -2248,7 +2249,7 @@ func (conn *BaseConn) CastColumnsForSelect(srcColumns iop.Columns, tgtColumns io
}
// don't normalize name, leave as is
- selectExpr := conn.Self().Quote(srcCol.Name)
+ selectExpr := conn.Template().Quote(srcCol.Name)
if !strings.EqualFold(srcCol.DbType, tgtCol.DbType) {
g.Debug(
@@ -2272,7 +2273,7 @@ func (conn *BaseConn) CastColumnsForSelect(srcColumns iop.Columns, tgtColumns io
}
// add alias
- qName := conn.Self().Quote(srcCol.Name)
+ qName := conn.Template().Quote(srcCol.Name)
selectExprs = append(selectExprs, g.F("%s as %s", selectExpr, qName))
}
@@ -2281,7 +2282,7 @@ func (conn *BaseConn) CastColumnsForSelect(srcColumns iop.Columns, tgtColumns io
func (conn *BaseConn) castBoolForSelect(srcCol iop.Column, tgtCol iop.Column) (selectStr string) {
- qName := conn.Self().Quote(srcCol.Name)
+ qName := conn.Template().Quote(srcCol.Name)
castFunc := conn.GetType().GetTemplateValue("function.cast_as")
@@ -2396,7 +2397,7 @@ func (conn *BaseConn) GenerateInsertStatement(tableName string, cols iop.Columns
for i, field := range fields {
c++
values[i] = conn.bindVar(i+1, field, n, c)
- qFields[i] = conn.Self().Quote(field)
+ qFields[i] = conn.Template().Quote(field)
}
valuesStr += fmt.Sprintf("(%s),", strings.Join(values, ", "))
}
@@ -2420,8 +2421,8 @@ func (conn *BaseConn) GetNativeType(col iop.Column) (nativeType string, err erro
return col.GetNativeType(conn.Self().GetType(), ct)
}
-// GenerateDDL genrate a DDL based on a dataset
-func (conn *BaseConn) GenerateDDL(table Table, data iop.Dataset, temporary bool) (string, error) {
+// GenerateDDL generates a DDL based on a dataset
+func (conn *BaseConn) GenerateDDL(table Table, data iop.Dataset, temporary bool) (ddl string, err error) {
if !data.Inferred || data.SafeInference {
if len(data.Columns) > 0 && data.Columns[0].Stats.TotalCnt == 0 && data.Columns[0].Type == "" {
@@ -2491,7 +2492,7 @@ func (conn *BaseConn) GenerateDDL(table Table, data iop.Dataset, temporary bool)
g.Trace("%s - %s %s", col.Name, col.Type, g.Marshal(col.Stats))
}
- columnDDL := conn.Self().Quote(col.Name) + " " + nativeType
+ columnDDL := conn.Template().Quote(col.Name) + " " + nativeType
columnsDDL = append(columnsDDL, columnDDL)
}
@@ -2504,12 +2505,50 @@ func (conn *BaseConn) GenerateDDL(table Table, data iop.Dataset, temporary bool)
createTemplate = table.DDL
}
- ddl := g.R(
+ ddl = g.R(
createTemplate,
"table", table.FullName(),
"col_types", strings.Join(columnsDDL, ",\n "),
)
+ partitionBy := ""
+ if keys, ok := table.Keys[iop.PartitionKey]; ok {
+ // allow custom SQL expression for partitioning
+ partitionBy = g.F("partition by %s", strings.Join(keys, ", "))
+ } else if keyCols := data.Columns.GetKeys(iop.PartitionKey); len(keyCols) > 0 {
+ colNames := conn.Template().QuoteNames(keyCols.Names()...)
+ partitionBy = g.F("partition by %s", strings.Join(colNames, ", "))
+ }
+ ddl = strings.ReplaceAll(ddl, "{partition_by}", partitionBy)
+
+ clusterBy := ""
+ if keyCols := data.Columns.GetKeys(iop.ClusterKey); len(keyCols) > 0 {
+ colNames := conn.Template().QuoteNames(keyCols.Names()...)
+ clusterBy = g.F("cluster by %s", strings.Join(colNames, ", "))
+ }
+ ddl = strings.ReplaceAll(ddl, "{cluster_by}", clusterBy)
+
+ distKey := ""
+ if keyCols := data.Columns.GetKeys(iop.DistributionKey); len(keyCols) > 0 {
+ colNames := conn.Template().QuoteNames(keyCols.Names()...)
+ distKey = g.F("distkey(%s)", strings.Join(colNames, ", "))
+ }
+ ddl = strings.ReplaceAll(ddl, "{dist_key}", distKey)
+
+ sortKey := ""
+ if keyCols := data.Columns.GetKeys(iop.SortKey); len(keyCols) > 0 {
+ colNames := conn.Template().QuoteNames(keyCols.Names()...)
+ sortKey = g.F("compound sortkey(%s)", strings.Join(colNames, ", "))
+ }
+ ddl = strings.ReplaceAll(ddl, "{sort_key}", sortKey)
+
+ primaryKeyExpr := ""
+ if keyCols := columns.GetKeys(iop.PrimaryKey); len(keyCols) > 0 {
+ colNames := conn.Template().QuoteNames(keyCols.Names()...)
+ primaryKeyExpr = g.F("%s", strings.Join(colNames, ", "))
+ }
+ ddl = strings.ReplaceAll(ddl, "{primary_key}", primaryKeyExpr)
+
return ddl, nil
}
@@ -2779,7 +2818,10 @@ func (conn *BaseConn) GenerateMergeSQL(srcTable string, tgtTable string, pkField
"pk_fields", mc.Map["pk_fields"],
"set_fields", mc.Map["set_fields"],
"insert_fields", mc.Map["insert_fields"],
+ "src_insert_fields", mc.Map["src_insert_fields"],
"src_fields", mc.Map["src_fields"],
+ "tgt_fields", mc.Map["tgt_fields"],
+ "placeholder_fields", mc.Map["placeholder_fields"],
)
return
@@ -2791,6 +2833,10 @@ type MergeConfig struct {
Map map[string]string
}
+func (mc MergeConfig) TemplatePath() string {
+ return g.F("core.merge_%s", mc.Strategy)
+}
+
// GenerateMergeConfig returns the merge config
func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFields []string) (mc MergeConfig, err error) {
@@ -2840,6 +2886,7 @@ func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFi
setFieldsAll := []string{}
insertFields := []string{}
placeholderFields := []string{}
+ srcInsertFields := []string{}
for _, tgtColName := range tgtCols.Names() {
srcCol := g.PtrVal(srcColumns.GetColumn(tgtColName)) // should be found
tgtCol := tgtColumns.GetColumn(tgtColName)
@@ -2858,7 +2905,16 @@ func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFi
phExpr := strings.ReplaceAll(colExpr, srcColNameQ, g.F("ph.%s", srcColNameQ))
placeholderFields = append(placeholderFields, phExpr)
+ srcExpr := strings.ReplaceAll(colExpr, srcColNameQ, g.F("src.%s", srcColNameQ))
+ srcInsertFields = append(srcInsertFields, srcExpr)
+
setSrcExpr := strings.ReplaceAll(colExpr, srcColNameQ, g.F("src.%s", srcColNameQ))
+
+ // set sync operation to `U` for updates
+ if strings.EqualFold(tgtCol.Name, env.ReservedFields.SyncedOp) {
+ setSrcExpr = "'U'"
+ }
+
setField := g.F("%s = %s", tgtColNameQ, setSrcExpr)
setFieldsAll = append(setFieldsAll, setField)
if _, ok := pkFieldMap[tgtCol.Name]; !ok {
@@ -2884,6 +2940,7 @@ func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFi
"src_fields": strings.Join(srcFields, ", "),
"tgt_fields": strings.Join(tgtFields, ", "),
"insert_fields": strings.Join(insertFields, ", "),
+ "src_insert_fields": strings.Join(srcInsertFields, ", "),
"pk_fields": strings.Join(pkFields, ", "),
"src_pk_fields": strings.Join(srcPkFields, ", "),
"tgt_pk_fields": strings.Join(tgtPkFields, ", "),
@@ -2900,11 +2957,10 @@ func (conn *BaseConn) GenerateMergeConfig(srcTable string, tgtTable string, pkFi
return mc, g.Error("invalid merge strategy %s", mc.Strategy)
}
- key := g.F("core.merge_%s", mc.Strategy)
- mc.Template = conn.GetTemplateValue(key)
+ mc.Template = conn.GetTemplateValue(mc.TemplatePath())
if mc.Template == "" {
- return mc, g.Error("merge strategy `%s` not supported for %s (did not find SQL template key `%s`)", mc.Strategy, conn.GetType(), key)
+ return mc, g.Error("merge strategy `%s` not supported for %s (did not find SQL template key `%s`)", mc.Strategy, conn.GetType(), mc.TemplatePath())
}
return
@@ -3083,23 +3139,23 @@ func GetOptimizeTableStatements(conn Connection, table *Table, newColumns iop.Co
return
}
for i, key := range pKey {
- pKey[i] = conn.Self().Quote(key)
+ pKey[i] = conn.Template().Quote(key)
}
// add new column with new type
ddlParts = append(ddlParts, g.R(
conn.GetTemplateValue("core.add_column"),
"table", table.FullName(),
- "column", conn.Self().Quote(colNameTemp),
+ "column", conn.Template().Quote(colNameTemp),
"type", col.DbType,
))
// update set to cast old values
oldColCasted := conn.Self().CastColumnForSelect(oldCols[index], col)
- if oldColCasted == conn.Self().Quote(col.Name) {
+ if oldColCasted == conn.Template().Quote(col.Name) {
oldColCasted = g.R(
conn.GetTemplateValue("function.cast_as"),
- "field", conn.Self().Quote(col.Name),
+ "field", conn.Template().Quote(col.Name),
"type", col.DbType,
)
}
@@ -3116,7 +3172,7 @@ func GetOptimizeTableStatements(conn Connection, table *Table, newColumns iop.Co
"table", table.FullName(),
"set_fields", g.R(
"{temp_column} = {old_column_casted}",
- "temp_column", conn.Self().Quote(colNameTemp),
+ "temp_column", conn.Template().Quote(colNameTemp),
"old_column_casted", oldColCasted,
),
"fields", strings.Join(fields, ", "),
@@ -3128,13 +3184,13 @@ func GetOptimizeTableStatements(conn Connection, table *Table, newColumns iop.Co
ddlParts = append(ddlParts, g.R(
conn.GetTemplateValue("core.drop_column"),
"table", table.FullName(),
- "column", conn.Self().Quote(col.Name),
+ "column", conn.Template().Quote(col.Name),
))
// rename new column to old name
tableName := table.FullName()
- oldColName := conn.Self().Quote(colNameTemp)
- newColName := conn.Self().Quote(col.Name)
+ oldColName := conn.Template().Quote(colNameTemp)
+ newColName := conn.Template().Quote(col.Name)
if conn.Self().GetType().IsSQLServer() {
tableName = conn.Unquote(table.FullName())
@@ -3259,8 +3315,8 @@ func (conn *BaseConn) CompareChecksums(tableName string, columns iop.Columns) (e
expr = "0"
}
colName := fieldsMap[strings.ToLower(col.Name)]
- expr = g.R(expr, "field", conn.Self().Quote(cast.ToString(colName)))
- exprs = append(exprs, g.F("sum(%s) as %s", expr, conn.Self().Quote(cast.ToString(colName))))
+ expr = g.R(expr, "field", conn.Template().Quote(cast.ToString(colName)))
+ exprs = append(exprs, g.F("sum(%s) as %s", expr, conn.Template().Quote(cast.ToString(colName))))
exprMap[strings.ToLower(col.Name)] = g.F("sum(%s)", expr)
}
@@ -3464,7 +3520,7 @@ func (conn *BaseConn) AddMissingColumns(table Table, newCols iop.Columns) (ok bo
sql := g.R(
conn.Template().Core["add_column"],
"table", table.FullName(),
- "column", conn.Self().Quote(col.Name),
+ "column", conn.Template().Quote(col.Name),
"type", nativeType,
)
diff --git a/core/dbio/database/database_fabric.go b/core/dbio/database/database_fabric.go
index f46791c9e..cd75e0113 100644
--- a/core/dbio/database/database_fabric.go
+++ b/core/dbio/database/database_fabric.go
@@ -113,8 +113,8 @@ func (conn *MsFabricConn) makeABFSClient() (fs filesys.FileSysClient, err error)
return fs, nil
}
-// getOneLakePath generates a OneLake path for temporary staging
-func (conn *MsFabricConn) getOneLakePath(tableFName string) string {
+// getStagingPath generates a staging path for file uploads (uses DFS endpoint)
+func (conn *MsFabricConn) getStagingPath(tableFName string) string {
endpoint := conn.GetProp("abfs_endpoint")
filesystem := conn.GetProp("abfs_filesystem")
parent := conn.GetProp("abfs_parent")
@@ -131,6 +131,20 @@ func (conn *MsFabricConn) getOneLakePath(tableFName string) string {
return fmt.Sprintf("%s/%s/%s/%s", basePath, tempCloudStorageFolder, cleanTableName, cast.ToString(g.Now()))
}
+// getCopyIntoPath converts a staging path to the appropriate endpoint for COPY INTO
+// Per Microsoft docs, the .blob endpoint yields best performance for COPY INTO
+// See: https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql
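+// Illustrative example (placeholder values, not actual configuration): with
+// copy_into_endpoint set, a staging path such as
+//   https://<dfs-endpoint>/<filesystem>/<parent>/... is rewritten to
+//   https://<blob-endpoint>/<filesystem>/<parent>/...
+// Only the endpoint host is swapped; the rest of the path is unchanged.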
+func (conn *MsFabricConn) getCopyIntoPath(stagingPath string) string {
+ // Check if user explicitly set a copy_into_endpoint override
+ if copyIntoEndpoint := conn.GetProp("copy_into_endpoint"); copyIntoEndpoint != "" {
+ // Replace the endpoint in the path with the user-specified one
+ endpoint := conn.GetProp("abfs_endpoint")
+ return strings.Replace(stagingPath, endpoint, copyIntoEndpoint, 1)
+ }
+
+ return stagingPath
+}
+
// CopyFromOneLake uses the COPY INTO command to load data from OneLake
func (conn *MsFabricConn) CopyFromOneLake(tableFName, oneLakePath string, columns iop.Columns, fileFormat dbio.FileType) (err error) {
// Prepare target columns
@@ -202,8 +216,8 @@ func (conn *MsFabricConn) BulkImportFlow(tableFName string, df *iop.Dataflow) (c
settingMppBulkImportFlow(conn, iop.GzipCompressorType)
- // Get OneLake path
- oneLakePath := conn.getOneLakePath(tableFName)
+ // Get staging path (for ABFS uploads - uses DFS endpoint)
+ stagingPath := conn.getStagingPath(tableFName)
// Create ABFS client
abfsFs, err := conn.makeABFSClient()
@@ -212,15 +226,15 @@ func (conn *MsFabricConn) BulkImportFlow(tableFName string, df *iop.Dataflow) (c
}
// Delete any existing files at path
- err = filesys.Delete(abfsFs, oneLakePath)
+ err = filesys.Delete(abfsFs, stagingPath)
if err != nil {
- return df.Count(), g.Error(err, "Could not delete existing files: "+oneLakePath)
+ return df.Count(), g.Error(err, "Could not delete existing files: "+stagingPath)
}
// Set up cleanup
df.Defer(func() {
if !cast.ToBool(os.Getenv("SLING_KEEP_TEMP")) {
- filesys.Delete(abfsFs, oneLakePath)
+ filesys.Delete(abfsFs, stagingPath)
}
})
@@ -241,21 +255,24 @@ func (conn *MsFabricConn) BulkImportFlow(tableFName string, df *iop.Dataflow) (c
// Fabric COPY INTO treats empty fields as NULL (no NULL_IF parameter available)
abfsFs.SetProp("null_as", ``)
abfsFs.SetProp("compression", `gzip`)
- bw, err = filesys.WriteDataflow(abfsFs, df, oneLakePath)
+ bw, err = filesys.WriteDataflow(abfsFs, df, stagingPath)
case dbio.FileTypeParquet:
if env.UseDuckDbCompute() {
- bw, err = filesys.WriteDataflowViaDuckDB(abfsFs, df, oneLakePath)
+ bw, err = filesys.WriteDataflowViaDuckDB(abfsFs, df, stagingPath)
} else {
- bw, err = filesys.WriteDataflow(abfsFs, df, oneLakePath)
+ bw, err = filesys.WriteDataflow(abfsFs, df, stagingPath)
}
}
if err != nil {
return df.Count(), g.Error(err, "Error writing to OneLake")
}
- g.Debug("total written: %s to %s", humanize.Bytes(cast.ToUint64(bw)), oneLakePath)
+ g.Debug("total written: %s to %s", humanize.Bytes(cast.ToUint64(bw)), stagingPath)
+
+ // Get COPY INTO path (swaps in the copy_into_endpoint override, e.g. the Blob endpoint, when configured)
+ copyIntoPath := conn.getCopyIntoPath(stagingPath)
// Execute COPY INTO
- err = conn.CopyFromOneLake(tableFName, oneLakePath, df.Columns, fileFormat)
+ err = conn.CopyFromOneLake(tableFName, copyIntoPath, df.Columns, fileFormat)
if err != nil {
return df.Count(), g.Error(err, "Error copying into Fabric from OneLake")
}
diff --git a/core/dbio/database/database_mysql.go b/core/dbio/database/database_mysql.go
index 55327e24a..b385747ac 100755
--- a/core/dbio/database/database_mysql.go
+++ b/core/dbio/database/database_mysql.go
@@ -13,6 +13,7 @@ import (
"cloud.google.com/go/cloudsqlconn"
cloudsqlmysql "cloud.google.com/go/cloudsqlconn/mysql/mysql"
"github.com/go-sql-driver/mysql"
+ "github.com/google/uuid"
"github.com/jmoiron/sqlx"
"github.com/slingdata-io/sling-cli/core/dbio"
"github.com/spf13/cast"
@@ -25,9 +26,10 @@ import (
// MySQLConn is a MySQL or MariaDB connection
type MySQLConn struct {
BaseConn
- URL string
- isCloudSQL bool
- cloudSQLCleanup func()
+ URL string
+ isCloudSQL bool
+ localInfileEnabled bool
+ cloudSQLCleanup func()
}
// Init initiates the object
@@ -45,9 +47,11 @@ func (conn *MySQLConn) Init() error {
// the LoadDataOutFile needs special circumstances
conn.BaseConn.SetProp("allow_bulk_export", "false")
- // InsertBatchStream is faster than LoadDataInFile
- if conn.BaseConn.GetProp("allow_bulk_import") == "" {
- conn.BaseConn.SetProp("allow_bulk_import", "false")
+ // Enable allowAllFiles for LOAD DATA LOCAL INFILE via RegisterReaderHandler
+ // This is required for the go-sql-driver to use the Reader:: syntax
+ if conn.BaseConn.GetProp("allow_all_files") == "" &&
+ conn.BaseConn.GetProp("allowAllFiles") == "" {
+ conn.BaseConn.SetProp("allow_all_files", "true")
}
instance := Connection(conn)
@@ -56,6 +60,24 @@ func (conn *MySQLConn) Init() error {
return conn.BaseConn.Init()
}
+// checkLocalInfileEnabled checks if the MySQL server allows LOCAL INFILE
+// and caches the result in the struct field for subsequent calls
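+// If the server reports local_infile as disabled, it can usually be enabled by
+// an administrator with `SET GLOBAL local_infile = 1;`; otherwise BulkImportStream
+// falls back to InsertBatchStream.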
+func (conn *MySQLConn) checkLocalInfileEnabled() {
+ // Query the server using raw DB to avoid datastream context issues
+ var varName, varValue string
+ err := conn.db.QueryRow("SHOW GLOBAL VARIABLES LIKE 'local_infile'").Scan(&varName, &varValue)
+ if err != nil {
+ g.Debug("could not check local_infile variable: %v", err)
+ return
+ }
+
+ conn.localInfileEnabled = strings.ToLower(varValue) == "on" || varValue == "1"
+
+ if conn.localInfileEnabled {
+ g.Debug("local_infile is enabled on MySQL server")
+ }
+}
+
// GetURL returns the processed URL
func (conn *MySQLConn) GetURL(newURL ...string) string {
connURL := conn.BaseConn.URL
@@ -142,7 +164,15 @@ func (conn *MySQLConn) Connect(timeOut ...int) (err error) {
mysql.RegisterTLSConfig(conn.GetProp("sling_conn_id"), tlsConfig)
}
- return conn.BaseConn.Connect(timeOut...)
+ err = conn.BaseConn.Connect(timeOut...)
+ if err != nil {
+ return err
+ }
+
+ // Check and cache local_infile setting after connect
+ conn.checkLocalInfileEnabled()
+
+ return nil
}
// connectCloudSQL establishes a connection to Google Cloud SQL MySQL using IAM authentication
@@ -285,6 +315,9 @@ func (conn *MySQLConn) connectCloudSQL(timeOut ...int) error {
conn.postConnect()
+ // Check and cache local_infile setting after connect
+ conn.checkLocalInfileEnabled()
+
return nil
}
@@ -396,21 +429,13 @@ func (conn *MySQLConn) BulkExportStream(table Table) (ds *iop.Datastream, err er
// BulkImportStream bulk import stream
func (conn *MySQLConn) BulkImportStream(tableFName string, ds *iop.Datastream) (count uint64, err error) {
-
+ // Check ADBC first
if conn.UseADBC() {
conn.Commit()
return conn.adbc.BulkImportStream(tableFName, ds)
}
- _, err = exec.LookPath("mysql")
- if err != nil {
- g.Trace("mysql not found in path. Using cursor...")
- return conn.BaseConn.InsertBatchStream(tableFName, ds)
- } else if conn.GetProp("allow_bulk_import") != "true" {
- return conn.BaseConn.InsertBatchStream(tableFName, ds)
- }
-
- // needs to get columns to shape stream
+ // Get columns to shape stream
columns, err := conn.GetColumns(tableFName)
if err != nil {
err = g.Error(err, "could not get column list")
@@ -423,7 +448,29 @@ func (conn *MySQLConn) BulkImportStream(tableFName string, ds *iop.Datastream) (
return
}
- return conn.LoadDataInFile(tableFName, ds)
+ // Check server capability (cached from connect)
+ // Note: LoadDataLocal keeps the connection busy, so we can't use it when
+ // adjust_column_type is enabled (it requires queries during load)
+ useBulk := conn.GetProp("use_bulk") != "false"
+ adjustColumnType := cast.ToBool(conn.GetProp("adjust_column_type"))
+
+ if conn.localInfileEnabled && useBulk && !adjustColumnType {
+ // Use native Go driver - no external binary needed
+ return conn.LoadDataLocal(tableFName, ds)
+ }
+
+ // Log why we're not using LoadDataLocal
+ if !conn.localInfileEnabled {
+ g.Debug("local_infile is disabled on server, using fallback")
+ } else if !useBulk {
+ g.Debug("use_bulk is false, using InsertBatchStream")
+ } else if adjustColumnType {
+ g.Debug("adjust_column_type enabled, using InsertBatchStream to allow concurrent queries")
+ }
+
+ // Final fallback: InsertBatchStream
+ g.Trace("using InsertBatchStream as fallback")
+ return conn.BaseConn.InsertBatchStream(tableFName, ds)
}
// LoadDataOutFile Bulk Export
@@ -481,51 +528,35 @@ func (conn *MySQLConn) LoadDataOutFile(ctx *g.Context, sql string) (stdOutReader
return stdOutReader, err
}
-// LoadDataInFile Bulk Import
-func (conn *MySQLConn) LoadDataInFile(tableFName string, ds *iop.Datastream) (count uint64, err error) {
- var stderr bytes.Buffer
-
- connURL := conn.URL
- if su := conn.GetProp("ssh_url"); su != "" {
- connURL = su // use ssh url if specified
- }
-
- url, err := dburl.Parse(connURL)
- if err != nil {
- err = g.Error(err, "Error dburl.Parse(conn.URL)")
- return
- }
-
- password, _ := url.User.Password()
- host := strings.ReplaceAll(url.Host, ":"+url.Port(), "")
- database := strings.ReplaceAll(url.Path, "/", "")
-
- loadQuery := g.R(`LOAD DATA LOCAL INFILE '/dev/stdin' INTO TABLE {table} FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"' ESCAPED BY '"' IGNORE 1 LINES;`, "table", tableFName)
- proc := exec.Command(
- "mysql",
- "--local-infile=1",
- "-h", host,
- "-P", url.Port(),
- "-u", url.User.Username(),
- "-p"+password,
- database,
- "-e", loadQuery,
+// LoadDataLocal uses go-sql-driver/mysql's RegisterReaderHandler for LOAD DATA LOCAL INFILE
+// This allows streaming data directly to MySQL without requiring the external mysql binary
+func (conn *MySQLConn) LoadDataLocal(tableFName string, ds *iop.Datastream) (count uint64, err error) {
+ // Generate unique handler name to avoid conflicts in concurrent operations
+ handlerName := "sling_" + uuid.New().String()
+
+ // Register the reader handler for streaming CSV with header
+ // The MySQL LOAD DATA template uses IGNORE 1 LINES, so we need the header
+ // BoolAsInt is required because MySQL's LOAD DATA doesn't convert true/false to 1/0
+ cfg := iop.LoaderStreamConfig(true)
+ cfg.BoolAsInt = true
+ mysql.RegisterReaderHandler(handlerName, func() io.Reader {
+ return ds.NewCsvReader(cfg)
+ })
+ defer mysql.DeregisterReaderHandler(handlerName)
+
+ // Get the template and build the query
+ tmpl := conn.GetTemplateValue("core.load_data_local_reader")
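+ // The rendered statement is expected to look roughly like (exact form comes
+ // from the dialect template; shown here only as an illustration):
+ //   LOAD DATA LOCAL INFILE 'Reader::sling_<uuid>' INTO TABLE <table>
+ //   FIELDS TERMINATED BY ',' ... IGNORE 1 LINES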
+ loadQuery := g.R(tmpl,
+ "handler_name", handlerName,
+ "table", tableFName,
)
- proc.Stderr = &stderr
- proc.Stdin = ds.NewCsvReader(iop.DefaultStreamConfig())
+ g.Trace("LoadDataLocal query: %s", loadQuery)
- err = proc.Run()
+ // Execute the LOAD DATA statement
+ _, err = conn.Exec(loadQuery)
if err != nil {
- cmdStr := strings.ReplaceAll(strings.Join(proc.Args, " "), password, "****")
- err = g.Error(
- err,
- fmt.Sprintf(
- "MySQL Import Command -> %s\nMySQL Import Error -> %s",
- cmdStr, stderr.String(),
- ),
- )
- return ds.Count, err
+ return 0, g.Error(err, "LoadDataLocal failed for table %s", tableFName)
}
return ds.Count, nil
diff --git a/core/dbio/database/database_oracle.go b/core/dbio/database/database_oracle.go
index a5085ad16..8c383b0bd 100755
--- a/core/dbio/database/database_oracle.go
+++ b/core/dbio/database/database_oracle.go
@@ -599,6 +599,11 @@ func (conn *OracleConn) CastColumnForSelect(srcCol iop.Column, tgtCol iop.Column
tgtCol.DbPrecision = lo.Ternary(tgtCol.DbPrecision == 0, 4000, tgtCol.DbPrecision)
switch {
+ case srcDbType == "xmltype":
+ // XMLTYPE columns cause the go-ora driver to hang when reading directly.
+ // Cast to CLOB to extract XML content as text.
+ // See: https://github.com/sijms/go-ora/issues/562
+ selectStr = g.F("(%s).getclobval()", qName)
case srcDbType != "clob" && tgtDbType == "clob":
selectStr = g.F("to_clob(%s)", qName)
case srcDbType == "clob" && tgtCol.IsString() && tgtDbType != "clob":
diff --git a/core/dbio/database/database_prometheus.go b/core/dbio/database/database_prometheus.go
index 08a08d460..61f4f8c88 100644
--- a/core/dbio/database/database_prometheus.go
+++ b/core/dbio/database/database_prometheus.go
@@ -767,6 +767,9 @@ func (conn *PrometheusConn) StreamRowsChunked(queryContext *g.Context, query str
}
ds.SetConfig(props)
+ // Start the bytes-written processor to prevent blocking on bwRows channel
+ ds.StartBwProcessor()
+
// Process in chunks
go func() {
defer ds.Close()
diff --git a/core/dbio/database/schemata.go b/core/dbio/database/schemata.go
index 538293c9f..4a5670fa7 100644
--- a/core/dbio/database/schemata.go
+++ b/core/dbio/database/schemata.go
@@ -5,6 +5,7 @@ import (
"encoding/json"
"runtime/debug"
"strings"
+ "sync"
"unicode"
"github.com/flarco/g"
@@ -274,6 +275,15 @@ func (t *Table) Select(Opts ...SelectOptions) (sql string) {
if f == "*" || strings.Contains(f, "(") {
return f
}
+
+ // Parse for "field as alias" syntax
+ original, alias, _, _ := iop.ParseSelectExpr(f)
+ if alias != "" {
+ // Generate: "original_col" AS "alias_name"
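+ // e.g. `id as user_id` becomes `"id" as "user_id"` (quoted with the dialect's quote char)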
+ origQuoted := q + strings.ReplaceAll(original, q, "") + q
+ aliasQuoted := q + strings.ReplaceAll(alias, q, "") + q
+ return origQuoted + " as " + aliasQuoted
+ }
return q + strings.ReplaceAll(f, q, "") + q
})
@@ -284,9 +294,51 @@ func (t *Table) Select(Opts ...SelectOptions) (sql string) {
fieldsStr := lo.Ternary(len(fields) > 0, strings.Join(fields, ", "), "*")
- // auto convert to json if needed
+ // auto convert complex types as needed
{
switch t.Dialect {
+ case dbio.TypeDbOracle:
+ // XMLTYPE columns cause the go-ora driver to hang when reading directly.
+ // Cast to CLOB to extract XML content as text.
+ // See: https://github.com/sijms/go-ora/issues/562
+ var xmlTypeCols iop.Columns
+ for _, col := range t.Columns {
+ if strings.EqualFold(col.DbType, "xmltype") {
+ xmlTypeCols = append(xmlTypeCols, col)
+ }
+ }
+
+ if len(xmlTypeCols) > 0 {
+ if len(fields) == 0 || (len(fields) == 1 && fields[0] == "*") {
+ // Need to explicitly list all columns with XMLTYPE casted
+ fieldExprs := []string{}
+ for _, col := range t.Columns {
+ colQ := t.Dialect.Quote(col.Name)
+ if xmlTypeCols.GetColumn(col.Name) != nil {
+ // Cast XMLTYPE to CLOB using getclobval()
+ expr := g.F("(%s).getclobval() as %s", colQ, colQ)
+ fieldExprs = append(fieldExprs, expr)
+ } else {
+ fieldExprs = append(fieldExprs, colQ)
+ }
+ }
+ fieldsStr = strings.Join(fieldExprs, ", ")
+ } else {
+ fieldExprs := []string{}
+ for _, field := range opts.Fields {
+ field = strings.TrimSpace(field)
+ colQ := t.Dialect.Quote(field)
+ if xmlTypeCols.GetColumn(field) != nil {
+ expr := g.F("(%s).getclobval() as %s", colQ, colQ)
+ fieldExprs = append(fieldExprs, expr)
+ } else {
+ fieldExprs = append(fieldExprs, colQ)
+ }
+ }
+ fieldsStr = strings.Join(fieldExprs, ", ")
+ }
+ }
+
case dbio.TypeDbBigQuery:
var toJsonCols iop.Columns
@@ -963,6 +1015,7 @@ func GetTablesSchemata(conn Connection, tableNames ...string) (schemata Schemata
// GetSchemataAll obtains the schemata for all databases detected
func GetSchemataAll(conn Connection) (schemata Schemata, err error) {
schemata = Schemata{Databases: map[string]Database{}}
+ var mu sync.Mutex
connInfo := conn.Info()
@@ -997,13 +1050,14 @@ func GetSchemataAll(conn Connection) (schemata Schemata, err error) {
}
// pull down schemata
- newSchemata, err := newConn.GetSchemata("", "")
+ newSchemata, err := newConn.GetSchemata(SchemataLevelColumn, "")
if err != nil {
g.Warn("could not obtain schemata for database: %s. %s", dbName, err)
return
}
- // merge all schematas
+ // merge all schemata under mutex protection
+ mu.Lock()
for name, database := range newSchemata.Databases {
g.Debug(
" collected %d columns, in %d tables/views from database %s",
@@ -1013,6 +1067,7 @@ func GetSchemataAll(conn Connection) (schemata Schemata, err error) {
)
schemata.Databases[name] = database
}
+ mu.Unlock()
}
// loop an connect to each
@@ -1032,10 +1087,44 @@ func (t *Table) AddPrimaryKeyToDDL(ddl string, columns iop.Columns) (string, err
if pkCols := columns.GetKeys(iop.PrimaryKey); len(pkCols) > 0 {
ddl = strings.TrimSpace(ddl)
- // add pk right before the last parenthesis
- lastParen := strings.LastIndex(ddl, ")")
- if lastParen == -1 {
- return ddl, g.Error("could not find last parenthesis")
+ // Find the closing parenthesis of the column definitions
+ // We need to find the first balanced closing paren that matches the opening
+ // paren of the CREATE TABLE column list, not just the last paren in the DDL
+ // This handles cases like: CREATE TABLE t (col1 int) WITH (data_compression=page)
+
+ // Find "CREATE TABLE" pattern to locate start of statement
+ createTableIdx := strings.Index(strings.ToUpper(ddl), "CREATE TABLE")
+ if createTableIdx == -1 {
+ return ddl, g.Error("could not find CREATE TABLE in DDL")
+ }
+
+ // Find the opening paren after CREATE TABLE (this is the column list)
+ openParen := strings.Index(ddl[createTableIdx:], "(")
+ if openParen == -1 {
+ return ddl, g.Error("could not find opening parenthesis for column list")
+ }
+ openParen += createTableIdx
+
+ // Find the matching closing paren by counting balanced parens
+ depth := 1
+ closeParen := -1
+ for i := openParen + 1; i < len(ddl); i++ {
+ switch ddl[i] {
+ case '(':
+ depth++
+ case ')':
+ depth--
+ if depth == 0 {
+ closeParen = i
+ }
+ }
+ if closeParen != -1 {
+ break
+ }
+ }
+
+ if closeParen == -1 {
+ return ddl, g.Error("could not find closing parenthesis for column list")
}
prefix := "primary key"
@@ -1045,7 +1134,7 @@ func (t *Table) AddPrimaryKeyToDDL(ddl string, columns iop.Columns) (string, err
}
quotedNames := t.Dialect.QuoteNames(pkCols.Names()...)
- ddl = ddl[:lastParen] + g.F(", %s (%s)", prefix, strings.Join(quotedNames, ", ")) + ddl[lastParen:]
+ ddl = ddl[:closeParen] + g.F(", %s (%s)", prefix, strings.Join(quotedNames, ", ")) + ddl[closeParen:]
}
return ddl, nil
diff --git a/core/dbio/database/schemata_test.go b/core/dbio/database/schemata_test.go
index fef055aaf..b4a24fca5 100644
--- a/core/dbio/database/schemata_test.go
+++ b/core/dbio/database/schemata_test.go
@@ -6,6 +6,9 @@ import (
"github.com/flarco/g"
"github.com/slingdata-io/sling-cli/core/dbio"
+ "github.com/slingdata-io/sling-cli/core/dbio/iop"
+ "github.com/slingdata-io/sling-cli/core/env"
+ "github.com/spf13/cast"
"github.com/stretchr/testify/assert"
)
@@ -297,6 +300,127 @@ func TestParseSQLMultiStatements(t *testing.T) {
}
}
+func TestGetSchemataAll(t *testing.T) {
+ ef := env.LoadSlingEnvFile()
+
+ url := cast.ToStringMap(ef.Connections["POSTGRES"])["url"]
+ if url == nil {
+ t.Skip("POSTGRES env var not set")
+ }
+
+ conn, err := NewConn(cast.ToString(url))
+ if !assert.NoError(t, err) {
+ return
+ }
+ defer conn.Close()
+
+ schemata, err := GetSchemataAll(conn)
+ if !assert.NoError(t, err) {
+ return
+ }
+
+ // Count all tables from all databases
+ tableCount := 0
+ for _, db := range schemata.Databases {
+ tableCount += len(db.Tables())
+ }
+
+ assert.Greater(t, tableCount, 0, "expected at least one table across all databases")
+}
+
+func TestAddPrimaryKeyToDDL(t *testing.T) {
+ // Verifies that primary key is placed correctly in column definitions
+ // when table_ddl contains WITH clause or other suffixes after ({col_types})
+
+ type testCase struct {
+ name string
+ dialect dbio.Type
+ ddl string
+ pkCols []string
+ expected string
+ }
+
+ cases := []testCase{
+ {
+ name: "simple DDL without WITH clause",
+ dialect: dbio.TypeDbSQLServer,
+ ddl: `create table "dbo"."test" ("col1" nvarchar(10), "col2" nvarchar(6))`,
+ pkCols: []string{"col1", "col2"},
+ expected: `create table "dbo"."test" ("col1" nvarchar(10), "col2" nvarchar(6), primary key ("col1", "col2"))`,
+ },
+ {
+ name: "DDL with WITH clause (GitHub issue #694)",
+ dialect: dbio.TypeDbSQLServer,
+ ddl: `create table "dbo"."test" ("col1" nvarchar(10), "col2" nvarchar(6)) WITH (data_compression=page)`,
+ pkCols: []string{"col1", "col2"},
+ expected: `create table "dbo"."test" ("col1" nvarchar(10), "col2" nvarchar(6), primary key ("col1", "col2")) WITH (data_compression=page)`,
+ },
+ {
+ name: "DDL with multiple WITH options",
+ dialect: dbio.TypeDbSQLServer,
+ ddl: `create table "dbo"."test" ("col1" int, "col2" int) WITH (PAD_INDEX = ON, FILLFACTOR = 90)`,
+ pkCols: []string{"col1"},
+ expected: `create table "dbo"."test" ("col1" int, "col2" int, primary key ("col1")) WITH (PAD_INDEX = ON, FILLFACTOR = 90)`,
+ },
+ {
+ name: "DDL with nested parentheses in column type",
+ dialect: dbio.TypeDbSQLServer,
+ ddl: `create table "dbo"."test" ("col1" decimal(10,2), "col2" varchar(100)) WITH (LOCK_ESCALATION = TABLE)`,
+ pkCols: []string{"col1"},
+ expected: `create table "dbo"."test" ("col1" decimal(10,2), "col2" varchar(100), primary key ("col1")) WITH (LOCK_ESCALATION = TABLE)`,
+ },
+ {
+ name: "Postgres DDL without suffix",
+ dialect: dbio.TypeDbPostgres,
+ ddl: `create table if not exists "public"."test" ("col1" integer, "col2" text)`,
+ pkCols: []string{"col1"},
+ expected: `create table if not exists "public"."test" ("col1" integer, "col2" text, primary key ("col1"))`,
+ },
+ {
+ name: "Postgres DDL with PARTITION BY clause",
+ dialect: dbio.TypeDbPostgres,
+ ddl: `create table if not exists "public"."test" ("col1" integer, "col2" date) PARTITION BY RANGE (col2)`,
+ pkCols: []string{"col1"},
+ expected: `create table if not exists "public"."test" ("col1" integer, "col2" date, primary key ("col1")) PARTITION BY RANGE (col2)`,
+ },
+ {
+ name: "Oracle DDL with named constraint",
+ dialect: dbio.TypeDbOracle,
+ ddl: `create table "SCHEMA"."TEST" ("COL1" NUMBER, "COL2" VARCHAR2(100))`,
+ pkCols: []string{"COL1"},
+ expected: `create table "SCHEMA"."TEST" ("COL1" NUMBER, "COL2" VARCHAR2(100), constraint test_pkey primary key ("COL1"))`,
+ },
+ {
+ name: "no primary key columns",
+ dialect: dbio.TypeDbSQLServer,
+ ddl: `create table "dbo"."test" ("col1" int, "col2" int) WITH (FILLFACTOR = 90)`,
+ pkCols: []string{},
+ expected: `create table "dbo"."test" ("col1" int, "col2" int) WITH (FILLFACTOR = 90)`,
+ },
+ }
+
+ for _, c := range cases {
+ t.Run(c.name, func(t *testing.T) {
+ table := &Table{
+ Name: "test",
+ Dialect: c.dialect,
+ }
+
+ // Create columns with primary key flag
+ var cols iop.Columns
+ for _, name := range c.pkCols {
+ col := iop.Column{Name: name}
+ col.SetMetadata(iop.PrimaryKey.MetadataKey(), "true")
+ cols = append(cols, col)
+ }
+
+ result, err := table.AddPrimaryKeyToDDL(c.ddl, cols)
+ assert.NoError(t, err)
+ assert.Equal(t, c.expected, result)
+ })
+ }
+}
+
func TestTrimSQLComments(t *testing.T) {
type testCase struct {
name string
diff --git a/core/dbio/dbio_types.go b/core/dbio/dbio_types.go
index 95d47477c..38d793f97 100644
--- a/core/dbio/dbio_types.go
+++ b/core/dbio/dbio_types.go
@@ -10,6 +10,7 @@ import (
"github.com/flarco/g"
"github.com/slingdata-io/sling-cli/core/env"
+ "github.com/spf13/cast"
"gopkg.in/yaml.v2"
)
@@ -200,6 +201,10 @@ func (t Type) DefPort() int {
// DBNameUpperCase returns true if upper case is the default
func (t Type) DBNameUpperCase() bool {
+ tp, _ := t.Template()
+ if val := tp.Value("variable.column_upper"); val != "" {
+ return cast.ToBool(val)
+ }
return g.In(t, TypeDbOracle, TypeDbSnowflake, TypeDbExasol)
}
@@ -283,6 +288,7 @@ func (t Type) NameLong() string {
TypeDbDatabricks: "DB - Databricks",
TypeDbExasol: "DB - Exasol",
TypeDbD1: "DB - D1",
+ Type("db2"): "DB - DB2",
TypeDbSQLite: "DB - SQLite",
TypeDbDuckDb: "DB - DuckDB",
TypeDbDuckLake: "DB - DuckLake",
@@ -333,6 +339,7 @@ func (t Type) Name() string {
TypeDbDatabricks: "Databricks",
TypeDbExasol: "Exasol",
TypeDbD1: "D1",
+ Type("db2"): "DB2",
TypeDbSQLite: "SQLite",
TypeDbDuckDb: "DuckDB",
TypeDbDuckLake: "DuckLake",
@@ -539,7 +546,9 @@ func (tp Template) Quote(field string) string {
// always normalize if case is uniform. Why would you quote and not normalize?
if !HasVariedCase(field) && Normalize {
- if tp.Type.DBNameUpperCase() {
+ if val := tp.Value("variable.column_upper"); val != "" && cast.ToBool(val) {
+ field = strings.ToUpper(field)
+ } else if tp.Type.DBNameUpperCase() {
field = strings.ToUpper(field)
} else {
field = strings.ToLower(field)
@@ -549,6 +558,7 @@ func (tp Template) Quote(field string) string {
field = tp.Type.Unquote(field)
return q + field + q
}
+
func (tp Template) QuoteNames(names ...string) (newNames []string) {
newNames = make([]string, len(names))
for i := range names {
diff --git a/core/dbio/filesys/fs.go b/core/dbio/filesys/fs.go
index d9f33807b..4dc649224 100755
--- a/core/dbio/filesys/fs.go
+++ b/core/dbio/filesys/fs.go
@@ -275,6 +275,21 @@ func NormalizeURI(fs FileSysClient, uri string) string {
return fs.Prefix("/") + path
}
return fs.Prefix("/") + strings.TrimLeft(strings.TrimPrefix(uri, fs.Prefix()), "/")
+ case dbio.TypeFileS3, dbio.TypeFileGoogle:
+ // For S3/GCS, if URI already has the scheme prefix (e.g., s3://bucket/path),
+ // return it as-is to allow accessing different buckets with the same credentials.
+ // This enables multi-bucket access with a single connection.
+ scheme := fs.FsType().String() + "://"
+ if strings.HasPrefix(uri, scheme) {
+ // Ensure there's a trailing slash after the bucket name if no path is specified.
+ // This is required for ParseURL to correctly extract an empty path.
+ // e.g., "s3://bucket" -> "s3://bucket/"
+ if !strings.Contains(strings.TrimPrefix(uri, scheme), "/") {
+ return uri + "/"
+ }
+ return uri
+ }
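+ // no scheme prefix: fall back to the default prefix handling below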
+ fallthrough
default:
return fs.Prefix("/") + strings.TrimLeft(strings.TrimPrefix(uri, fs.Prefix()), "/")
}
@@ -485,6 +500,7 @@ func (fs *BaseFileSysClient) GetDatastream(uri string, cfg ...iop.FileStreamConf
ds = iop.NewDatastreamContext(fs.Context().Ctx, nil)
ds.SafeInference = true
+ ds.SchemaOnly = Cfg.SchemaOnly
ds.SetMetadata(fs.GetProp("METADATA"))
ds.Metadata.StreamURL.Value = uri
ds.SetConfig(fs.Props())
@@ -635,16 +651,45 @@ func (fs *BaseFileSysClient) ReadDataflow(url string, cfg ...iop.FileStreamConfi
if g.In(Cfg.Format, dbio.FileTypeIceberg, dbio.FileTypeDelta) || Cfg.SQL != "" {
nodes = FileNodes{FileNode{URI: url}}
} else if prefixes := Cfg.FileSelect; len(prefixes) > 0 {
- rootPath := GetDeepestPartitionParent(url)
- g.Trace("listing path: %s", rootPath)
- nodes, err = fs.Self().ListRecursive(rootPath)
- if err != nil {
- err = g.Error(err, "Error getting paths")
- return
+ // Check if any FileSelect entries are full URIs with scheme prefix.
+ // If so, they may reference different buckets (multi-bucket access).
+ fullURIPrefixes := []string{}
+ relativePrefixes := []string{}
+
+ for _, prefix := range prefixes {
+ if strings.Contains(prefix, "://") {
+ fullURIPrefixes = append(fullURIPrefixes, prefix)
+ } else {
+ relativePrefixes = append(relativePrefixes, prefix)
+ }
}
- // select only prefixes
- nodes = nodes.SelectWithPrefix(prefixes...)
+ // Handle full URI prefixes (may be from different buckets)
+ if len(fullURIPrefixes) > 0 {
+ for _, uri := range fullURIPrefixes {
+ g.Trace("listing path (full URI): %s", uri)
+ uriNodes, err := fs.Self().ListRecursive(uri)
+ if err != nil {
+ err = g.Error(err, "Error getting paths for %s", uri)
+ return df, err
+ }
+ nodes = append(nodes, uriNodes...)
+ }
+ }
+
+ // Handle relative prefixes (original behavior)
+ if len(relativePrefixes) > 0 {
+ rootPath := GetDeepestPartitionParent(url)
+ g.Trace("listing path: %s", rootPath)
+ pathNodes, err := fs.Self().ListRecursive(rootPath)
+ if err != nil {
+ err = g.Error(err, "Error getting paths")
+ return df, err
+ }
+ // select only prefixes
+ pathNodes = pathNodes.SelectWithPrefix(relativePrefixes...)
+ nodes = append(nodes, pathNodes...)
+ }
} else {
g.Trace("listing path: %s", url)
nodes, err = fs.Self().ListRecursive(url)
@@ -1246,6 +1291,7 @@ func GetDataflowViaDuckDB(fs FileSysClient, uri string, nodes FileNodes, cfg iop
ds := iop.NewDatastreamContext(fs.Context().Ctx, nil)
ds.SafeInference = true
+ ds.SchemaOnly = cfg.SchemaOnly
ds.SetMetadata(fs.GetProp("METADATA"))
ds.Metadata.StreamURL.Value = uri
ds.SetConfig(fs.Props())
@@ -1611,6 +1657,7 @@ func MergeReaders(fs FileSysClient, fileType dbio.FileType, nodes FileNodes, cfg
url := fs.GetProp("url")
ds = iop.NewDatastreamContext(fs.Context().Ctx, nil)
ds.SafeInference = true
+ ds.SchemaOnly = cfg.SchemaOnly
ds.SetMetadata(fs.GetProp("METADATA"))
ds.Metadata.StreamURL.Value = url
ds.SetConfig(fs.Client().Props())
diff --git a/core/dbio/filesys/fs_google.go b/core/dbio/filesys/fs_google.go
index 1cbcb43e8..1d6c7dffc 100644
--- a/core/dbio/filesys/fs_google.go
+++ b/core/dbio/filesys/fs_google.go
@@ -59,8 +59,10 @@ func (fs *GoogleFileSysClient) GetPath(uri string) (path string, err error) {
return
}
- if fs.bucket != host {
- err = g.Error("URL bucket differs from connection bucket. %s != %s", host, fs.bucket)
+ // If URI specifies a different bucket, update fs.bucket to use it.
+ // This allows multi-bucket access with a single connection.
+ if fs.bucket != host && host != "" {
+ fs.bucket = host
}
return path, err
diff --git a/core/dbio/filesys/fs_local.go b/core/dbio/filesys/fs_local.go
index 299ae9b09..c8b62e8ec 100644
--- a/core/dbio/filesys/fs_local.go
+++ b/core/dbio/filesys/fs_local.go
@@ -120,6 +120,7 @@ func (fs *LocalFileSysClient) GetDatastream(uri string, cfg ...iop.FileStreamCon
ds = iop.NewDatastreamContext(fs.Context().Ctx, nil)
ds.SafeInference = true
+ ds.SchemaOnly = Cfg.SchemaOnly
ds.SetMetadata(fs.GetProp("METADATA"))
ds.Metadata.StreamURL.Value = path
ds.SetConfig(fs.Props())
diff --git a/core/dbio/filesys/fs_s3.go b/core/dbio/filesys/fs_s3.go
index 4690c7da1..580ae711c 100644
--- a/core/dbio/filesys/fs_s3.go
+++ b/core/dbio/filesys/fs_s3.go
@@ -98,13 +98,35 @@ func (fs *S3FileSysClient) GetPath(uri string) (path string, err error) {
return
}
- if fs.bucket != host {
- err = g.Error("URL bucket differs from connection bucket. %s != %s", host, fs.bucket)
+ // If URI specifies a different bucket, update fs.bucket to use it.
+ // This allows multi-bucket access with a single connection.
+ if fs.bucket != host && host != "" {
+ fs.bucket = host
}
return path, err
}
+// getBucketAndPath extracts the bucket and key from a URI without mutating fs.bucket.
+// This is safe for concurrent use when reading from multiple buckets.
+func (fs *S3FileSysClient) getBucketAndPath(uri string) (bucket, path string, err error) {
+ // normalize, in case url is provided without prefix
+ uri = NormalizeURI(fs, uri)
+
+ host, path, err := ParseURL(uri)
+ if err != nil {
+ return
+ }
+
+ // Use the bucket from the URI if specified, otherwise fall back to connection's bucket
+ bucket = host
+ if bucket == "" {
+ bucket = fs.bucket
+ }
+
+ return bucket, path, err
+}
+
const defaultRegion = "us-east-1"
type fakeWriterAt struct {
@@ -255,43 +277,49 @@ func (fs *S3FileSysClient) Connect() (err error) {
// getSession returns the aws config and sets the region based on the bucket
func (fs *S3FileSysClient) getConfig() aws.Config {
+ return fs.getConfigForBucket(fs.bucket)
+}
+
+// getConfigForBucket returns the aws config with the region set for the specified bucket.
+// This is safe for concurrent use when reading from multiple buckets.
+func (fs *S3FileSysClient) getConfigForBucket(bucket string) aws.Config {
fs.mux.Lock()
defer fs.mux.Unlock()
endpoint := fs.GetProp("ENDPOINT")
region := fs.GetProp("REGION")
- if fs.bucket == "" {
+ if bucket == "" {
return fs.awsConfig
} else if region != "" {
- fs.RegionMap[fs.bucket] = region
+ fs.RegionMap[bucket] = region
} else if strings.HasSuffix(endpoint, ".digitaloceanspaces.com") {
region := strings.TrimSuffix(endpoint, ".digitaloceanspaces.com")
region = strings.TrimPrefix(region, "https://")
- fs.RegionMap[fs.bucket] = region
+ fs.RegionMap[bucket] = region
} else if strings.HasSuffix(endpoint, ".cloudflarestorage.com") {
- fs.RegionMap[fs.bucket] = "auto"
- } else if endpoint == "" && fs.RegionMap[fs.bucket] == "" {
+ fs.RegionMap[bucket] = "auto"
+ } else if endpoint == "" && fs.RegionMap[bucket] == "" {
s3Client := s3.NewFromConfig(fs.awsConfig)
- region, err := manager.GetBucketRegion(fs.Context().Ctx, s3Client, fs.bucket, func(o *s3.Options) {
+ region, err := manager.GetBucketRegion(fs.Context().Ctx, s3Client, bucket, func(o *s3.Options) {
o.Region = defaultRegion
})
if err != nil {
var apiErr smithy.APIError
if errors.As(err, &apiErr) && apiErr.ErrorCode() == "NotFound" {
- g.Debug("unable to find bucket %s's region not found", fs.bucket)
- g.Debug("Region not found for " + fs.bucket)
+ g.Debug("unable to find bucket %s's region not found", bucket)
+ g.Debug("Region not found for " + bucket)
} else {
- g.Debug(g.Error(err, "Error getting Region for "+fs.bucket).Error())
+ g.Debug(g.Error(err, "Error getting Region for "+bucket).Error())
}
} else {
- fs.RegionMap[fs.bucket] = region
+ fs.RegionMap[bucket] = region
}
}
// Create a copy of the config with the appropriate region
configCopy := fs.awsConfig.Copy()
- if fs.RegionMap[fs.bucket] != "" {
- configCopy.Region = fs.RegionMap[fs.bucket]
+ if fs.RegionMap[bucket] != "" {
+ configCopy.Region = fs.RegionMap[bucket]
} else {
configCopy.Region = defaultRegion
}
@@ -394,16 +422,19 @@ func (r *S3ReaderWrapper) closeOnce() error {
// path should specify the full path with scheme:
// `s3://my_bucket/key/to/file.txt` or `s3://my_bucket/key/to/directory`
func (fs *S3FileSysClient) GetReader(uri string) (reader io.Reader, err error) {
- key, err := fs.GetPath(uri)
+ // Use getBucketAndPath to extract the bucket and key without mutating fs.bucket.
+ // This is safe for concurrent use when reading from multiple buckets.
+ bucket, key, err := fs.getBucketAndPath(uri)
if err != nil {
return
}
- svc := s3.NewFromConfig(fs.getConfig())
+ // Get config for the specific bucket (handles region lookup)
+ svc := s3.NewFromConfig(fs.getConfigForBucket(bucket))
// Use GetObject directly for streaming
result, err := svc.GetObject(fs.Context().Ctx, &s3.GetObjectInput{
- Bucket: aws.String(fs.bucket),
+ Bucket: aws.String(bucket),
Key: aws.String(key),
})
if err != nil {
diff --git a/core/dbio/filesys/fs_test.go b/core/dbio/filesys/fs_test.go
index 244698ee5..005aff5a4 100755
--- a/core/dbio/filesys/fs_test.go
+++ b/core/dbio/filesys/fs_test.go
@@ -702,7 +702,7 @@ func TestFileSysDOSpaces(t *testing.T) {
"ENDPOINT=nyc3.digitaloceanspaces.com",
"ACCESS_KEY_ID="+os.Getenv("DOS_ACCESS_KEY_ID"),
"SECRET_ACCESS_KEY="+os.Getenv("DOS_SECRET_ACCESS_KEY"),
- "METADATA="+g.Marshal(iop.Metadata{LoadedAt: iop.KeyValue{"loaded_at", time.Now().Unix()}, StreamURL: iop.KeyValue{"url", ""}}),
+ "METADATA="+g.Marshal(iop.Metadata{SyncedAt: iop.KeyValue{"loaded_at", time.Now().Unix()}, StreamURL: iop.KeyValue{"url", ""}}),
)
assert.NoError(t, err)
diff --git a/core/dbio/iop/README.md b/core/dbio/iop/README.md
deleted file mode 100755
index 175ffcc31..000000000
--- a/core/dbio/iop/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-
-## Input-Process-Output (ipo)
\ No newline at end of file
diff --git a/core/dbio/iop/datastream.go b/core/dbio/iop/datastream.go
index cee8f507d..9a1ec7c7b 100644
--- a/core/dbio/iop/datastream.go
+++ b/core/dbio/iop/datastream.go
@@ -52,6 +52,7 @@ type Datastream struct {
Bytes atomic.Uint64
Sp *StreamProcessor
SafeInference bool
+ SchemaOnly bool // when true, only the schema (columns) is emitted; no rows are pushed
NoDebug bool
Inferred bool
deferFuncs []func()
@@ -87,6 +88,7 @@ type FileStreamConfig struct {
IncrementalValue string `json:"incremental_value"`
FileSelect []string `json:"file_select"` // a list of files to include.
DuckDBFilename bool `json:"duckdb_filename"` // stream URL
+ SchemaOnly bool `json:"schema_only"` // read only the schema (column types); skip rows
Props map[string]string `json:"props"`
}
@@ -118,7 +120,8 @@ type KeyValue struct {
type Metadata struct {
StreamURL KeyValue `json:"stream_url"`
- LoadedAt KeyValue `json:"loaded_at"`
+ SyncedAt KeyValue `json:"synced_at"`
+ SyncedOp KeyValue `json:"synced_op"`
RowNum KeyValue `json:"row_num"`
RowID KeyValue `json:"row_id"`
ExecID KeyValue `json:"exec_id"`
@@ -261,6 +264,13 @@ func (ds *Datastream) processBwRows() {
}
}
+// StartBwProcessor starts the bytes-written processor goroutine.
+// This should be called when creating a datastream that pushes rows
+// directly without calling Start() (e.g., chunked streaming).
+func (ds *Datastream) StartBwProcessor() {
+ go ds.processBwRows()
+}
+
// SetReady sets the ds.ready
func (ds *Datastream) SetReady() {
if !ds.Ready {
@@ -804,29 +814,47 @@ skipBuffer:
return name
}
- if ds.Metadata.LoadedAt.Key != "" && ds.Metadata.LoadedAt.Value != nil {
- ds.Metadata.LoadedAt.Key = ensureName(ds.Metadata.LoadedAt.Key)
+ if ds.Metadata.SyncedAt.Key != "" && ds.Metadata.SyncedAt.Value != nil {
+ ds.Metadata.SyncedAt.Key = ensureName(ds.Metadata.SyncedAt.Key)
// handle timestamp value
isTimestamp := false
- if tVal, err := cast.ToTimeE(ds.Metadata.LoadedAt.Value); err == nil {
+ if tVal, err := cast.ToTimeE(ds.Metadata.SyncedAt.Value); err == nil {
isTimestamp = true
- ds.Metadata.LoadedAt.Value = tVal
+ ds.Metadata.SyncedAt.Value = tVal
} else {
- ds.Metadata.LoadedAt.Value = cast.ToInt64(ds.Metadata.LoadedAt.Value)
+ ds.Metadata.SyncedAt.Value = cast.ToInt64(ds.Metadata.SyncedAt.Value)
}
col := Column{
- Name: ds.Metadata.LoadedAt.Key,
+ Name: ds.Metadata.SyncedAt.Key,
Type: lo.Ternary(isTimestamp, TimestampzType, IntegerType),
Position: len(ds.Columns) + 1,
- Description: "Sling.Metadata.LoadedAt",
- Metadata: map[string]string{"sling_metadata": "loaded_at"},
+ Description: "Sling.Metadata.SyncedAt",
+ Metadata: map[string]string{"sling_metadata": "synced_at"},
+ Sourced: true,
+ }
+ ds.Columns = append(ds.Columns, col)
+ metaValuesMap[col.Position-1] = func(it *Iterator) any {
+ return ds.Metadata.SyncedAt.Value
+ }
+ }
+
+ if ds.Metadata.SyncedOp.Key != "" && ds.Metadata.SyncedOp.Value != nil {
+ ds.Metadata.SyncedOp.Key = ensureName(ds.Metadata.SyncedOp.Key)
+
+ col := Column{
+ Name: ds.Metadata.SyncedOp.Key,
+ Type: StringType,
+ DbPrecision: 4,
+ Position: len(ds.Columns) + 1,
+ Description: "Sling.Metadata.SyncedOp",
+ Metadata: map[string]string{"sling_metadata": "synced_op"},
Sourced: true,
}
ds.Columns = append(ds.Columns, col)
metaValuesMap[col.Position-1] = func(it *Iterator) any {
- return ds.Metadata.LoadedAt.Value
+ return ds.Metadata.SyncedOp.Value
}
}
@@ -956,6 +984,10 @@ skipBuffer:
loop:
for ds.it.next() {
+ if ds.SchemaOnly {
+ break // don't push any rows
+ }
+
schemaChgLoop:
for {
// reprocess row if needed (to expand it as needed)
diff --git a/core/dbio/iop/datatype.go b/core/dbio/iop/datatype.go
index 10f3281d7..b8ee0f76f 100755
--- a/core/dbio/iop/datatype.go
+++ b/core/dbio/iop/datatype.go
@@ -1626,7 +1626,7 @@ func NewSelector(selectExprs []string, casing ColumnCasing) *Selector {
continue
}
- field, newName, isExclude, _ := parseSelectExpr(expr)
+ field, newName, isExclude, _ := ParseSelectExpr(expr)
fieldLower := strings.ToLower(field)
if isExclude {
@@ -1694,7 +1694,7 @@ func (s *Selector) compute(name, nameLower string) string {
// 4. Check glob exclusions
for _, pattern := range s.excludeGlobs {
- if matchesSelectGlob(nameLower, pattern) {
+ if MatchesSelectGlob(nameLower, pattern) {
return ""
}
}
@@ -1711,7 +1711,7 @@ func (s *Selector) compute(name, nameLower string) string {
// 7. Check glob inclusions
for _, pattern := range s.includeGlobs {
- if matchesSelectGlob(nameLower, pattern) {
+ if MatchesSelectGlob(nameLower, pattern) {
return name
}
}
@@ -1774,7 +1774,7 @@ func applySelectAllMode(fields []string, fieldMap map[string]int, selectExprs []
continue
}
- field, newName, isExclude, err := parseSelectExpr(expr)
+ field, newName, isExclude, err := ParseSelectExpr(expr)
if err != nil {
return nil, err
}
@@ -1784,7 +1784,7 @@ func applySelectAllMode(fields []string, fieldMap map[string]int, selectExprs []
if strings.Contains(field, "*") {
// Glob exclusion
for i, f := range fields {
- if matchesSelectGlob(strings.ToLower(f), strings.ToLower(field)) {
+ if MatchesSelectGlob(strings.ToLower(f), strings.ToLower(field)) {
excluded[i] = true
}
}
@@ -1831,7 +1831,7 @@ func applySelectExplicitMode(fields []string, fieldMap map[string]int, selectExp
for _, expr := range selectExprs {
expr = strings.TrimSpace(expr)
- field, newName, isExclude, err := parseSelectExpr(expr)
+ field, newName, isExclude, err := ParseSelectExpr(expr)
if err != nil {
return nil, err
}
@@ -1847,7 +1847,7 @@ func applySelectExplicitMode(fields []string, fieldMap map[string]int, selectExp
if added[i] {
continue
}
- if matchesSelectGlob(strings.ToLower(f), strings.ToLower(field)) {
+ if MatchesSelectGlob(strings.ToLower(f), strings.ToLower(field)) {
newFields = append(newFields, f)
added[i] = true
}
@@ -1873,7 +1873,7 @@ func applySelectExplicitMode(fields []string, fieldMap map[string]int, selectExp
return newFields, nil
}
-// parseSelectExpr parses a single select expression
+// ParseSelectExpr parses a single select expression
// Returns: (fieldName, newName, isExclusion, error)
// Examples:
//
@@ -1882,7 +1882,7 @@ func applySelectExplicitMode(fields []string, fieldMap map[string]int, selectExp
// "field as new" -> ("field", "new", false, nil)
// "prefix*" -> ("prefix*", "", false, nil)
// "-*suffix" -> ("*suffix", "", true, nil)
-func parseSelectExpr(expr string) (field string, newName string, exclude bool, err error) {
+func ParseSelectExpr(expr string) (field string, newName string, exclude bool, err error) {
expr = strings.TrimSpace(expr)
// Check for exclusion prefix
@@ -1908,9 +1908,9 @@ func parseSelectExpr(expr string) (field string, newName string, exclude bool, e
return field, "", exclude, nil
}
-// matchesSelectGlob checks if name matches a simple glob pattern (prefix* or *suffix)
+// MatchesSelectGlob checks if name matches a simple glob pattern (prefix* or *suffix)
// Both name and pattern should be lowercase for case-insensitive matching
-func matchesSelectGlob(name, pattern string) bool {
+func MatchesSelectGlob(name, pattern string) bool {
if !strings.Contains(pattern, "*") {
return name == pattern
}
diff --git a/core/dbio/iop/duckdb.go b/core/dbio/iop/duckdb.go
index 927adea80..de9520f34 100644
--- a/core/dbio/iop/duckdb.go
+++ b/core/dbio/iop/duckdb.go
@@ -1426,13 +1426,17 @@ func (duck *DuckDb) MakeScanQuery(format dbio.FileType, uri string, fsc FileStre
}
// reserved word to use for timestamp comparison (when listing)
- const slingLoadedAtColumn = "_sling_loaded_at"
- if fsc.IncrementalKey != "" && fsc.IncrementalKey != slingLoadedAtColumn &&
+ if fsc.IncrementalKey != "" && fsc.IncrementalKey != env.ReservedFields.LoadedAt &&
fsc.IncrementalValue != "" {
incrementalWhereCond = g.F("%s > %s", dbio.TypeDbDuckDb.Quote(fsc.IncrementalKey), fsc.IncrementalValue)
where = g.F("where %s", incrementalWhereCond)
}
+ // schema-only: we only need the column types, so select no rows
+ if fsc.SchemaOnly {
+ where = "where 1=0"
+ }
+
if format == dbio.FileTypeNone {
g.Warn("duck.MakeScanQuery: format is empty, cannot determine stream_scanner")
}
diff --git a/core/dbio/iop/stream_processor.go b/core/dbio/iop/stream_processor.go
index 4f7288d75..9ab6decac 100644
--- a/core/dbio/iop/stream_processor.go
+++ b/core/dbio/iop/stream_processor.go
@@ -1111,6 +1111,10 @@ func (sp *StreamProcessor) CastToStringE(val any) (valString string, err error)
if err != nil {
return "", g.Error(err, "could not cast to string: %#v", v)
}
+ case time.Time:
+ // RFC3339Nano preserves sub-second precision and the timezone offset
+ valString = v.Format(time.RFC3339Nano)
+ case *time.Time:
+ if v != nil { // guard against nil pointers
+ valString = v.Format(time.RFC3339Nano)
+ }
case chJSON: // Clickhouse JSON / Variant or any with MarshalJSON()
var sBytes []byte
sBytes, err = v.MarshalJSON()
@@ -1582,6 +1586,7 @@ func (sp *StreamProcessor) CastRow(row []any, columns Columns) []any {
for i, val := range row {
col := &columns[i]
row[i] = sp.CastVal(i, val, col)
+ // g.Warn("%d | col %s | nVal => %#v", sp.N, col.Name, row[i])
if row[i] != nil && row[i] != "" {
sp.colStats[i].LastVal = row[i]
}
diff --git a/core/dbio/templates/azuredwh.yaml b/core/dbio/templates/azuredwh.yaml
index 43398def0..a143928fa 100755
--- a/core/dbio/templates/azuredwh.yaml
+++ b/core/dbio/templates/azuredwh.yaml
@@ -9,8 +9,8 @@ core:
update {table} as t1 set {set_fields2}
from (select * from {temp_table}) as t2
where {pk_fields_equal}
- insert: insert into {table} ({cols}) values ({values})
- insert_temp: insert into {table} ({cols}) select {cols} from {temp_table}
+ insert: insert into {table} ({fields}) values ({values})
+ insert_temp: insert into {table} ({fields}) select {cols} from {temp_table}
insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing
insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing
update_temp: |
diff --git a/core/dbio/templates/azuresql.yaml b/core/dbio/templates/azuresql.yaml
index caf8b5493..9f4461667 100755
--- a/core/dbio/templates/azuresql.yaml
+++ b/core/dbio/templates/azuresql.yaml
@@ -9,8 +9,8 @@ core:
update {table} as t1 set {set_fields2}
from (select * from {temp_table}) as t2
where {pk_fields_equal}
- insert: insert into {table} ({cols}) values ({values})
- insert_temp: insert into {table} ({cols}) select {cols} from {temp_table}
+ insert: insert into {table} ({fields}) values ({values})
+ insert_temp: insert into {table} ({fields}) select {cols} from {temp_table}
insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing
insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing
update_temp: |
diff --git a/core/dbio/templates/db2.yaml b/core/dbio/templates/db2.yaml
new file mode 100644
index 000000000..1afccabc6
--- /dev/null
+++ b/core/dbio/templates/db2.yaml
@@ -0,0 +1,416 @@
+core:
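+ # drop_table / drop_view ignore SQLSTATE '42704' (undefined name) so the drop is a no-op when the object does not exist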
+ drop_table: |
+ BEGIN
+ DECLARE CONTINUE HANDLER FOR SQLSTATE '42704' BEGIN END;
+ EXECUTE IMMEDIATE 'DROP TABLE {table}';
+ END;
+ drop_view: |
+ BEGIN
+ DECLARE CONTINUE HANDLER FOR SQLSTATE '42704' BEGIN END;
+ EXECUTE IMMEDIATE 'DROP VIEW {view}';
+ END;
+ drop_index: drop index {schema}.{index}
+ create_table: create table {table} ({col_types})
+ create_index: create index {index} on {table} ({cols})
+ create_unique_index: create unique index {index} on {table} ({cols})
+ insert: insert into {table} ({fields}) values ({values})
+ insert_temp: insert into {table} ({fields}) select {cols} from {temp_table}
+ sample: select {fields} from {table} TABLESAMPLE SYSTEM (50) fetch first {n} rows only
+ rename_table: rename table {table} to {new_table}
+ modify_column: alter table {table} alter column {column} set data type {type}
+ use_database: set current schema {schema}
+ delete_where_not_exist: |
+ delete from {target_table}
+ where {where}
+ and {unique_id} not in (
+ select {unique_id}
+ from {temp_table}
+ )
+ update_where_not_exist: |
+ update {target_table}
+ set {set_fields}
+ where {where}
+ and {unique_id} not in (
+ select {unique_id}
+ from {temp_table}
+ )
+ merge_update_insert: |
+ MERGE INTO {tgt_table} tgt
+ USING {src_table} src
+ ON ({src_tgt_pk_equal})
+ WHEN MATCHED THEN
+ UPDATE SET {set_fields}
+ WHEN NOT MATCHED THEN
+ INSERT ({insert_fields}) VALUES ({src_insert_fields})
+ merge_delete_insert: |
+ DELETE FROM {tgt_table}
+ WHERE EXISTS (
+ SELECT 1 FROM {src_table} src
+ WHERE {src_tgt_pk_equal}
+ );
+ INSERT INTO {tgt_table} ({insert_fields})
+ SELECT {src_fields} FROM {src_table}
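+ # bulk CSV load via the DB2 IMPORT utility (DEL format, UTF-8 codepage 1208)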
+ db2import: |
+ IMPORT FROM '{file}' OF DEL
+ MODIFIED BY COLDEL, CODEPAGE=1208
+ METHOD P (1, 2, 3)
+ MESSAGES '{message_file}'
+ INSERT INTO {table} ({columns})
+
+metadata:
+ current_database:
+ select current server from sysibm.sysdummy1
+
+ databases: |
+ select distinct dbname as name
+ from syscat.tables
+ order by dbname
+
+ schemas: |
+ select schemaname as schema_name
+ from syscat.schemata
+ order by schemaname
+
+ tables: |
+ select tabschema as schema_name, tabname as table_name, 'false' as is_view
+ from syscat.tables
+ where type = 'T'
+ {{if .schema -}} and tabschema = '{schema}' {{- end}}
+ order by tabschema, tabname
+
+ views: |
+ select viewschema as schema_name, viewname as table_name, 'true' as is_view
+ from syscat.views
+ {{if .schema -}} where viewschema = '{schema}' {{- end}}
+ order by viewschema, viewname
+
+ columns: |
+ select
+ colname as column_name,
+ typename as data_type,
+ case
+ when typename in ('DECIMAL', 'NUMERIC') then length
+ when typename in ('FLOAT', 'REAL', 'DOUBLE') then length
+ else null
+ end as precision,
+ case
+ when typename in ('DECIMAL', 'NUMERIC') then scale
+ else null
+ end as scale
+ from syscat.columns
+ where tabschema = '{schema}'
+ and tabname = '{table}'
+ order by colno
+
+ primary_keys: |
+ select
+ constname as pk_name,
+ colseq as position,
+ colname as column_name
+ from syscat.keycoluse
+ where tabschema = '{schema}'
+ and tabname = '{table}'
+ and constname in (
+ select constname
+ from syscat.tabconst
+ where tabschema = '{schema}'
+ and tabname = '{table}'
+ and type = 'P'
+ )
+ order by colseq
+
+ indexes: |
+ select
+ indname as index_name,
+ colname as column_name
+ from syscat.indexcoluse
+ where indschema = '{schema}'
+ and tabname = '{table}'
+ order by indname, colseq
+
+ columns_full: |
+ select
+ tabschema as schema_name,
+ tabname as table_name,
+ colname as column_name,
+ typename as data_type,
+ colno as position
+ from syscat.columns
+ where 1=1
+ {{if .schema -}} and tabschema = '{schema}' {{- end}}
+ {{if .table -}} and tabname = '{table}' {{- end}}
+ order by tabschema, tabname, colno
+
+ schemata: |
+ select
+ c.tabschema as schema_name,
+ c.tabname as table_name,
+ case
+ when t.type = 'T' then 'false'
+ else 'true'
+ end as is_view,
+ c.colname as column_name,
+ c.typename as data_type,
+ c.colno as position
+ from syscat.columns c
+ left join syscat.tables t
+ on t.tabschema = c.tabschema
+ and t.tabname = c.tabname
+ where 1=1
+ {{if .schema -}} and c.tabschema = '{schema}' {{- end}}
+ {{if .tables -}} and c.tabname in ({tables}) {{- end}}
+ order by c.tabschema, c.tabname, c.colno
+
+ row_count_estimates: |
+ select
+ tabschema as schema_name,
+ tabname as table_name,
+ card as count
+ from syscat.tables
+ where type = 'T'
+ {{if .schema -}} and tabschema = '{schema}' {{- end}}
+ {{if .table -}} and tabname = '{table}' {{- end}}
+ order by card desc
+
+ ddl_table: |
+ select 'CREATE TABLE "' || tabschema || '"."' || tabname || '" (' ||
+ listagg(
+ '"' || colname || '" ' || typename ||
+ case
+ when typename in ('VARCHAR', 'CHAR', 'GRAPHIC', 'VARGRAPHIC') then '(' || length || ')'
+ when typename in ('DECIMAL', 'NUMERIC') then '(' || length || ',' || scale || ')'
+ else ''
+ end ||
+ case when nulls = 'N' then ' NOT NULL' else '' end,
+ ', '
+ ) within group (order by colno) || ')' as ddl
+ from syscat.columns
+ where tabschema = '{schema}'
+ and tabname = '{table}'
+ group by tabschema, tabname
+
+ ddl_view: |
+ select text as ddl
+ from syscat.views
+ where viewschema = '{schema}'
+ and viewname = '{table}'
+
+ sessions: |
+ select
+ agent_id as sid,
+ application_handle,
+ application_name,
+ authid as username,
+ client_applname as program
+ from table(mon_get_connection(cast(null as bigint), -1))
+ where application_name <> ''
+
+ session_terminate: |
+ call admin_cmd('force application (' || {agent_id} || ')')
+
+analysis:
+ field_chars: |
+ select
+ '{schema}' as schema_nm,
+ '{table}' as table_nm,
+ '{field}' as field,
+ sum(case when locate(chr(10), {field}) > 0 then 1 else 0 end) as cnt_nline,
+ sum(case when locate(chr(9), {field}) > 0 then 1 else 0 end) as cnt_tab,
+ sum(case when locate(',', {field}) > 0 then 1 else 0 end) as cnt_comma,
+ sum(case when locate('"', {field}) > 0 then 1 else 0 end) as cnt_dquote,
+ min(length({field})) as f_min_len,
+ max(length({field})) as f_max_len
+ from "{schema}"."{table}"
+
+ field_stat_len: |
+ select
+ '{schema}' as schema_nm,
+ '{table}' as table_nm,
+ '{field}' as field,
+ count(*) as tot_cnt,
+ min(length(varchar({field}))) as f_min_len,
+ max(length(varchar({field}))) as f_max_len
+ from "{schema}"."{table}"
+
+ field_stat_deep: |
+ select
+ '{schema}' as schema_nm,
+ '{table}' as table_nm,
+ '{field}' as field,
+ count(*) as tot_cnt,
+ count({field}) as f_cnt,
+ count(*) - count({field}) as f_null_cnt,
+ round(100.0 * (count(*) - count({field})) / count(*), 1) as f_null_prct,
+ count(distinct {field}) as f_dstct_cnt,
+ round(100.0 * count(distinct {field}) / count(*), 1) as f_dstct_prct,
+ count(*) - count(distinct {field}) as f_dup_cnt,
+ cast(min({field}) as varchar(1000)) as f_min,
+ cast(max({field}) as varchar(1000)) as f_max,
+ min(length(varchar({field}))) as f_min_len,
+ max(length(varchar({field}))) as f_max_len
+ from "{schema}"."{table}"
+
+ distro_field: |
+ with t1 as (
+ select
+ '{field}' as field,
+ {field},
+ count(*) as cnt
+ from "{schema}"."{table}"
+ group by {field}
+ order by count(*) desc
+ fetch first 1000 rows only
+ ),
+ t2 as (
+ select
+ '{field}' as field,
+ count(*) as ttl_cnt
+ from "{schema}"."{table}"
+ )
+ select
+ '{table}' as table_nm,
+ t1.field,
+ {field} as value,
+ cnt,
+ round(100.0 * cnt / ttl_cnt, 2) as prct
+ from t1
+ join t2 on t1.field = t2.field
+ order by cnt desc
+
+ distro_field_group: |
+ with t1 as (
+ select
+ '{field}' as field,
+ {group_expr} as group_exp,
+ {field},
+ count(*) as cnt
+ from "{schema}"."{table}"
+ group by {field}, {group_expr}
+ order by count(*) desc
+ fetch first 1000 rows only
+ ),
+ t2 as (
+ select
+ '{field}' as field,
+ count(*) as ttl_cnt
+ from "{schema}"."{table}"
+ )
+ select
+ '{table}' as table_nm,
+ t1.field,
+ t1.group_exp,
+ {field} as value,
+ cnt,
+ round(100.0 * cnt / ttl_cnt, 2) as prct
+ from t1
+ join t2 on t1.field = t2.field
+ order by cnt desc
+
+ distro_field_date: |
+ with t1 as (
+ select
+ '{field}' as field,
+ year({field}) as year,
+ month({field}) as month,
+ day({field}) as day,
+ count(*) as cnt
+ from "{schema}"."{table}"
+ group by year({field}), month({field}), day({field})
+ order by year({field}), month({field}), day({field})
+ ),
+ t2 as (
+ select '{field}' as field, count(*) as ttl_cnt
+ from "{schema}"."{table}"
+ )
+ select
+ '{schema}' as schema_nm,
+ '{table}' as table_nm,
+ t1.field,
+ t1.year,
+ t1.month,
+ t1.day,
+ cnt,
+ round(100.0 * cnt / ttl_cnt, 2) as prct
+ from t1
+ join t2 on t1.field = t2.field
+ order by t1.year, t1.month, t1.day
+
+function:
+ truncate_f: trunc({field}, 0)
+ truncate_datef: date({field})
+ string_type: varchar(32672)
+ cast_to_string: 'varchar({field})'
+ cast_to_text: 'varchar({field}, 32672)'
+ date_to_int: days({field}) - days('1970-01-01')
+ number_to_int: int({field})
+ sleep: call dbms_lock.sleep({seconds})
+ checksum_integer: cast(abs({field}) as bigint)
+ checksum_bigint: cast(abs({field}) as decimal(31,0))
+ checksum_decimal: cast(abs({field}) as decimal(31,0))
+ checksum_date: cast((days({field}) - days('1970-01-01')) as bigint) * 86400
+ checksum_datetime: cast((days({field}) - days('1970-01-01')) as bigint) * 86400 + cast(midnight_seconds({field}) as bigint)
+ checksum_string: cast(length({field}) as bigint)
+ checksum_boolean: cast(length(varchar({field})) as bigint)
+ checksum_json: cast(length(varchar({field})) as bigint)
+ now: current timestamp
+ concat: concat({fields})
+
+variable:
+ tmp_folder: /tmp
+ bind_string: "?"
+ error_filter_table_exists: undefined
+ error_ignore_drop_table: undefined
+ error_ignore_drop_view: undefined
+ max_string_type: varchar(32672)
+ max_string_length: 32672
+ max_column_length: 128
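+ # DB2 folds unquoted identifiers to upper case; read by DBNameUpperCase / Quote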
+ column_upper: true
+
+native_type_map:
+ smallint: smallint
+ integer: integer
+ int: integer
+ bigint: bigint
+ decimal: decimal
+ numeric: decimal
+ dec: decimal
+ real: float
+ float: float
+ double: float
+ double precision: float
+ decfloat: float
+ char: string
+ character: string
+ varchar: string
+ character varying: string
+ clob: text
+ blob: binary
+ varbinary: binary
+ binary: binary
+ date: date
+ time: time
+ timestamp: timestamp
+ boolean: bool
+ xml: text
+ graphic: string
+ vargraphic: string
+ dbclob: string
+
+general_type_map:
+ bigint: bigint
+ binary: varbinary(32672)
+ bool: boolean
+ date: date
+ datetime: timestamp
+ decimal: "decimal(,)"
+ integer: integer
+ json: clob
+ smallint: smallint
+ string: "varchar()"
+ text: clob
+ timestamp: timestamp
+ timestampz: timestamp
+ float: double
+ time: time
+ timez: time
+ uuid: char(36)
\ No newline at end of file
diff --git a/core/dbio/templates/exasol.yaml b/core/dbio/templates/exasol.yaml
index 86b7dfe98..0c6a08ecd 100644
--- a/core/dbio/templates/exasol.yaml
+++ b/core/dbio/templates/exasol.yaml
@@ -191,7 +191,7 @@ bulk:
function:
add_months: add_months({field}, {num})
cast: cast({field} as {type})
- concat: concat({strings})
+ concat: concat({fields})
date_diff_days: days_between({date1}, {date2})
date_diff_seconds: seconds_between({date1}, {date2})
date_parse_format: to_timestamp({string}, {format})
@@ -208,6 +208,7 @@ function:
uuid: sys_guid()
variable:
+ column_upper: true
bool_as: string
duplicates_group_by: false
handle_null_compare: true
diff --git a/core/dbio/templates/fabric.yaml b/core/dbio/templates/fabric.yaml
index 00e2e7c86..224622874 100755
--- a/core/dbio/templates/fabric.yaml
+++ b/core/dbio/templates/fabric.yaml
@@ -18,8 +18,8 @@ core:
incremental_select_limit: select top {limit} {fields} from {table} where ({incremental_where_cond}){where_and} order by {update_key} asc
incremental_select_limit_offset: select top {limit} * from ( select {fields} from {table} where ({incremental_where_cond}){where_and} order by {update_key} asc offset {offset} rows) as t
incremental_select: select {fields} from {table} where ({incremental_where_cond}){where_and}
- insert: insert into {table} ({cols}) values ({values})
- insert_temp: insert into {table} ({cols}) select {cols} from {temp_table}
+ insert: insert into {table} ({fields}) values ({values})
+ insert_temp: insert into {table} ({fields}) select {cols} from {temp_table}
insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing
insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing
update_temp: |
diff --git a/core/dbio/templates/firebird.yaml b/core/dbio/templates/firebird.yaml
new file mode 100644
index 000000000..4e33cefd6
--- /dev/null
+++ b/core/dbio/templates/firebird.yaml
@@ -0,0 +1,435 @@
+core:
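+ # drop_* statements use execute blocks that check the RDB$ system tables first, so drops do not fail when the object is missing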
+ drop_table: |
+ execute block as
+ begin
+ if (exists(select 1 from rdb$relations where (rdb$relation_name) = ('{table}'))) then
+ execute statement 'drop table {table}';
+ end
+ drop_view: |
+ execute block as
+ begin
+ if (exists(select 1 from rdb$relations where rdb$relation_name = ('{view}'))) then
+ execute statement 'drop view {view}';
+ end
+ drop_index: |
+ execute block as
+ begin
+ if (exists(select 1 from rdb$indices where rdb$index_name = ('{index}'))) then
+ execute statement 'drop index {index}';
+ end
+ create_table: create table {table} ({col_types})
+ create_index: create index {index} on {table} ({cols})
+ create_unique_index: create unique index {index} on {table} ({cols})
+ replace: |
+ update or insert into {table} ({fields})
+ values ({values})
+ matching ({pk_fields})
+ replace_temp: |
+ merge into {table} t1
+ using {temp_table} t2
+ on ({pk_fields_equal})
+ when matched then
+ update set {set_fields2}
+ when not matched then
+ insert ({names}) values ({names2})
+ insert: insert into {table} ({fields}) values ({values})
+ insert_temp: insert into {table} ({fields}) select {cols} from {temp_table}
+ insert_ignore: |
+ execute block as
+ begin
+ insert into {table} ({fields}) values ({values});
+ when any do
+ begin
+ -- ignore duplicate key errors
+ end
+ end
+ insert_ignore_temp: |
+ merge into {table} t1
+ using {temp_table} t2
+ on ({pk_fields_equal})
+ when not matched then
+ insert ({names}) values ({names2})
+ update_temp: |
+ merge into {table} t1
+ using {temp_table} t2
+ on ({pk_fields_equal})
+ when matched then
+ update set {set_fields2}
+ sample: select first {n} {fields} from {table}
+ rename_table: alter table {table} rename to {new_table}
+ modify_column: alter table {table} alter column {column} type {type}
+ use_database: -- firebird doesn't support USE DATABASE
+
+metadata:
+
+ current_database: |
+ select MON$DATABASE_NAME as name
+ from MON$DATABASE
+
+ databases: |
+ select MON$DATABASE_NAME as name
+ from MON$DATABASE
+
+ schemas: |
+ select 'main' as schema_name
+ from RDB$DATABASE
+
+ tables: |
+ select
+ 'main' as schema_name,
+ trim(r.RDB$RELATION_NAME) as table_name,
+ 'false' as is_view
+ from RDB$RELATIONS r
+ where r.RDB$SYSTEM_FLAG = 0
+ and r.RDB$VIEW_BLR is null
+ order by r.RDB$RELATION_NAME
+
+ views: |
+ select
+ 'main' as schema_name,
+ trim(r.RDB$RELATION_NAME) as table_name,
+ 'true' as is_view
+ from RDB$RELATIONS r
+ where r.RDB$SYSTEM_FLAG = 0
+ and r.RDB$VIEW_BLR is not null
+ order by r.RDB$RELATION_NAME
+
+ columns: |
+ select
+ trim(rf.RDB$FIELD_NAME) as column_name,
+ case f.RDB$FIELD_TYPE
+ when 7 then 'SMALLINT'
+ when 8 then 'INTEGER'
+ when 10 then 'FLOAT'
+ when 12 then 'DATE'
+ when 13 then 'TIME'
+ when 14 then 'CHAR(' || f.RDB$FIELD_LENGTH || ')'
+ when 16 then
+ case f.RDB$FIELD_SUB_TYPE
+ when 0 then 'BIGINT'
+ when 1 then 'NUMERIC(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')'
+ when 2 then 'DECIMAL(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')'
+ end
+ when 27 then 'DOUBLE PRECISION'
+ when 35 then 'TIMESTAMP'
+ when 37 then 'VARCHAR(' || f.RDB$FIELD_LENGTH || ')'
+ when 40 then 'CSTRING(' || f.RDB$FIELD_LENGTH || ')'
+ when 261 then 'BLOB'
+ else 'UNKNOWN'
+ end as data_type,
+ f.RDB$FIELD_PRECISION as "precision",
+ abs(f.RDB$FIELD_SCALE) as scale
+ from RDB$RELATION_FIELDS rf
+ join RDB$FIELDS f on rf.RDB$FIELD_SOURCE = f.RDB$FIELD_NAME
+ where trim(rf.RDB$RELATION_NAME) = '{table}'
+ order by rf.RDB$FIELD_POSITION
+
+ primary_keys: |
+ select
+ trim(rc.RDB$CONSTRAINT_NAME) as pk_name,
+ sg.RDB$FIELD_POSITION + 1 as position,
+ trim(sg.RDB$FIELD_NAME) as column_name
+ from RDB$RELATION_CONSTRAINTS rc
+ join RDB$INDEX_SEGMENTS sg on rc.RDB$INDEX_NAME = sg.RDB$INDEX_NAME
+ where rc.RDB$CONSTRAINT_TYPE = 'PRIMARY KEY'
+ and (trim(rc.RDB$RELATION_NAME)) = ('{table}')
+ order by sg.RDB$FIELD_POSITION
+
+ indexes: |
+ select
+ trim(i.RDB$INDEX_NAME) as index_name,
+ trim(sg.RDB$FIELD_NAME) as column_name
+ from RDB$INDICES i
+ join RDB$INDEX_SEGMENTS sg on i.RDB$INDEX_NAME = sg.RDB$INDEX_NAME
+ where i.RDB$SYSTEM_FLAG = 0
+ and (trim(i.RDB$RELATION_NAME)) = ('{table}')
+ order by i.RDB$INDEX_NAME, sg.RDB$FIELD_POSITION
+
+ columns_full: |
+ select
+ 'main' as schema_name,
+ trim(rf.RDB$RELATION_NAME) as table_name,
+ trim(rf.RDB$FIELD_NAME) as column_name,
+ case f.RDB$FIELD_TYPE
+ when 7 then 'SMALLINT'
+ when 8 then 'INTEGER'
+ when 10 then 'FLOAT'
+ when 12 then 'DATE'
+ when 13 then 'TIME'
+ when 14 then 'CHAR(' || f.RDB$FIELD_LENGTH || ')'
+ when 16 then
+ case f.RDB$FIELD_SUB_TYPE
+ when 0 then 'BIGINT'
+ when 1 then 'NUMERIC(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')'
+ when 2 then 'DECIMAL(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')'
+ end
+ when 27 then 'DOUBLE PRECISION'
+ when 35 then 'TIMESTAMP'
+ when 37 then 'VARCHAR(' || f.RDB$FIELD_LENGTH || ')'
+ when 40 then 'CSTRING(' || f.RDB$FIELD_LENGTH || ')'
+ when 261 then 'BLOB'
+ else 'UNKNOWN'
+ end as data_type,
+ rf.RDB$FIELD_POSITION + 1 as position
+ from RDB$RELATION_FIELDS rf
+ join RDB$FIELDS f on rf.RDB$FIELD_SOURCE = f.RDB$FIELD_NAME
+ join RDB$RELATIONS r on rf.RDB$RELATION_NAME = r.RDB$RELATION_NAME
+ where r.RDB$SYSTEM_FLAG = 0
+ {{if .table -}} and (trim(rf.RDB$RELATION_NAME)) = ('{table}') {{- end}}
+ order by rf.RDB$RELATION_NAME, rf.RDB$FIELD_POSITION
+
+ schemata: |
+ select
+ 'main' as schema_name,
+ trim(rf.RDB$RELATION_NAME) as table_name,
+ case
+ when r.RDB$VIEW_BLR is null then false
+ else true
+ end as is_view,
+ trim(rf.RDB$FIELD_NAME) as column_name,
+ case f.RDB$FIELD_TYPE
+ when 7 then 'SMALLINT'
+ when 8 then 'INTEGER'
+ when 10 then 'FLOAT'
+ when 12 then 'DATE'
+ when 13 then 'TIME'
+ when 14 then 'CHAR(' || f.RDB$FIELD_LENGTH || ')'
+ when 16 then
+ case f.RDB$FIELD_SUB_TYPE
+ when 0 then 'BIGINT'
+ when 1 then 'NUMERIC(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')'
+ when 2 then 'DECIMAL(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')'
+ end
+ when 27 then 'DOUBLE PRECISION'
+ when 35 then 'TIMESTAMP'
+ when 37 then 'VARCHAR(' || f.RDB$FIELD_LENGTH || ')'
+ when 40 then 'CSTRING(' || f.RDB$FIELD_LENGTH || ')'
+ when 261 then 'BLOB'
+ else 'UNKNOWN'
+ end as data_type,
+ rf.RDB$FIELD_POSITION + 1 as position
+ from RDB$RELATION_FIELDS rf
+ join RDB$FIELDS f on rf.RDB$FIELD_SOURCE = f.RDB$FIELD_NAME
+ join RDB$RELATIONS r on rf.RDB$RELATION_NAME = r.RDB$RELATION_NAME
+ where r.RDB$SYSTEM_FLAG = 0
+ {{if .tables -}} and trim(rf.RDB$RELATION_NAME) in ({tables}) {{- end}}
+ order by rf.RDB$RELATION_NAME, rf.RDB$FIELD_POSITION
+
+ row_count_estimates: |
+ select
+ 'main' as schema_name,
+ trim(r.RDB$RELATION_NAME) as table_name,
+ 0 as count
+ from RDB$RELATIONS r
+ where r.RDB$SYSTEM_FLAG = 0
+ and r.RDB$VIEW_BLR is null
+ {{if .table -}} and trim(r.RDB$RELATION_NAME) = '{table}' {{- end}}
+
+ ddl_table: |
+ execute block
+ returns (ddl varchar(8000))
+ as
+ declare variable field_name varchar(31);
+ declare variable field_type varchar(100);
+ declare variable field_null varchar(10);
+ declare variable first_field boolean = true;
+ begin
+ ddl = 'CREATE TABLE {table} (';
+
+ for select
+ trim(rf.RDB$FIELD_NAME),
+ case f.RDB$FIELD_TYPE
+ when 7 then 'SMALLINT'
+ when 8 then 'INTEGER'
+ when 10 then 'FLOAT'
+ when 12 then 'DATE'
+ when 13 then 'TIME'
+ when 14 then 'CHAR(' || f.RDB$FIELD_LENGTH || ')'
+ when 16 then
+ case f.RDB$FIELD_SUB_TYPE
+ when 0 then 'BIGINT'
+ when 1 then 'NUMERIC(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')'
+ when 2 then 'DECIMAL(' || f.RDB$FIELD_PRECISION || ',' || abs(f.RDB$FIELD_SCALE) || ')'
+ end
+ when 27 then 'DOUBLE PRECISION'
+ when 35 then 'TIMESTAMP'
+ when 37 then 'VARCHAR(' || f.RDB$FIELD_LENGTH || ')'
+ when 40 then 'CSTRING(' || f.RDB$FIELD_LENGTH || ')'
+ when 261 then 'BLOB'
+ else 'UNKNOWN'
+ end,
+ case when rf.RDB$NULL_FLAG = 1 then 'NOT NULL' else '' end
+ from RDB$RELATION_FIELDS rf
+ join RDB$FIELDS f on rf.RDB$FIELD_SOURCE = f.RDB$FIELD_NAME
+ where (trim(rf.RDB$RELATION_NAME)) = ('{table}')
+ order by rf.RDB$FIELD_POSITION
+ into :field_name, :field_type, :field_null
+ do
+ begin
+ if (not first_field) then
+ ddl = ddl || ', ';
+ ddl = ddl || field_name || ' ' || field_type || ' ' || field_null;
+ first_field = false;
+ end
+
+ ddl = ddl || ')';
+ suspend;
+ end
+
+ ddl_view: |
+ select RDB$VIEW_SOURCE as ddl
+ from RDB$RELATIONS
+ where (trim(RDB$RELATION_NAME)) = ('{table}')
+
+ sessions: |
+ select
+ a.MON$ATTACHMENT_ID as pid,
+ a.MON$USER as username,
+ a.MON$REMOTE_ADDRESS as client_addr,
+ a.MON$STATE as state,
+ s.MON$SQL_TEXT as query
+ from MON$ATTACHMENTS a
+ left join MON$STATEMENTS s on a.MON$ATTACHMENT_ID = s.MON$ATTACHMENT_ID
+ where a.MON$ATTACHMENT_ID <> current_connection
+
+ session_terminate: delete from MON$ATTACHMENTS where MON$ATTACHMENT_ID = {pid}
+
+analysis:
+ field_chars: |
+ select
+ 'main' as schema_nm,
+ '{table}' as table_nm,
+ '{field}' as field,
+ sum(case when {field} containing ascii_char(10) then 1 else 0 end) as cnt_nline,
+ sum(case when {field} containing ascii_char(9) then 1 else 0 end) as cnt_tab,
+ sum(case when {field} containing ',' then 1 else 0 end) as cnt_comma,
+ sum(case when {field} containing '"' then 1 else 0 end) as cnt_dquote,
+ min(char_length({field})) as f_min_len,
+ max(char_length({field})) as f_max_len
+ from {table}
+
+ field_stat_len: |
+ select
+ 'main' as schema_nm,
+ '{table}' as table_nm,
+ '{field}' as field,
+ count(*) as tot_cnt,
+ min(char_length(cast({field} as varchar(8000)))) as f_min_len,
+ max(char_length(cast({field} as varchar(8000)))) as f_max_len
+ from {table}
+
+ field_stat_deep: |
+ select
+ 'main' as schema_nm,
+ '{table}' as table_nm,
+ '{field}' as field,
+ count(*) as tot_cnt,
+ count({field}) as f_cnt,
+ count(*) - count({field}) as f_null_cnt,
+ cast(100.0 * (count(*) - count({field})) / count(*) as numeric(5,1)) as f_null_prct,
+ count(distinct {field}) as f_dstct_cnt,
+ cast(100.0 * count(distinct {field}) / count(*) as numeric(5,1)) as f_dstct_prct,
+ count(*) - count(distinct {field}) as f_dup_cnt,
+ cast(min({field}) as varchar(255)) as f_min,
+ cast(max({field}) as varchar(255)) as f_max,
+ min(char_length(cast({field} as varchar(8000)))) as f_min_len,
+ max(char_length(cast({field} as varchar(8000)))) as f_max_len
+ from {table}
+
+ distro_field: |
+ select first 1000
+ '{table}' as table_nm,
+ '{field}' as field,
+ {field} as value,
+ count(*) as cnt,
+ cast(100.0 * count(*) / (select count(*) from {table}) as numeric(5,2)) as prct
+ from {table}
+ group by {field}
+ order by count(*) desc
+
+ distro_field_group: |
+ select first 1000
+ '{table}' as table_nm,
+ '{field}' as field,
+ {group_expr} as group_exp,
+ {field} as value,
+ count(*) as cnt,
+ cast(100.0 * count(*) / (select count(*) from {table}) as numeric(5,2)) as prct
+ from {table}
+ group by {field}, {group_expr}
+ order by count(*) desc
+
+ distro_field_date: |
+ select
+ 'main' as schema_nm,
+ '{table}' as table_nm,
+ '{field}' as field,
+ extract(year from {field}) as year,
+ extract(month from {field}) as month,
+ extract(day from {field}) as day,
+ count(*) as cnt,
+ cast(100.0 * count(*) / (select count(*) from {table}) as numeric(5,2)) as prct
+ from {table}
+ group by 1, 2, 3, 4, 5, 6
+ order by 4, 5, 6
+
+function:
+ truncate_f: cast({field} as integer)
+ truncate_datef: cast({field} as date)
+ string_type: varchar(8000)
+ cast_to_string: 'cast({field} as varchar(8000))'
+ cast_to_text: 'cast({field} as varchar(8000))'
+ date_to_int: datediff(day, date '1900-01-01', {field})
+ number_to_int: cast({field} as integer)
+ sleep: -- firebird doesn't have a sleep function
+ checksum_datetime: cast(datediff(second, timestamp '1970-01-01 00:00:00', {field}) * 1000000 as bigint)
+ checksum_string: char_length({field})
+ checksum_boolean: char_length(cast({field} as varchar(10)))
+ checksum_json: char_length(replace({field}, ' ', ''))
+ now: current_timestamp
+
+variable:
+ tmp_folder: /tmp
+ bind_string: '?'
+ error_filter_table_exists: already exists
+ max_string_type: varchar(8000)
+ max_string_length: 8000
+ max_column_length: 31
+
+native_type_map:
+ smallint: smallint
+ short: smallint
+ long: integer
+ integer: integer
+ bigint: bigint
+ float: float
+ double precision: float
+ numeric: decimal
+ decimal: decimal
+ date: date
+ time: time
+ timestamp: timestamp
+ char: string
+ varchar: text
+ blob: text
+ boolean: bool
+ varying: text
+
+general_type_map:
+ bigint: bigint
+ binary: blob
+ bool: boolean
+ date: date
+ datetime: timestamp
+ decimal: "decimal(,)"
+ integer: integer
+ json: varchar(8000)
+ smallint: smallint
+ string: "varchar()"
+ text: varchar(8000)
+ timestamp: timestamp
+ timestampz: timestamp with time zone
+ float: double precision
+ time: time
+ timez: time
+ uuid: varchar(36)
\ No newline at end of file
diff --git a/core/dbio/templates/mariadb.yaml b/core/dbio/templates/mariadb.yaml
index cdf61e684..d783be6f1 100644
--- a/core/dbio/templates/mariadb.yaml
+++ b/core/dbio/templates/mariadb.yaml
@@ -9,6 +9,15 @@ core:
alter_columns: alter table {table} modify {col_ddl}
modify_column: '{column} {type}'
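+ # 'Reader::{handler_name}' streams CSV rows from an io.Reader registered with the Go MySQL driver (RegisterReaderHandler)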
+ load_data_local_reader: |
+ LOAD DATA LOCAL INFILE 'Reader::{handler_name}'
+ INTO TABLE {table}
+ FIELDS TERMINATED BY ','
+ OPTIONALLY ENCLOSED BY '"'
+ ESCAPED BY '\\'
+ LINES TERMINATED BY '\n'
+ IGNORE 1 LINES
+
metadata:
current_database: select database() as name from dual
diff --git a/core/dbio/templates/mysql.yaml b/core/dbio/templates/mysql.yaml
index e1e81ed3a..3ee0cf3a8 100755
--- a/core/dbio/templates/mysql.yaml
+++ b/core/dbio/templates/mysql.yaml
@@ -9,6 +9,15 @@ core:
alter_columns: alter table {table} modify {col_ddl}
modify_column: '{column} {type}'
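+ # 'Reader::{handler_name}' streams CSV rows from an io.Reader registered with the Go MySQL driver (RegisterReaderHandler)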
+ load_data_local_reader: |
+ LOAD DATA LOCAL INFILE 'Reader::{handler_name}'
+ INTO TABLE {table}
+ FIELDS TERMINATED BY ','
+ OPTIONALLY ENCLOSED BY '"'
+ ESCAPED BY '\\'
+ LINES TERMINATED BY '\n'
+ IGNORE 1 LINES
+
metadata:
current_database: select database() as name from dual
diff --git a/core/dbio/templates/postgres.yaml b/core/dbio/templates/postgres.yaml
index ff2a99214..7c3b9668d 100755
--- a/core/dbio/templates/postgres.yaml
+++ b/core/dbio/templates/postgres.yaml
@@ -14,8 +14,8 @@ core:
update {table} as t1 set {set_fields2}
from (select * from {temp_table}) as t2
where {pk_fields_equal}
- insert: insert into {table} ({cols}) values ({values})
- insert_temp: insert into {table} ({cols}) select {cols} from {temp_table}
+ insert: insert into {table} ({fields}) values ({values})
+ insert_temp: insert into {table} ({fields}) select {cols} from {temp_table}
insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing
insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing
update_temp: |
diff --git a/core/dbio/templates/redshift.yaml b/core/dbio/templates/redshift.yaml
index 3ae76dd54..76ec0be8a 100755
--- a/core/dbio/templates/redshift.yaml
+++ b/core/dbio/templates/redshift.yaml
@@ -26,7 +26,7 @@ core:
# optimize_column: |
# alter table {table} rename to {table_old};
# create table {table} ( {col_ddl} );
- # insert into {table} ({cols})
+ # insert into {table} ({fields})
# select {cols}
# from {table_old};
# drop table {table_old};
diff --git a/core/dbio/templates/sqlserver.yaml b/core/dbio/templates/sqlserver.yaml
index 62d883e54..bbee555bc 100755
--- a/core/dbio/templates/sqlserver.yaml
+++ b/core/dbio/templates/sqlserver.yaml
@@ -21,8 +21,8 @@ core:
incremental_select_limit: select top {limit} {fields} from {table} where ({incremental_where_cond}){where_and} order by {update_key} asc
incremental_select_limit_offset: select top {limit} * from ( select {fields} from {table} where ({incremental_where_cond}){where_and} order by {update_key} asc offset {offset} rows) as t
incremental_select: select {fields} from {table} where ({incremental_where_cond}){where_and}
- insert: insert into {table} ({cols}) values ({values})
- insert_temp: insert into {table} ({cols}) select {cols} from {temp_table}
+ insert: insert into {table} ({fields}) values ({values})
+ insert_temp: insert into {table} ({fields}) select {cols} from {temp_table}
insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing
insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing
update_temp: |
diff --git a/core/dbio/templates/trino.yaml b/core/dbio/templates/trino.yaml
index 704d16b7b..f02fb8260 100755
--- a/core/dbio/templates/trino.yaml
+++ b/core/dbio/templates/trino.yaml
@@ -11,13 +11,13 @@ core:
update {table} as t1 set {set_fields2}
from (select * from {temp_table}) as t2
where {pk_fields_equal}
- insert: insert into {table} ({cols}) values ({values})
+ insert: insert into {table} ({fields}) values ({values})
limit: select {fields} from {table} offset {offset} limit {limit}
limit_sql: |
select * from (
{sql}
) as t offset {offset} limit {limit}
- insert_temp: insert into {table} ({cols}) select {cols} from {temp_table}
+ insert_temp: insert into {table} ({fields}) select {cols} from {temp_table}
insert_ignore: insert into {table} ({fields}) values ({values}) on conflict ({pk_fields}) do nothing
insert_ignore_temp: insert into {table} ({names}) select {names} from {temp_table} on conflict ({pk_fields}) do nothing
update_temp: |
diff --git a/core/env/env.go b/core/env/env.go
index b81c2bad7..0621f41d7 100755
--- a/core/env/env.go
+++ b/core/env/env.go
@@ -19,6 +19,7 @@ import (
)
var (
+ Marker = "Sling CLI | https://slingdata.io"
HomeDir = os.Getenv("SLING_HOME_DIR")
HomeDirEnvFile = ""
Env = &EnvFile{}
@@ -47,6 +48,26 @@ var (
return path.Join(RuntimeFolder(), g.F("%s.json", name))
}
setupOtel = func() {}
+
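+ // ReservedFields defines the reserved column names Sling uses for its metadata columns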
+ ReservedFields = struct {
+ LoadedAt string
+ SyncedAt string
+ SyncedOp string
+ DeletedAt string
+ StreamURL string
+ RowNum string
+ RowID string
+ ExecID string
+ }{
+ LoadedAt: "_sling_loaded_at",
+ SyncedAt: "_sling_synced_at",
+ SyncedOp: "_sling_synced_op",
+ DeletedAt: "_sling_deleted_at",
+ StreamURL: "_sling_stream_url",
+ RowNum: "_sling_row_num",
+ RowID: "_sling_row_id",
+ ExecID: "_sling_exec_id",
+ }
)
const (
diff --git a/core/sling/config.go b/core/sling/config.go
index 8bdd7d4f6..28088c989 100644
--- a/core/sling/config.go
+++ b/core/sling/config.go
@@ -15,6 +15,7 @@ import (
"github.com/slingdata-io/sling-cli/core/dbio/connection"
"github.com/slingdata-io/sling-cli/core/dbio/database"
"github.com/slingdata-io/sling-cli/core/dbio/filesys"
+ "github.com/slingdata-io/sling-cli/core/env"
"github.com/spf13/cast"
"github.com/flarco/g"
@@ -41,6 +42,8 @@ const (
SnapshotMode Mode = "snapshot"
// BackfillMode is to backfill
BackfillMode Mode = "backfill"
+ // DefinitionOnlyMode is to create table/file definition without data
+ DefinitionOnlyMode Mode = "definition-only"
)
var AllMode = []struct {
@@ -52,6 +55,7 @@ var AllMode = []struct {
{TruncateMode, "TruncateMode"},
{SnapshotMode, "SnapshotMode"},
{BackfillMode, "BackfillMode"},
+ {DefinitionOnlyMode, "DefinitionOnlyMode"},
}
// NewConfig return a config object from a YAML / JSON string
@@ -158,7 +162,14 @@ func (cfg *Config) SetDefault() {
cfg.Mode = FullRefreshMode
}
- if val := os.Getenv("SLING_LOADED_AT_COLUMN"); val != "" {
+ if val := os.Getenv("SLING_SYNCED_AT_COLUMN"); val != "" {
+ if cast.ToBool(val) {
+ cfg.MetadataSyncedAt = g.Bool(true)
env.ReservedFields.DeletedAt = env.ReservedFields.SyncedAt // deleted_at becomes synced_at
+ } else {
+ cfg.MetadataSyncedAt = g.Bool(false)
+ }
+ } else if val := os.Getenv("SLING_LOADED_AT_COLUMN"); val != "" {
if cast.ToBool(val) || val == "unix" || val == "timestamp" {
cfg.MetadataLoadedAt = g.Bool(true)
} else {
@@ -339,9 +350,9 @@ func (cfg *Config) DetermineType() (Type JobType, err error) {
}
}
- validMode := g.In(cfg.Mode, FullRefreshMode, IncrementalMode, BackfillMode, SnapshotMode, TruncateMode)
+ validMode := g.In(cfg.Mode, FullRefreshMode, IncrementalMode, BackfillMode, SnapshotMode, TruncateMode, DefinitionOnlyMode)
if !validMode {
- err = g.Error("must specify valid mode: full-refresh, incremental, backfill, snapshot or truncate")
+ err = g.Error("must specify valid mode: full-refresh, incremental, backfill, snapshot, truncate, or definition-only")
return
}
@@ -359,9 +370,11 @@ func (cfg *Config) DetermineType() (Type JobType, err error) {
// OK, no need for update key
} else if srcApiProvided {
// OK, no need for update key/pk, API uses SLING_STATE for tracking
- } else if srcFileProvided && cfg.Source.UpdateKey == slingLoadedAtColumn {
+ } else if srcFileProvided && cfg.Source.UpdateKey == env.ReservedFields.LoadedAt {
// need the loaded_at column for file incremental
cfg.MetadataLoadedAt = g.Bool(true)
+ } else if srcFileProvided && cfg.Source.UpdateKey == env.ReservedFields.SyncedAt {
+ cfg.MetadataSyncedAt = g.Bool(true)
} else if cfg.Source.UpdateKey == "" && len(cfg.Source.PrimaryKey()) == 0 {
err = g.Error("must specify value for 'update_key' and/or 'primary_key' for incremental mode. See docs for more details: https://docs.slingdata.io/sling-cli/run/configuration")
if args := os.Getenv("SLING_CLI_ARGS"); strings.Contains(args, "-src-conn") || strings.Contains(args, "-tgt-conn") {
@@ -386,6 +399,15 @@ func (cfg *Config) DetermineType() (Type JobType, err error) {
}
} else if cfg.Mode == SnapshotMode {
cfg.MetadataLoadedAt = g.Bool(true) // needed for snapshot mode
+ } else if cfg.Mode == DefinitionOnlyMode {
+ // For file targets, only parquet and arrow formats are supported
+ if tgtFileProvided {
+ format := cfg.Target.ObjectFileFormat()
+ if !g.In(format, dbio.FileTypeParquet, dbio.FileTypeArrow) {
+ err = g.Error("definition-only mode for file targets only supports parquet or arrow formats, got: %s", format)
+ return
+ }
+ }
}
if srcDbProvided && tgtDbProvided {
@@ -860,8 +882,10 @@ func (cfg *Config) FormatTargetObjectName() (err error) {
cfg.Target.Object = strings.TrimSpace(renderedObject)
if cfg.TgtConn.Type.IsDb() {
+ dbType := cfg.TgtConn.GetType()
+
// normalize casing of object names
- table, err := database.ParseTableName(cfg.Target.Object, cfg.TgtConn.Type)
+ table, err := database.ParseTableName(cfg.Target.Object, dbType)
if err != nil {
return g.Error(err, "could not parse target table name")
} else if table.IsQuery() {
@@ -875,7 +899,7 @@ func (cfg *Config) FormatTargetObjectName() (err error) {
if tgtOpts := cfg.Target.Options; tgtOpts != nil {
tgtOpts.TableTmp = strings.TrimSpace(g.Rm(tgtOpts.TableTmp, m))
if tgtOpts.TableTmp != "" {
- tableTmp, err := database.ParseTableName(tgtOpts.TableTmp, cfg.TgtConn.Type)
+ tableTmp, err := database.ParseTableName(tgtOpts.TableTmp, dbType)
if err != nil {
return g.Error(err, "could not parse temp table name")
} else if tableTmp.Schema == "" {
@@ -885,17 +909,17 @@ func (cfg *Config) FormatTargetObjectName() (err error) {
}
}
- if cfg.TgtConn.Type.DBNameUpperCase() {
+ if dbType.DBNameUpperCase() {
tableTmp.Name = strings.ToUpper(tableTmp.Name)
}
tgtOpts.TableTmp = tableTmp.FullName()
- } else if g.In(cfg.TgtConn.Type, dbio.TypeDbDuckDb, dbio.TypeDbDuckLake) {
+ } else if g.In(dbType, dbio.TypeDbDuckDb, dbio.TypeDbDuckLake) {
// for duckdb and ducklake, we'll use a temp table, which uses the 'main' schema
- tableTmp := makeTempTableName(cfg.TgtConn.Type, table, "_sling_duckdb_tmp")
+ tableTmp := makeTempTableName(dbType, table, "_sling_duckdb_tmp")
tableTmp.Schema = "main"
tgtOpts.TableTmp = tableTmp.FullName()
} else {
- tableTmp := makeTempTableName(cfg.TgtConn.Type, table, "_tmp")
+ tableTmp := makeTempTableName(dbType, table, "_tmp")
tgtOpts.TableTmp = tableTmp.FullName()
}
}
@@ -944,8 +968,13 @@ func (cfg *Config) GetFormatMap() (m map[string]any, err error) {
m["target_name"] = strings.ToLower(cfg.Target.Conn)
}
- if cfg.ReplicationStream != nil && cfg.ReplicationStream.ID != "" {
- m["stream_run_id"] = cfg.ReplicationStream.ID
+ if cfg.ReplicationStream != nil {
+ if cfg.ReplicationStream.ID != "" {
+ m["stream_run_id"] = cfg.ReplicationStream.ID
+ }
+ if cfg.ReplicationStream.Description != "" {
+ m["stream_description"] = cfg.ReplicationStream.Description
+ }
}
if cfg.SrcConn.Type.IsDb() {
@@ -1227,6 +1256,7 @@ type Config struct {
IncrementalGTE bool `json:"incremental_gte,omitempty" yaml:"incremental_gte,omitempty"`
MetadataLoadedAt *bool `json:"-" yaml:"-"`
+ MetadataSyncedAt *bool `json:"-" yaml:"-"`
MetadataStreamURL bool `json:"-" yaml:"-"`
MetadataRowNum bool `json:"-" yaml:"-"`
MetadataRowID bool `json:"-" yaml:"-"`
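
For clarity, the precedence introduced in SetDefault is: SLING_SYNCED_AT_COLUMN wins over SLING_LOADED_AT_COLUMN, and only the latter accepts the "unix"/"timestamp" values. A standalone sketch of that decision follows; it mirrors the logic above but is not the actual implementation, and the function name is hypothetical.

package main

import (
	"fmt"
	"os"

	"github.com/spf13/cast"
)

// metadataFlags mirrors the env-var precedence added in Config.SetDefault.
// Sketch for illustration only.
func metadataFlags() (syncedAt, loadedAt bool) {
	if val := os.Getenv("SLING_SYNCED_AT_COLUMN"); val != "" {
		syncedAt = cast.ToBool(val) // truthy enables _sling_synced_at / _sling_synced_op
	} else if val := os.Getenv("SLING_LOADED_AT_COLUMN"); val != "" {
		// truthy, "unix" or "timestamp" enables _sling_loaded_at
		loadedAt = cast.ToBool(val) || val == "unix" || val == "timestamp"
	}
	return
}

func main() {
	s, l := metadataFlags()
	fmt.Println("synced_at:", s, "loaded_at:", l)
}
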
diff --git a/core/sling/task.go b/core/sling/task.go
index 53411ea3a..2057e03e5 100644
--- a/core/sling/task.go
+++ b/core/sling/task.go
@@ -341,29 +341,34 @@ func (t *TaskExecution) GetRate(secWindow int) (rowRate, byteRate int64) {
}
func (t *TaskExecution) setGetMetadata() (metadata iop.Metadata) {
- if t.Config.MetadataLoadedAt != nil && *t.Config.MetadataLoadedAt {
- metadata.LoadedAt.Key = slingLoadedAtColumn
+ if t.Config.MetadataSyncedAt != nil && *t.Config.MetadataSyncedAt {
+ metadata.SyncedAt.Key = env.ReservedFields.SyncedAt
+ metadata.SyncedAt.Value = *t.StartTime // only timestamp
+ metadata.SyncedOp.Key = env.ReservedFields.SyncedOp
+ metadata.SyncedOp.Value = "I" // default to insert operation
+ } else if t.Config.MetadataLoadedAt != nil && *t.Config.MetadataLoadedAt {
+ metadata.SyncedAt.Key = env.ReservedFields.LoadedAt
if os.Getenv("SLING_LOADED_AT_COLUMN") == "timestamp" {
- metadata.LoadedAt.Value = *t.StartTime
+ metadata.SyncedAt.Value = *t.StartTime
} else {
- metadata.LoadedAt.Value = t.StartTime.Unix()
+ metadata.SyncedAt.Value = t.StartTime.Unix()
}
}
if t.Config.MetadataStreamURL {
- metadata.StreamURL.Key = slingStreamURLColumn
+ metadata.StreamURL.Key = env.ReservedFields.StreamURL
}
if t.Config.MetadataRowID {
- metadata.RowID.Key = slingRowIDColumn
+ metadata.RowID.Key = env.ReservedFields.RowID
}
if t.Config.MetadataExecID {
- metadata.ExecID.Key = slingExecIDColumn
+ metadata.ExecID.Key = env.ReservedFields.ExecID
metadata.ExecID.Value = t.ExecID
}
if t.Config.MetadataRowNum {
- metadata.RowNum.Key = slingRowNumColumn
+ metadata.RowNum.Key = env.ReservedFields.RowNum
}
// StarRocks: add _sling_row_id column if there is no primary,
@@ -385,8 +390,8 @@ func (t *TaskExecution) setGetMetadata() (metadata iop.Metadata) {
}
if addRowIDCol {
- metadata.RowID.Key = slingRowIDColumn
- t.Config.Target.Options.TableKeys[iop.HashKey] = []string{slingRowIDColumn}
+ metadata.RowID.Key = env.ReservedFields.RowID
+ t.Config.Target.Options.TableKeys[iop.HashKey] = []string{env.ReservedFields.RowID}
}
}
@@ -539,7 +544,7 @@ func (t *TaskExecution) getSourceOptionsMap() (options map[string]any) {
}
// set target type for column casing, name length validation
- options["target_type"] = string(t.Config.TgtConn.Type)
+ options["target_type"] = string(t.Config.TgtConn.GetType())
return
}
@@ -568,7 +573,7 @@ func (t *TaskExecution) getTargetOptionsMap() (options map[string]any) {
}
// set target type for column casing, name length validation
- options["target_type"] = string(t.Config.TgtConn.Type)
+ options["target_type"] = string(t.Config.TgtConn.GetType())
// set to delete file/folder
options["delete_file"] = true
diff --git a/core/sling/task_run.go b/core/sling/task_run.go
index e9ab204f4..99d061197 100644
--- a/core/sling/task_run.go
+++ b/core/sling/task_run.go
@@ -25,13 +25,7 @@ import (
)
var (
- start time.Time
- slingLoadedAtColumn = "_sling_loaded_at"
- slingDeletedAtColumn = "_sling_deleted_at"
- slingStreamURLColumn = "_sling_stream_url"
- slingRowNumColumn = "_sling_row_num"
- slingRowIDColumn = "_sling_row_id"
- slingExecIDColumn = "_sling_exec_id"
+ start time.Time
)
var deleteMissing func(*TaskExecution, database.Connection, database.Connection) error = func(_ *TaskExecution, _, _ database.Connection) error {
@@ -483,7 +477,7 @@ func (t *TaskExecution) runFileToDB() (err error) {
t.Context.Map.Set("incremental_value", t.Config.IncrementalValStr)
} else if t.isIncrementalWithUpdateKey() && !t.Config.IsIncrementalWithRange() {
if t.Config.Source.UpdateKey == "." {
- t.Config.Source.UpdateKey = slingLoadedAtColumn
+ t.Config.Source.UpdateKey = env.ReservedFields.LoadedAt
}
t.SetProgress("getting checkpoint value (%s)", t.Config.Source.UpdateKey)
diff --git a/core/sling/task_run_read.go b/core/sling/task_run_read.go
index 5995e0c32..c38afaf6e 100644
--- a/core/sling/task_run_read.go
+++ b/core/sling/task_run_read.go
@@ -12,6 +12,7 @@ import (
"github.com/slingdata-io/sling-cli/core/dbio/database"
"github.com/slingdata-io/sling-cli/core/dbio/filesys"
"github.com/slingdata-io/sling-cli/core/dbio/iop"
+ "github.com/slingdata-io/sling-cli/core/env"
"github.com/spf13/cast"
)
@@ -45,9 +46,19 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df
if len(cfg.Source.Select) > 0 {
selectFields = lo.Map(cfg.Source.Select, func(f string, i int) string {
- // lookup column name
- col := sTable.Columns.GetColumn(srcConn.Unquote(f))
+ // Parse the expression to extract original column name
+ original, alias, isExclude, _ := iop.ParseSelectExpr(f)
+
+ if isExclude {
+ return f // Pass through exclusion as-is for later handling
+ }
+
+ // Look up the original column (for case correction)
+ col := sTable.Columns.GetColumn(srcConn.Unquote(original))
if col != nil {
+ if alias != "" {
+ return col.Name + " as " + alias
+ }
return col.Name
}
return f
@@ -63,9 +74,12 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df
}
includedCols := lo.Filter(sTable.Columns, func(c iop.Column, i int) bool {
+ colNameLower := strings.ToLower(c.Name)
for _, exField := range excluded {
exField = srcConn.Unquote(strings.TrimPrefix(exField, "-"))
- if strings.EqualFold(c.Name, exField) {
+ exFieldLower := strings.ToLower(exField)
+ // Use glob matching to support patterns like "address_*"
+ if iop.MatchesSelectGlob(colNameLower, exFieldLower) {
return false
}
}
@@ -156,8 +170,12 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df
)
sFields := lo.Map(selectFields, func(sf string, i int) string {
- col := sTable.Columns.GetColumn(srcConn.Unquote(sf))
+ original, alias, _, _ := iop.ParseSelectExpr(sf)
+ col := sTable.Columns.GetColumn(srcConn.Unquote(original))
if col != nil {
+ if alias != "" {
+ return srcConn.Quote(col.Name) + " as " + srcConn.Quote(alias)
+ }
return srcConn.Quote(col.Name) // apply quotes if match
}
return sf
@@ -207,8 +225,12 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df
// if {fields} placeholder is used, replace it with selected fields to avoid double wrapping
if strings.Contains(sTable.SQL, "{fields}") {
sFields := lo.Map(selectFields, func(sf string, i int) string {
- col := sTable.Columns.GetColumn(srcConn.Unquote(sf))
+ original, alias, _, _ := iop.ParseSelectExpr(sf)
+ col := sTable.Columns.GetColumn(srcConn.Unquote(original))
if col != nil {
+ if alias != "" {
+ return srcConn.Quote(col.Name) + " as " + srcConn.Quote(alias)
+ }
return srcConn.Quote(col.Name) // apply quotes if match
}
return sf
@@ -218,6 +240,11 @@ func (t *TaskExecution) ReadFromDB(cfg *Config, srcConn database.Connection) (df
selectFields = []string{"*"}
}
+ // For definition-only mode, inject WHERE 1=0 to avoid reading data
+ if cfg.Mode == DefinitionOnlyMode {
+ cfg.Source.Where = "1=0"
+ }
+
// construct select statement for selected fields or where condition
if len(selectFields) > 1 || selectFields[0] != "*" || cfg.Source.Where != "" || cfg.Source.Limit() > 0 {
if sTable.SQL != "" && !cfg.SrcConn.Type.IsNoSQL() && !strings.Contains(sTable.SQL, "{fields}") {
@@ -273,7 +300,7 @@ func (t *TaskExecution) ReadFromFile(cfg *Config) (df *iop.Dataflow, err error)
if t.Config.HasIncrementalVal() && !t.Config.IsFileStreamWithStateAndParts() {
// file stream incremental mode
- if t.Config.Source.UpdateKey == slingLoadedAtColumn {
+ if g.In(t.Config.Source.UpdateKey, env.ReservedFields.LoadedAt, env.ReservedFields.SyncedAt) {
options["SLING_FS_TIMESTAMP"] = t.Config.IncrementalValStr
g.Debug(`file stream using file_sys_timestamp=%#v and update_key=%s`, t.Config.IncrementalValStr, t.Config.Source.UpdateKey)
} else {
@@ -305,6 +332,12 @@ func (t *TaskExecution) ReadFromFile(cfg *Config) (df *iop.Dataflow, err error)
IncrementalValue: cfg.IncrementalValStr,
}
+ // schema-only read with a sample-size limit when definition-only
+ if cfg.Mode == DefinitionOnlyMode {
+ fsCfg.SchemaOnly = true
+ fsCfg.Limit = iop.SampleSize
+ }
+
// format the uri if it has placeholders
// determine uri if it has part fields, find first parent folder
if t.Config.IsFileStreamWithStateAndParts() {
@@ -418,6 +451,12 @@ func (t *TaskExecution) ReadFromApi(cfg *Config, srcConn *api.APIConnection) (df
Range: g.PtrVal(t.Config.Source.Options.Range),
DsConfigMap: t.getSourceOptionsMap(),
}
+
+ if cfg.Mode == DefinitionOnlyMode {
+ sCfg.SchemaOnly = true
+ sCfg.Limit = iop.SampleSize
+ }
+
df, err = srcConn.ReadDataflow(cfg.StreamName, sCfg)
if err != nil {
err = g.Error(err, "Could not ReadDataflow for %s", cfg.SrcConn.Type)
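
The select handling above now understands aliases ("col as alias") and glob exclusions ("-prefix_*") via iop.ParseSelectExpr and iop.MatchesSelectGlob. A rough sketch of how a select list is split into included and excluded entries, mirroring the calls used above; the sample list and helper name splitSelects are hypothetical, and edge cases are omitted.

package main

import (
	"fmt"

	"github.com/slingdata-io/sling-cli/core/dbio/iop"
)

// splitSelects separates plain/aliased columns from exclusion patterns,
// following the handling in ReadFromDB above. Sketch only.
func splitSelects(selects []string) (included map[string]string, excluded []string) {
	included = map[string]string{}
	for _, f := range selects {
		original, alias, isExclude, _ := iop.ParseSelectExpr(f)
		if isExclude {
			excluded = append(excluded, f) // matched per column via iop.MatchesSelectGlob
			continue
		}
		included[original] = alias // empty alias means no rename
	}
	return
}

func main() {
	inc, exc := splitSelects([]string{"id", "name as full_name", "-address_*"})
	fmt.Println(inc, exc)
}
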
diff --git a/core/sling/task_run_write.go b/core/sling/task_run_write.go
index f348a50c6..665a12728 100644
--- a/core/sling/task_run_write.go
+++ b/core/sling/task_run_write.go
@@ -29,7 +29,8 @@ func (t *TaskExecution) WriteToFile(cfg *Config, df *iop.Dataflow) (cnt uint64,
dateMap := iop.GetISO8601DateMap(time.Now())
cfg.TgtConn.Set(g.M("url", g.Rm(uri, dateMap)))
- if len(df.Buffer) == 0 && !cast.ToBool(os.Getenv("SLING_ALLOW_EMPTY")) {
+ // Skip empty buffer check for definition-only mode (we intentionally have 0 rows)
+ if len(df.Buffer) == 0 && cfg.Mode != DefinitionOnlyMode && !cast.ToBool(os.Getenv("SLING_ALLOW_EMPTY")) {
g.Warn("No data or records found in stream. Nothing to do. To allow Sling to create empty files, set SLING_ALLOW_EMPTY=TRUE")
return
}
@@ -205,6 +206,8 @@ func (t *TaskExecution) WriteToDb(cfg *Config, df *iop.Dataflow, tgtConn databas
if directInsert || writeDirectly {
if g.In(cfg.Mode, IncrementalMode, BackfillMode) && len(cfg.Source.PrimaryKey()) > 0 {
g.Warn("mode '%s' with a primary-key is not supported for direct write, falling back to using a temporary table.", cfg.Mode)
+ } else if cfg.Mode == DefinitionOnlyMode {
+ // continue as normal; definition-only mode is handled further below (no direct write)
} else {
return t.writeToDbDirectly(cfg, df, tgtConn)
}
@@ -245,6 +248,34 @@ func (t *TaskExecution) WriteToDb(cfg *Config, df *iop.Dataflow, tgtConn databas
return 0, err
}
+ // Handle definition-only mode: create final table and exit without data
+ if cfg.Mode == DefinitionOnlyMode {
+ setStage("5 - prepare-final")
+
+ // Set columns and keys on target table
+ targetTable.Columns = sampleData.Columns
+ if err := targetTable.SetKeys(cfg.Source.PrimaryKey(), cfg.Source.UpdateKey, cfg.Target.Options.TableKeys); err != nil {
+ err = g.Error(err, "could not set keys for "+targetTable.FullName())
+ return 0, err
+ }
+
+ // Drop existing table if it exists
+ if err := tgtConn.DropTable(targetTable.FullName()); err != nil {
+ g.Debug("could not drop existing table %s: %v", targetTable.FullName(), err)
+ }
+
+ // Create the final table with inferred schema
+ if err := createTable(t, tgtConn, targetTable, sampleData, false); err != nil {
+ err = g.Error(err, "could not create table "+targetTable.FullName())
+ return 0, err
+ }
+
+ df.Close()
+ t.SetProgress("created table definition %s with %d columns", targetTable.FullName(), len(sampleData.Columns))
+ setStage("6 - closing")
+ return 0, nil
+ }
+
// Set table keys
tableTmp.Columns = sampleData.Columns
if err := tableTmp.SetKeys(cfg.Source.PrimaryKey(), cfg.Source.UpdateKey, cfg.Target.Options.TableKeys); err != nil {
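
As a usage illustration of the new mode, a task config can request definition-only so that sling creates the target table (or parquet/arrow file) schema without moving rows. A hypothetical example via NewConfig; the connection and object names are placeholders.

package main

import (
	"fmt"

	"github.com/slingdata-io/sling-cli/core/sling"
)

func main() {
	// Hypothetical task config using the new definition-only mode.
	cfg, err := sling.NewConfig(`
source:
  conn: MY_SOURCE_DB
  stream: public.orders
target:
  conn: MY_TARGET_DB
  object: staging.orders
mode: definition-only
`)
	if err != nil {
		panic(err)
	}
	fmt.Println(cfg.Mode) // definition-only
}
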
diff --git a/core/sling/task_state.go b/core/sling/task_state.go
index 0f75eda24..7e6d5cb0b 100644
--- a/core/sling/task_state.go
+++ b/core/sling/task_state.go
@@ -2,6 +2,7 @@ package sling
import (
"os"
+ "path"
"time"
"github.com/flarco/g"
@@ -106,7 +107,8 @@ func (dts *DateTimeState) Update() {
type ExecutionState struct {
ID string `json:"id"`
- FilePath string `json:"string"`
+ FilePath string `json:"file_path"`
+ FileName string `json:"file_name"`
TotalBytes uint64 `json:"total_bytes"`
TotalRows uint64 `json:"total_rows"`
Status StatusMap `json:"status"`
@@ -161,6 +163,7 @@ type StreamState struct {
FileExt string `json:"file_ext,omitempty"`
FilePath string `json:"file_path,omitempty"`
Name string `json:"name,omitempty"`
+ Description string `json:"description,omitempty"`
Schema string `json:"schema,omitempty"`
SchemaLower string `json:"schema_lower,omitempty"`
SchemaUpper string `json:"schema_upper,omitempty"`
@@ -168,6 +171,7 @@ type StreamState struct {
TableLower string `json:"table_lower,omitempty"`
TableUpper string `json:"table_upper,omitempty"`
FullName string `json:"full_name,omitempty"`
+ ID string `json:"id,omitempty"`
}
type ObjectState struct {
@@ -193,6 +197,7 @@ func (t *TaskExecution) StateSet() {
}
state.Execution.FilePath = t.Config.Env["SLING_CONFIG_PATH"]
+ state.Execution.FileName = path.Base(state.Execution.FilePath)
fMap, _ := t.Config.GetFormatMap()
@@ -210,11 +215,13 @@ func (t *TaskExecution) StateSet() {
}
}
+ run.ID = runID
run.Stream.FileFolder = cast.ToString(fMap["stream_file_folder"])
run.Stream.FileName = cast.ToString(fMap["stream_file_name"])
run.Stream.FileExt = cast.ToString(fMap["stream_file_ext"])
run.Stream.FilePath = cast.ToString(fMap["stream_file_path"])
run.Stream.Name = cast.ToString(fMap["stream_name"])
+ run.Stream.Description = cast.ToString(fMap["stream_description"])
run.Stream.FullName = cast.ToString(fMap["stream_full_name"])
run.Stream.Schema = cast.ToString(fMap["stream_schema"])
run.Stream.SchemaLower = cast.ToString(fMap["stream_schema_lower"])
@@ -222,6 +229,7 @@ func (t *TaskExecution) StateSet() {
run.Stream.Table = cast.ToString(fMap["stream_table"])
run.Stream.TableLower = cast.ToString(fMap["stream_table_lower"])
run.Stream.TableUpper = cast.ToString(fMap["stream_table_upper"])
+ run.Stream.ID = t.Config.StreamID()
run.Object.Name = cast.ToString(fMap["object_name"])
run.Object.FullName = cast.ToString(fMap["object_full_name"])
diff --git a/core/version.go b/core/version.go
index 121cb18dc..9c5fc4bec 100755
--- a/core/version.go
+++ b/core/version.go
@@ -22,7 +22,7 @@ var TelProps = g.M(
func init() {
// dev build version is in format => 1.2.2.dev/2024-08-20
parts := strings.Split(Version, "/")
- if len(parts) != 2 {
+ if len(parts) != 2 || os.Getenv("SLING_AGENT_ID") != "" {
return
}
diff --git a/go.mod b/go.mod
index bc8bccbcd..afcde7928 100644
--- a/go.mod
+++ b/go.mod
@@ -76,7 +76,7 @@ require (
github.com/shirou/gopsutil/v3 v3.24.4
github.com/shopspring/decimal v1.4.0
github.com/sijms/go-ora/v2 v2.8.24
- github.com/slingdata-io/godbc v0.0.3
+ github.com/slingdata-io/godbc v0.0.4
github.com/slingdata-io/sling v0.0.0-20240426022644-3c31b1eb088e
github.com/snowflakedb/gosnowflake v1.17.1
github.com/spf13/cast v1.7.1
diff --git a/go.sum b/go.sum
index 4b3ae28ce..2a2d0884a 100644
--- a/go.sum
+++ b/go.sum
@@ -341,6 +341,8 @@ github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM=
github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
+github.com/fergusstrange/embedded-postgres v1.31.0 h1:JmRxw2BcPRcU141nOEuGXbIU6jsh437cBB40rmftZSk=
+github.com/fergusstrange/embedded-postgres v1.31.0/go.mod h1:w0YvnCgf19o6tskInrOOACtnqfVlOvluz3hlNLY7tRk=
github.com/flarco/bigquery v0.0.9 h1:WfxO6XuuHZTJV+55Bq24FhdHYpmAOzgVk9xOcJpEecY=
github.com/flarco/bigquery v0.0.9/go.mod h1:IpRSw4quaXxHjFyDSXUo7B6v+XcNF2pSmnNfeqXa/gM=
github.com/flarco/databricks-sql-go v0.0.0-20250613120556-51f7c1f3b4ad h1:z5mgsXmNXsgskClg/s6zelILFihJTyK6x7+zX1jUgyU=
@@ -944,8 +946,8 @@ github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966 h1:JIAuq3EE
github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog=
github.com/slingdata-io/arrow-adbc/go/adbc v0.0.0-20260105180115-72bb86fc9587 h1:IRiAvb/AGt2qii99UwF8QvhBAs8H+/Udu2GgVWFg3ys=
github.com/slingdata-io/arrow-adbc/go/adbc v0.0.0-20260105180115-72bb86fc9587/go.mod h1:FxeVw2tiKoUgPdOOXt4pwi2BuuQ17Cop7vnJTQL/z2s=
-github.com/slingdata-io/godbc v0.0.3 h1:yj+Z/E6Hud2a/XKNIxhx3qggHSHpt70WfesgkWn6rj8=
-github.com/slingdata-io/godbc v0.0.3/go.mod h1:I7r2EZl10tyzeXMq4WLwGyLqD0ZlEtJXOsOSbAl+iLI=
+github.com/slingdata-io/godbc v0.0.4 h1:W39fdWNpBms6PItrSEfocedK1SaGYnofkeXW/xRXsYA=
+github.com/slingdata-io/godbc v0.0.4/go.mod h1:I7r2EZl10tyzeXMq4WLwGyLqD0ZlEtJXOsOSbAl+iLI=
github.com/slingdata-io/pocketbase v0.22.136 h1:RtAvPvYdK0qm9EB1r8GzNeEfSiqDK+tV8jyxwbpKKBA=
github.com/slingdata-io/pocketbase v0.22.136/go.mod h1:RYAdoMZtW+3OIgKqg+YhgWGIiwjtcBHGxRcVF2+1klA=
github.com/snowflakedb/gosnowflake v1.17.1 h1:sBYExPDRv6hHF7fCqeXMT745L326Byw/cROxvCiEJzo=
@@ -1069,6 +1071,8 @@ github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17
github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y=
github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc=
github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
+github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo=
+github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos=
github.com/xo/dburl v0.3.0 h1:KGkeJB/oQhY/DeeJoYl/1+pNE/JnF6ouAuA8nzpQEQ8=
github.com/xo/dburl v0.3.0/go.mod h1:TM8VMBT+LWqC3MBOulZjb8FAthcvZq0t/qvDLwS6skU=
github.com/xo/terminfo v0.0.0-20210125001918-ca9a967f8778/go.mod h1:2MuV+tbUrU1zIOPMxZ5EncGwgmMJsa+9ucAQZXxsObs=
diff --git a/justfile b/justfile
index 17037057b..020e73911 100644
--- a/justfile
+++ b/justfile
@@ -30,16 +30,16 @@ test-replication-defaults:
cd cmd/sling && go test -v -run 'TestReplicationDefaults' && cd -
# Test file connections
-test-connections-file arg1="TestSuiteFile":
+test-connections-file arg1="TestSuiteFile" arg2="" arg3="":
#!/usr/bin/env bash
- echo "TESTING file connections {{arg1}}"
- cd cmd/sling && go test -v -parallel 3 -run "{{arg1}}" && cd -
+ echo "TESTING file connections {{arg1}} {{arg2}}"
+ cd cmd/sling && go test -v -parallel 3 -run "{{arg1}}" -- "{{arg2}}" "{{arg3}}" && cd -
# Test database connections
-test-connections-database arg1="TestSuiteDatabase" arg2="":
+test-connections-database arg1="TestSuiteDatabase" arg2="" arg3="":
#!/usr/bin/env bash
echo "TESTING database connections {{arg1}} {{arg2}}"
- cd cmd/sling && SKIP_CLICKHOUSE=TRUE RUN_ALL=TRUE go test -v -parallel 4 -timeout 35m -run "{{arg1}}" -- "{{arg2}}" && cd -
+ cd cmd/sling && SKIP_CLICKHOUSE=TRUE RUN_ALL=TRUE go test -v -parallel 4 -timeout 35m -run "{{arg1}}" -- "{{arg2}}" "{{arg3}}" && cd -
# Test core (sling core functionality)
test-core:
@@ -68,7 +68,7 @@ test-dbio-iop:
test-dbio-database:
#!/usr/bin/env bash
echo "TESTING dbio database"
- cd core/dbio/database && go test -v -run 'TestParseTableName|TestRegexMatch|TestParseColumnName|TestParseSQLMultiStatements|TestTrimSQLComments' && cd -
+ cd core/dbio/database && go test -v -run 'TestParseTableName|TestRegexMatch|TestParseColumnName|TestParseSQLMultiStatements|TestTrimSQLComments|TestAddPrimaryKeyToDDL' && cd -
cd core/dbio/database && go test -run TestChunkByColumnRange && cd -
# Test dbio filesys