From f4152f35e9ddf95977eb249255d25207f953588f Mon Sep 17 00:00:00 2001 From: Tom Saleeba Date: Wed, 14 Nov 2018 18:17:15 +1030 Subject: [PATCH 1/5] fix: re: chris72205's comment. rename --add_constraints to use a hyphen not an underscore, like the README documents. --- lib/pgsync.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/pgsync.rb b/lib/pgsync.rb index 905ce2ea..dc6a22b7 100644 --- a/lib/pgsync.rb +++ b/lib/pgsync.rb @@ -129,7 +129,7 @@ def perform log_completed(start_time) end - if opts[:add_constraints] + if opts[:add-constraints] log "* Adding constraints/triggers" dump_command = "pg_dump -Fc --verbose --section=post-data --no-owner --no-acl #{to_url(source_uri)}" restore_command = "pg_restore --verbose --no-owner --no-acl --clean #{if_exists ? "--if-exists" : nil} -d #{to_url(destination_uri)}" @@ -317,7 +317,7 @@ def parse_args(args) o.boolean "--truncate", "truncate existing rows", default: false o.boolean "--schema-only", "schema only", default: false o.boolean "--no-constraints", "exclude constraints/triggers when syncing schema", default: false - o.boolean "--add_constraints", "add constraints and triggers after syncing data", default: false + o.boolean "--add-constraints", "add constraints and triggers after syncing data", default: false o.boolean "--no-rules", "do not apply data rules", default: false o.boolean "--setup", "setup", default: false o.boolean "--in-batches", "in batches", default: false, help: false From 26fb1855b5b4c8560e96e0ce6a748d4c10664d12 Mon Sep 17 00:00:00 2001 From: Tom Saleeba Date: Wed, 14 Nov 2018 18:26:31 +1030 Subject: [PATCH 2/5] fix: correct the hash key syntax, I'm a ruby noob --- lib/pgsync.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pgsync.rb b/lib/pgsync.rb index dc6a22b7..ee168aa5 100644 --- a/lib/pgsync.rb +++ b/lib/pgsync.rb @@ -129,7 +129,7 @@ def perform log_completed(start_time) end - if opts[:add-constraints] + if opts["add-constraints"] log "* Adding 
constraints/triggers" dump_command = "pg_dump -Fc --verbose --section=post-data --no-owner --no-acl #{to_url(source_uri)}" restore_command = "pg_restore --verbose --no-owner --no-acl --clean #{if_exists ? "--if-exists" : nil} -d #{to_url(destination_uri)}" From ef6ee54d491f28b13b9017a8a918fd9470252467 Mon Sep 17 00:00:00 2001 From: Tom Saleeba Date: Fri, 8 Mar 2019 12:41:11 +1030 Subject: [PATCH 3/5] feat: add docker image infrastructure --- Dockerfile | 19 ++++++++ README.md | 3 ++ docker/README.md | 74 +++++++++++++++++++++++++++++++ docker/entrypoint.sh | 10 +++++ docker/example/.env | 6 +++ docker/example/add-data.sql | 9 ++++ docker/example/docker-compose.yml | 48 ++++++++++++++++++++ docker/run.sh | 26 +++++++++++ 8 files changed, 195 insertions(+) create mode 100644 Dockerfile create mode 100644 docker/README.md create mode 100644 docker/entrypoint.sh create mode 100644 docker/example/.env create mode 100644 docker/example/add-data.sql create mode 100644 docker/example/docker-compose.yml create mode 100644 docker/run.sh diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..ce33fb6a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +FROM ruby:2.5-alpine3.8 +LABEL author="Tom Saleeba" +LABEL description="pgsync running under cron for periodic DB synchronisation" + +ADD . /app/src/ +WORKDIR /app/ +RUN \ + apk add --no-cache postgresql-client postgresql-dev && \ + apk add --no-cache --virtual .build-deps git build-base && \ + cd src/ && \ + gem build pgsync.gemspec && \ + gem install pgsync-*.gem && \ + apk del .build-deps && \ + mv docker/*.sh ../ && \ + cd .. && \ + rm -r src/ + +ENTRYPOINT [ "/bin/sh", "/app/entrypoint.sh" ] + diff --git a/README.md b/README.md index 659d81fc..0b03ab9f 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,9 @@ Sync Postgres data to your local machine. Designed for: :tangerine: Battle-tested at [Instacart](https://www.instacart.com/opensource) +## Docker usage +See the [docker/README.md](./docker/README.md) file. 
+ ## Installation pgsync is a command line tool. To install, run: diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000..598a00a2 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,74 @@ +> Periodically synchronises the data from one PG DB to another + +We're using a fork of a fork of pgsync. The version we're using is +https://github.com/ternandsparrow/pgsync/tree/ternandsparrow-patch-1 but the original is +https://github.com/ankane/pgsync. The `tomsaleeba-patch-1` fork doesn't add much but the reason we need the `arshsingh` +fork is that it adds support for opting out of syncing constraints on the DB. We have foreign keys present in our DB +but `pg_restore` (which `pgsync` uses under the covers) blindly tries to restore tables in alphabetical order. In our +case, this causes violations. The `pgsync` tool has options for configuring groups of tables so we could go to the +effort of mapping all our tables to groups and then calling sync on each group in order but that's too much effort (and +it's brittle). We've chosen this fork so we can sync **without constraints**. This shouldn't matter because we're just a +read-only mirror of production SWARM. + +## Usage + +This container is intended to be used in a docker-compose stack. 
If you have the target DB in your stack and the source +DB is elsewhere, you can do something like: +```yaml +version: '3' +services: + db: + image: postgres:10 + environment: + POSTGRES_DB: app_db + POSTGRES_USER: writeuser + POSTGRES_PASSWORD: pokemon + restart: unless-stopped + db-sync: + image: ternandsparrow/pgsync:dev # TODO select a tag + links: + - db:db + environment: + FROM_USER: readonlyuser + FROM_PASS: bananas + FROM_HOST: db.example.com + FROM_PORT: 5432 + FROM_DB: allthedata + TO_USER: writeuser + TO_PASS: pokemon + TO_HOST: db + TO_PORT: 5432 + TO_DB: app_db + CRON_SCHEDULE: '1 1 * * *' + restart: unless-stopped + depends_on: + - db +``` + +The periodic command that is run by `cron` will **only sync data**. This command will fail if the schema doesn't already exist. To fix this, after you've deployed the stack, you should do this manual, one-off step to create the schema: +```bash +docker exec -i example_db-sync_1 sh -c 'SCHEMA_ONLY=1 sh /run.sh' +``` + +If you're impatient and don't want to wait for the first data sync to get some data, you can trigger that using a manual step too: +```bash +docker exec -i example_db-sync_1 sh -c 'sh /run.sh' +``` + +## Run example docker-compose stack + +This example creates two PG DBs. The first is loaded with some data that we want to sync. For the purposes of this +example, we override the entrypoint so we can do a schema sync, then data sync, then print the results. You shouldn't do +this when you deploy. + +```bash +cd example/ +docker-compose up --build +# when you see the output of the select statement: +# db-sync_1 | 1 | one +# db-sync_1 | 2 | two +# db-sync_1 | 3 | three +# db-sync_1 | 4 | four +# ... 
then ctrl+c +docker-compose down --volumes +``` diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 00000000..6e6c6124 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# schedule task in cron and start daemon + +# assert env vars exist with bash parameter expansion (http://wiki.bash-hackers.org/syntax/pe#display_error_if_null_or_unset) +: ${CRON_SCHEDULE:?} + +redirectToDockerLogs='> /proc/1/fd/1 2> /proc/1/fd/2' +echo "$CRON_SCHEDULE sh /run.sh $redirectToDockerLogs" > /var/spool/cron/crontabs/root +crond -l 2 -f + diff --git a/docker/example/.env b/docker/example/.env new file mode 100644 index 00000000..83db4ee2 --- /dev/null +++ b/docker/example/.env @@ -0,0 +1,6 @@ +SRC_DB=db1 +SRC_USER=user +SRC_PASS=pass +DEST_DB=db2 +DEST_USER=user +DEST_PASS=pass diff --git a/docker/example/add-data.sql b/docker/example/add-data.sql new file mode 100644 index 00000000..a678ef73 --- /dev/null +++ b/docker/example/add-data.sql @@ -0,0 +1,9 @@ +create table blah ( + foo int, + bar char(10) +); + +insert into blah values (1, 'one'); +insert into blah values (2, 'two'); +insert into blah values (3, 'three'); +insert into blah values (4, 'four'); diff --git a/docker/example/docker-compose.yml b/docker/example/docker-compose.yml new file mode 100644 index 00000000..28e7a956 --- /dev/null +++ b/docker/example/docker-compose.yml @@ -0,0 +1,48 @@ +version: '3' +services: + db1: + image: postgres:9.5 + ports: + - "5433:5432" + environment: + POSTGRES_DB: ${SRC_DB} + POSTGRES_USER: ${SRC_USER} + POSTGRES_PASSWORD: ${SRC_PASS} + volumes: + - "./add-data.sql:/docker-entrypoint-initdb.d/add-data.sql" + db2: + image: postgres:10 + ports: + - "5434:5432" + environment: + POSTGRES_DB: ${DEST_DB} + POSTGRES_USER: ${DEST_USER} + POSTGRES_PASSWORD: ${DEST_PASS} + db-sync: + build: ../.. 
+ links: + - db1:db1 + - db2:db2 + environment: + FROM_USER: ${SRC_USER} + FROM_PASS: ${SRC_PASS} + FROM_HOST: db1 + FROM_PORT: 5432 + FROM_DB: ${SRC_DB} + TO_USER: ${DEST_USER} + TO_PASS: ${DEST_PASS} + TO_HOST: db2 + TO_PORT: 5432 + TO_DB: ${DEST_DB} + # note: don't override the entrypoint when using this container, this is just for a demo to run SQL after the sync + entrypoint: | + /bin/sh -c " + echo 'waiting for DBs' && \ + sleep 10 && \ + SCHEMA_ONLY=1 /bin/sh /app/run.sh && \ + /bin/sh /app/run.sh && \ + sleep 1 && \ + PGPASSWORD=pass psql -h db2 -U user -d db2 -c 'select * from blah;'" + depends_on: + - db1 + - db2 diff --git a/docker/run.sh b/docker/run.sh new file mode 100644 index 00000000..02d15298 --- /dev/null +++ b/docker/run.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# runs pgsync to sync from the "FROM" DB to the "TO" DB +set -e +: ${FROM_USER:?} +: ${FROM_PASS:?} +: ${FROM_HOST:?} +: ${FROM_PORT:?} +: ${FROM_DB:?} +: ${TO_USER:?} +: ${TO_PASS:?} +: ${TO_HOST:?} +: ${TO_PORT:?} +: ${TO_DB:?} +EXTRA_OPTS="" + +if [ ! 
-z "$SCHEMA_ONLY" ]; then + echo '[INFO] restoring schema only' + EXTRA_OPTS="--schema-only --no-constraints" +fi + +pgsync \ + $EXTRA_OPTS \ + --from "postgres://$FROM_USER:$FROM_PASS@$FROM_HOST:$FROM_PORT/$FROM_DB" \ + --to "postgres://$TO_USER:$TO_PASS@$TO_HOST:$TO_PORT/$TO_DB" \ + --to-safe + From f85741273172f4ac1ff3098b647efa49e0aaac4f Mon Sep 17 00:00:00 2001 From: Tom Saleeba Date: Tue, 19 Mar 2019 16:37:56 +1030 Subject: [PATCH 4/5] fix: correct command to run job in crontab, update other usages of run.sh script --- Dockerfile | 1 + docker/README.md | 4 ++-- docker/entrypoint.sh | 7 ++++--- docker/example/docker-compose.yml | 10 +++++++++- docker/run.sh | 1 + 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index ce33fb6a..35bb35e5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,7 @@ RUN \ gem build pgsync.gemspec && \ gem install pgsync-*.gem && \ apk del .build-deps && \ + chmod +x docker/*.sh && \ mv docker/*.sh ../ && \ cd .. && \ rm -r src/ diff --git a/docker/README.md b/docker/README.md index 598a00a2..58cbd146 100644 --- a/docker/README.md +++ b/docker/README.md @@ -47,12 +47,12 @@ services: The periodic command that is run by `cron` will **only sync data**. This command will fail if the schema doesn't already exist. 
To fix this, after you've deployed the stack, you should do this manual, one-off step to create the schema: ```bash -docker exec -i example_db-sync_1 sh -c 'SCHEMA_ONLY=1 sh /run.sh' +docker exec -i example_db-sync_1 sh -c 'SCHEMA_ONLY=1 sh /app/run.sh' ``` If you're impatient and don't want to wait for the first data sync to get some data, you can trigger that using a manual step too: ```bash -docker exec -i example_db-sync_1 sh -c 'sh /run.sh' +docker exec -i example_db-sync_1 sh -c 'sh /app/run.sh' ``` ## Run example docker-compose stack diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 6e6c6124..f2cc0e5c 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -1,10 +1,11 @@ #!/bin/sh # schedule task in cron and start daemon +set -euxo pipefail +cd `dirname "$0"` # assert env vars exist with bash parameter expansion (http://wiki.bash-hackers.org/syntax/pe#display_error_if_null_or_unset) : ${CRON_SCHEDULE:?} -redirectToDockerLogs='> /proc/1/fd/1 2> /proc/1/fd/2' -echo "$CRON_SCHEDULE sh /run.sh $redirectToDockerLogs" > /var/spool/cron/crontabs/root -crond -l 2 -f +echo "$CRON_SCHEDULE sh $(pwd)/run.sh" > /var/spool/cron/crontabs/root +exec crond -f diff --git a/docker/example/docker-compose.yml b/docker/example/docker-compose.yml index 28e7a956..eba0c7d6 100644 --- a/docker/example/docker-compose.yml +++ b/docker/example/docker-compose.yml @@ -10,6 +10,7 @@ services: POSTGRES_PASSWORD: ${SRC_PASS} volumes: - "./add-data.sql:/docker-entrypoint-initdb.d/add-data.sql" + - "db1-pgdata:/var/lib/postgresql/data" db2: image: postgres:10 ports: @@ -18,6 +19,8 @@ services: POSTGRES_DB: ${DEST_DB} POSTGRES_USER: ${DEST_USER} POSTGRES_PASSWORD: ${DEST_PASS} + volumes: + - "db2-pgdata:/var/lib/postgresql/data" db-sync: build: ../.. 
links: @@ -42,7 +45,12 @@ services: SCHEMA_ONLY=1 /bin/sh /app/run.sh && \ /bin/sh /app/run.sh && \ sleep 1 && \ - PGPASSWORD=pass psql -h db2 -U user -d db2 -c 'select * from blah;'" + PGPASSWORD=pass psql -h db2 -U user -d db2 -c 'select * from blah;' && \ + echo 'Now running under cron every minute, use ctrl+c to exit' && \ + CRON_SCHEDULE='* * * * *' exec /app/entrypoint.sh" depends_on: - db1 - db2 +volumes: + db1-pgdata: + db2-pgdata: diff --git a/docker/run.sh b/docker/run.sh index 02d15298..8d462f60 100644 --- a/docker/run.sh +++ b/docker/run.sh @@ -18,6 +18,7 @@ if [ ! -z "$SCHEMA_ONLY" ]; then EXTRA_OPTS="--schema-only --no-constraints" fi +echo "Run started at $(date)" pgsync \ $EXTRA_OPTS \ --from "postgres://$FROM_USER:$FROM_PASS@$FROM_HOST:$FROM_PORT/$FROM_DB" \ From 623831b15dc12901b73e4b694fe3eaafdf36ef8a Mon Sep 17 00:00:00 2001 From: Tom Saleeba Date: Tue, 9 Feb 2021 19:44:06 +1030 Subject: [PATCH 5/5] feat: add ability to report to Sentry.io on error --- Dockerfile | 8 ++++++-- docker/example/docker-compose.yml | 3 +++ docker/run.sh | 13 +++++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 35bb35e5..f6ca4c04 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,10 @@ FROM ruby:2.5-alpine3.8 -LABEL author="Tom Saleeba" LABEL description="pgsync running under cron for periodic DB synchronisation" ADD . /app/src/ WORKDIR /app/ RUN \ + set -x && \ apk add --no-cache postgresql-client postgresql-dev && \ apk add --no-cache --virtual .build-deps git build-base && \ cd src/ && \ @@ -14,7 +14,11 @@ RUN \ chmod +x docker/*.sh && \ mv docker/*.sh ../ && \ cd .. 
&& \ - rm -r src/ + rm -r src/ && \ + sentryVersion=1.62.0 && \ + wget -O /usr/local/bin/sentry-cli \ + "https://downloads.sentry-cdn.com/sentry-cli/$sentryVersion/sentry-cli-Linux-x86_64" && \ + chmod a+x /usr/local/bin/sentry-cli ENTRYPOINT [ "/bin/sh", "/app/entrypoint.sh" ] diff --git a/docker/example/docker-compose.yml b/docker/example/docker-compose.yml index eba0c7d6..2e8e225f 100644 --- a/docker/example/docker-compose.yml +++ b/docker/example/docker-compose.yml @@ -37,6 +37,9 @@ services: TO_HOST: db2 TO_PORT: 5432 TO_DB: ${DEST_DB} + # uncomment and add your DSN to enable Sentry.io reporting. A simple way + # to trigger an error is to change TO_HOST: db99 (invalid hostname). + # SENTRY_DSN: 'https://11111111111111111111111111111111@o222222.ingest.sentry.io/3333333' # note: don't override the entrypoint when using this container, this is just for a demo to run SQL after the sync entrypoint: | /bin/sh -c " diff --git a/docker/run.sh b/docker/run.sh index 8d462f60..43f6110f 100644 --- a/docker/run.sh +++ b/docker/run.sh @@ -23,5 +23,14 @@ pgsync \ $EXTRA_OPTS \ --from "postgres://$FROM_USER:$FROM_PASS@$FROM_HOST:$FROM_PORT/$FROM_DB" \ --to "postgres://$TO_USER:$TO_PASS@$TO_HOST:$TO_PORT/$TO_DB" \ - --to-safe - + --to-safe || { + # remember pgsync's exit status: the script must still fail after reporting, so callers chaining with && stop + rc=$? + if [ -z "${SENTRY_DSN:-}" ]; then + echo "[WARN] No SENTRY_DSN, cannot send error report" + else + echo "Reporting error to Sentry.io" + sentry-cli send-event -m 'Failed to sync DB' + fi + exit "$rc" +}