diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..6dd8d40
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,230 @@
+name: CI
+
+on:
+  push:
+    branches: [main]
+    tags: ['v*']
+  pull_request:
+    branches: [main]
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    services:
+      postgres:
+        image: postgres:13.4
+        env:
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+        ports:
+          - 5432:5432
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Nix
+        uses: cachix/install-nix-action@v27
+        with:
+          github_access_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Cache Nix store
+        uses: DeterminateSystems/magic-nix-cache-action@v7
+
+      - name: Cache Rust dependencies
+        uses: Swatinem/rust-cache@v2
+        with:
+          shared-key: "rust-deps"
+          cache-on-failure: true
+
+      - name: Check formatting
+        run: nix develop --command cargo fmt -- --check
+
+      - name: Run clippy
+        run: nix develop --command cargo clippy --all-targets --all-features -- -D warnings
+
+      - name: Run tests
+        run: nix develop --command cargo test
+        env:
+          DATABASE_URL: postgresql://postgres:postgres@localhost
+
+  build-linux:
+    runs-on: ubuntu-latest
+    needs: test
+    container:
+      image: clux/muslrust:1.90.0-stable-2025-09-27
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Cache Rust dependencies
+        uses: Swatinem/rust-cache@v2
+        with:
+          shared-key: "rust-deps-musl"
+          cache-on-failure: true
+
+      - name: Build Linux binary (musl)
+        run: cargo build --release --target x86_64-unknown-linux-musl
+
+      - name: Copy binary
+        run: cp target/x86_64-unknown-linux-musl/release/anonymiser anonymiser-x86_64-unknown-linux-musl
+
+      - name: Inspect binary
+        run: |
+          file anonymiser-x86_64-unknown-linux-musl
+          ldd anonymiser-x86_64-unknown-linux-musl || echo "ldd failed or binary is static"
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: anonymiser-linux
+          path: anonymiser-x86_64-unknown-linux-musl
+
+  test-alpine:
+    runs-on: ubuntu-latest
+    needs: build-linux
+    container:
+      image: alpine:3.18
+
+    services:
+      postgres:
+        image: postgres:13.4
+        env:
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+
+    steps:
+      - name: Download Linux binary
+        uses: actions/download-artifact@v4
+        with:
+          name: anonymiser-linux
+
+      - name: Make binary executable
+        run: chmod +x anonymiser-x86_64-unknown-linux-musl
+
+      - name: Debug binary
+        run: |
+          apk add --no-cache file
+          file anonymiser-x86_64-unknown-linux-musl
+          ldd anonymiser-x86_64-unknown-linux-musl || echo "ldd failed or binary is static"
+
+      - name: Test binary on Alpine
+        run: |
+          ./anonymiser-x86_64-unknown-linux-musl generate-strategies --db-url postgresql://postgres:postgres@postgres
+        env:
+          DATABASE_URL: postgresql://postgres:postgres@postgres
+
+  test-amazon-linux:
+    runs-on: ubuntu-latest
+    needs: build-linux
+    container:
+      image: public.ecr.aws/amazonlinux/amazonlinux:latest
+
+    services:
+      postgres:
+        image: postgres:13.4
+        env:
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+
+    steps:
+      - name: Download Linux binary
+        uses: actions/download-artifact@v4
+        with:
+          name: anonymiser-linux
+
+      - name: Make binary executable
+        run: chmod +x anonymiser-x86_64-unknown-linux-musl
+
+      - name: Debug binary
+        run: |
+          yum install -y file
+          file anonymiser-x86_64-unknown-linux-musl
+          ldd anonymiser-x86_64-unknown-linux-musl || echo "ldd failed or binary is static"
+
+      - name: Test binary on Amazon Linux
+        run: |
+          ./anonymiser-x86_64-unknown-linux-musl generate-strategies --db-url postgresql://postgres:postgres@postgres
+        env:
+          DATABASE_URL: postgresql://postgres:postgres@postgres
+
+  build-macos:
+    runs-on: macos-14
+    needs: test
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Nix
+        uses: cachix/install-nix-action@v27
+        with:
+          github_access_token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Cache Nix store
+        uses: DeterminateSystems/magic-nix-cache-action@v7
+
+      - name: Cache Rust dependencies
+        uses: Swatinem/rust-cache@v2
+        with:
+          shared-key: "rust-deps"
+          cache-on-failure: true
+
+      - name: Build macOS binary (Apple Silicon)
+        run: |
+          nix build .#anonymiser
+          cp result/bin/anonymiser anonymiser-aarch64-apple-darwin
+
+      - name: Test binary
+        run: ./anonymiser-aarch64-apple-darwin --help
+
+      - name: Upload artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: anonymiser-aarch64-apple-darwin
+          path: anonymiser-aarch64-apple-darwin
+
+  release:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/')
+    needs: [test-alpine, test-amazon-linux, build-macos]
+    permissions:
+      contents: write
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Download Linux binary
+        uses: actions/download-artifact@v4
+        with:
+          name: anonymiser-linux
+
+      - name: Download macOS ARM64 binary
+        uses: actions/download-artifact@v4
+        with:
+          name: anonymiser-aarch64-apple-darwin
+
+      - name: Create GitHub Release
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          gh release create --draft --generate-notes "${{ github.ref_name }}" \
+            './anonymiser-x86_64-unknown-linux-musl#Linux' \
+            './anonymiser-aarch64-apple-darwin#macOS (Apple Silicon)'
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..ab6ddd0
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,158 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+Anonymiser is a Rust CLI tool that reads PostgreSQL SQL backups (created with `pg_dump`) and anonymises them based on a strategy file. It processes SQL dumps line-by-line, transforming data according to configured strategies while preserving database structure.
+
+## Essential Commands
+
+### Building and Testing
+```bash
+# Build the project
+cargo build
+
+# Run all tests (requires PostgreSQL running at localhost:5432)
+./build_and_test
+
+# Run tests manually
+cargo test
+
+# Format code
+cargo fmt
+
+# Check code with clippy
+cargo clippy --all-targets --all-features -- -D warnings
+```
+
+### Running the Anonymiser
+```bash
+# Generate a strategy file from a database
+anonymiser generate-strategies --db-url postgresql://postgres:postgres@localhost/DB_NAME
+
+# Anonymise a SQL dump
+anonymiser anonymise -i clear_text_dump.sql -o anonymised.sql -s strategy.json
+
+# Check strategy file against database
+anonymiser check-strategies --db-url postgresql://... --strategy-file strategy.json
+
+# Fix strategy file errors
+anonymiser fix-strategies --db-url postgresql://... --strategy-file strategy.json
+
+# Export strategies to CSV
+anonymiser to-csv --strategy-file strategy.json --output-file output.csv
+
+# Helper functions for debugging
+anonymiser anonymise-email --email "user@example.com" --salt "optional-salt"
+anonymiser anonymise-id --id "user123" --transformer "FakeUUID" --args '{"deterministic": "true"}'
+```
+
+### Running Individual Tests
+```bash
+# Run a specific test
+cargo test test_name
+
+# Run tests in a specific module
+cargo test module_name::
+
+# Run tests with output
+cargo test -- --nocapture
+```
+
+## Architecture
+
+### High-Level Flow
+1. **Strategy Loading** (`src/parsers/strategy_file.rs`): Reads strategy.json and validates configurations
+2. **Database Schema Parsing** (`src/parsers/db_schema.rs`): When checking/generating strategies, connects to PostgreSQL to fetch table schemas
+3. **Line-by-Line Processing** (`src/file_reader.rs`): Reads SQL dump line by line to minimize memory usage
+4. **Row Parsing** (`src/parsers/row_parser.rs`): Determines row type (CREATE TABLE, COPY, data, etc.)
+5. **Transformation** (`src/parsers/transformer.rs`): Applies configured transformers to data values
+6. **Output Writing** (`src/file_reader.rs`): Writes transformed SQL to output file (optionally compressed)
+
+### Key Modules
+
+**Strategy Management** (`src/parsers/`):
+- `strategy_structs.rs`: Core types (DataCategory, TransformerType, ColumnInfo, etc.)
+- `strategies.rs`: Strategies struct that maps table names to column strategies
+- `strategy_file.rs`: Reading/writing strategy.json files
+- `custom_classifications.rs`: Support for custom data categories beyond built-in ones
+
+**SQL Parsing** (`src/parsers/`):
+- `row_parser.rs`: Main entry point for parsing each line
+- `copy_row.rs`: Handles PostgreSQL COPY statements that introduce table data
+- `data_row.rs`: Parses and transforms actual data rows
+- `create_row.rs`: Parses CREATE TABLE statements to extract column types
+
+**Transformation** (`src/parsers/`):
+- `transformer.rs`: All transformer implementations (FakeEmail, Scramble, etc.)
+- `sanitiser.rs`: Escapes special characters for SQL output
+- `rng.rs`: Random number generation
+- `types.rs`: PostgreSQL type system representation
+
+**Validation** (`src/fixers/`):
+- `db_mismatch.rs`: Detects differences between strategy file and database schema
+- `validation.rs`: Validates strategy file consistency (PII not using Identity, no Error transformers, etc.)
+- `fixer.rs`: Automatically fixes certain strategy file errors
+
+**State Management**:
+- `src/parsers/state.rs`: Tracks current table being processed, column types, etc. during line-by-line parsing
+
+### Important Patterns
+
+**Deterministic Transformations**:
+Transformers marked with † in the README support deterministic generation using `get_faker_rng()`, which creates a seeded RNG from:
+- Input value
+- Optional ID column value (for entity-level consistency)
+- Optional global salt (for run-level consistency)
+
+This ensures the same input always generates the same output, critical for maintaining referential integrity.
+
+**Global Salt**:
+Strategy files can include a salt configuration as the first item:
+```json
+[
+  {"salt": "your-global-salt-here"},
+  {"table_name": "public.users", ...}
+]
+```
+
+**Column Type Tracking**:
+The system parses CREATE TABLE statements to track PostgreSQL column types, enabling type-aware transformations (e.g., array handling, date formatting).
+
+**Memory Efficiency**:
+The tool processes SQL dumps line-by-line without loading entire files into memory, making it suitable for multi-GB dumps. It uses `mimalloc` as the global allocator for performance.
+
+**Error Transformer Pattern**:
+Columns with `"transformer": {"name": "Error"}` will cause anonymisation to fail. This forces explicit decisions about how to handle each column.
+
+## Testing
+
+Tests are embedded in source files using `#[cfg(test)]` modules. Integration tests in `src/anonymiser.rs` require:
+- PostgreSQL running at `localhost:5432` with user `postgres` and password `postgres`
+- Permission to create/drop test databases
+
+Test data lives in the `test_files/` directory.
+
+## Key Dependencies
+
+- `fake`: Generates fake data (names, emails, addresses)
+- `postgres`: Database connection for schema inspection
+- `regex`: SQL pattern matching
+- `structopt`: CLI argument parsing
+- `serde_json`: Strategy file parsing
+- `sha2/sha256`: Deterministic hashing
+- `zstd/flate2`: Output compression
+- `mimalloc`: Fast memory allocator
+
+## Transformer Args
+
+When adding transformer arguments:
+- Add fields to transformer struct in `strategy_structs.rs`
+- Parse in `transformer.rs` transform function
+- Update README.md transformer documentation
+- Consider whether deterministic mode is appropriate
+
+## Custom Classifications
+
+Users can define custom data categories beyond the built-in ones (General, Pii, PotentialPii, CommercialySensitive, Security, Unknown). The `--classifications-file` flag accepts a JSON file listing valid custom categories. The system validates that all custom categories in strategy files are defined.
diff --git a/flake.nix b/flake.nix
index 23c5d9d..26ad818 100644
--- a/flake.nix
+++ b/flake.nix
@@ -16,59 +16,66 @@
     flake-utils,
     rust-overlay,
   }:
-    flake-utils.lib.eachDefaultSystem (system: let
-      overlays = [rust-overlay.overlays.default];
-      pkgs = import nixpkgs {inherit overlays system;};
+    flake-utils.lib.eachDefaultSystem (
+      system: let
+        overlays = [rust-overlay.overlays.default];
+        pkgs = import nixpkgs {inherit overlays system;};

-      rust = pkgs.rust-bin.stable.latest.default.override {extensions = ["rust-src"];};
-      rustPlatform = pkgs.makeRustPlatform {
-        cargo = rust;
-        rustc = rust;
-      };
+        rust = pkgs.rust-bin.stable.latest.default.override {extensions = ["rust-src"];};
+        rustPlatform = pkgs.makeRustPlatform {
+          cargo = rust;
+          rustc = rust;
+        };

-      manifest = (pkgs.lib.importTOML ./Cargo.toml).package;
-    in {
-      # `nix develop`.
-      devShells = {
-        default = pkgs.mkShell {
-          inputsFrom = [self.packages.${system}.anonymiser];
-          buildInputs = with pkgs; [rust-analyzer];
+        manifest = (pkgs.lib.importTOML ./Cargo.toml).package;
+      in {
+        # `nix develop`.
+        devShells = {
+          default = pkgs.mkShell {
+            inputsFrom = [self.packages.${system}.anonymiser];
+            buildInputs = with pkgs; [rust-analyzer];
+          };
         };
-      };

-      # `nix fmt`.
-      formatter = pkgs.alejandra;
+        # `nix fmt`.
+        formatter = pkgs.alejandra;
+
+        # `nix build`.
+        packages = {
+          anonymiser = rustPlatform.buildRustPackage {
+            pname = manifest.name;
+            version = manifest.version;
+            src = pkgs.nix-gitignore.gitignoreSource [] ./.;
+            cargoLock.lockFile = ./Cargo.lock;

-      # `nix build`.
-      packages = {
-        anonymiser = rustPlatform.buildRustPackage {
-          pname = manifest.name;
-          version = manifest.version;
-          src = pkgs.nix-gitignore.gitignoreSource [] ./.;
-          cargoLock.lockFile = ./Cargo.lock;
+            # Compile-time dependencies.
+            nativeBuildInputs = with pkgs; [
+              pkg-config
+              cmake
+              perl # Required for vendored OpenSSL build
+            ];
+            # Run-time dependencies.
+            buildInputs = with pkgs;
+              [
+                openssl
+              ]
+              ++ pkgs.lib.optionals pkgs.stdenv.isDarwin (
+                with pkgs.darwin.apple_sdk.frameworks; [
+                  Security
+                  SystemConfiguration
+                ]
+              );

-          # Compile-time dependencies.
-          nativeBuildInputs = with pkgs; [
-            pkg-config
-            cmake
-          ];
-          # Run-time dependencies.
-          buildInputs = with pkgs;
-            [
-              openssl
-            ]
-            ++ pkgs.lib.optionals pkgs.stdenv.isDarwin (with pkgs.darwin.apple_sdk.frameworks; [
-              Security
-              SystemConfiguration
-            ]);
+            checkFlags = [
+              # Skip tests which require access to a PostgreSQL server.
+              "--skip=anonymiser::tests::successfully_transforms"
+              "--skip=anonymiser::tests::successfully_truncates"
+              "--skip=parsers::db_schema::tests::can_read_db_columns"
+            ];
+          };

-          checkFlags = [
-            # Skip tests which require acces to a PostgreSQL server.
-            "--skip=anonymiser::tests::successfully_transforms"
-            "--skip=parsers::db_schema::tests::can_read_db_columns"
-          ];
+          default = self.packages.${system}.anonymiser;
         };
-        default = self.packages.${system}.anonymiser;
-      };
-    });
+      }
+    );
 }
diff --git a/src/file_reader.rs b/src/file_reader.rs
index 3c44c91..645fca8 100644
--- a/src/file_reader.rs
+++ b/src/file_reader.rs
@@ -17,26 +17,41 @@ pub fn read(
     compress_output: Option<Option<CompressionType>>,
 ) -> Result<(), std::io::Error> {
     let output_file = File::create(output_file_path)?;
-    let mut file_writer: Box<dyn Write> = match compress_output {
-        Some(Some(CompressionType::Zstd)) => {
-            Box::new(zstd::Encoder::new(output_file, 1)?.auto_finish())
+
+    match compress_output {
+        Some(Some(CompressionType::Zstd)) | Some(None) => {
+            let mut encoder = zstd::Encoder::new(output_file, 1)?;
+            write_data(&input_file_path, &mut encoder, strategies)?;
+            let file = encoder.finish()?;
+            file.sync_all()?;
         }
         Some(Some(CompressionType::Gzip)) => {
-            Box::new(GzEncoder::new(output_file, Compression::best()))
+            let mut encoder = GzEncoder::new(output_file, Compression::best());
+            write_data(&input_file_path, &mut encoder, strategies)?;
+            let file = encoder.finish()?;
+            file.sync_all()?;
+        }
+        None => {
+            let mut writer = BufWriter::new(output_file);
+            write_data(&input_file_path, &mut writer, strategies)?;
+            writer.flush()?;
         }
-        Some(None) => Box::new(zstd::Encoder::new(output_file, 1)?.auto_finish()),
+    }

-        None => Box::new(BufWriter::new(output_file)),
-    };
+    Ok(())
+}

-    let file_reader = File::open(&input_file_path)
+fn write_data(
+    input_file_path: &str,
+    writer: &mut dyn Write,
+    strategies: &Strategies,
+) -> Result<(), std::io::Error> {
+    let file_reader = File::open(input_file_path)
         .unwrap_or_else(|_| panic!("Input file '{}' does not exist", input_file_path));
     let mut reader = BufReader::new(file_reader);

     let mut line = String::new();
-
     let mut row_parser_state = State::new();
-
     let mut rng = rng::get();

     loop {
@@ -46,9 +61,10 @@ pub fn read(
         }

         let transformed_row = row_parser::parse(&mut rng, &line, &mut row_parser_state, strategies);
-        file_writer.write_all(transformed_row.as_bytes())?;
+        writer.write_all(transformed_row.as_bytes())?;
         line.clear();
     }

+    Ok(())
 }
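
The `write_data` helper introduced in the diff above takes its output as `&mut dyn Write`, so the row-writing loop is indifferent to whether it feeds a zstd encoder, a gzip encoder, or a plain `BufWriter`. A minimal sketch of that property, with a hypothetical `write_rows` function standing in for `write_data` (the real function also needs `&Strategies` and the row parser, which are not reproduced here):

```rust
use std::io::Write;

// Hypothetical stand-in for `write_data`: all it requires of its output
// is the `Write` trait, so any writer can be substituted.
fn write_rows(writer: &mut dyn Write, rows: &[&str]) -> std::io::Result<()> {
    for row in rows {
        writer.write_all(row.as_bytes())?;
    }
    writer.flush()
}

fn main() -> std::io::Result<()> {
    // A Vec<u8> implements `Write`, so output can be captured in memory
    // (e.g. in a unit test) instead of being written to a file.
    let mut captured: Vec<u8> = Vec::new();
    write_rows(
        &mut captured,
        &["COPY public.users (id) FROM stdin;\n", "1\n", "\\.\n"],
    )?;
    assert_eq!(
        String::from_utf8(captured).unwrap(),
        "COPY public.users (id) FROM stdin;\n1\n\\.\n"
    );
    Ok(())
}
```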
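
CLAUDE.md's note on deterministic transformations describes seeding an RNG from the input value, an optional ID column value, and an optional global salt. A minimal sketch of that idea, assuming `sha2` and `rand_chacha` as the hashing and RNG crates (the repository's actual `get_faker_rng()` implementation is not shown in this diff and may differ):

```rust
use rand::{RngCore, SeedableRng};
use rand_chacha::ChaCha8Rng;
use sha2::{Digest, Sha256};

// Hash the value being anonymised together with an optional id column and an
// optional global salt, then use the digest as an RNG seed: identical inputs
// always yield the same stream of "random" fake data.
fn seeded_rng(value: &str, id: Option<&str>, salt: Option<&str>) -> ChaCha8Rng {
    let mut hasher = Sha256::new();
    hasher.update(value.as_bytes());
    if let Some(id) = id {
        hasher.update(id.as_bytes());
    }
    if let Some(salt) = salt {
        hasher.update(salt.as_bytes());
    }
    let seed: [u8; 32] = hasher.finalize().into();
    ChaCha8Rng::from_seed(seed)
}

fn main() {
    let a = seeded_rng("user@example.com", Some("42"), Some("run-salt")).next_u32();
    let b = seeded_rng("user@example.com", Some("42"), Some("run-salt")).next_u32();
    assert_eq!(a, b); // deterministic across runs and processes
}
```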