From 6c5b7ec14f19d1c7a52f81e62e9d2bce3a65319d Mon Sep 17 00:00:00 2001
From: Rob Rudin
Date: Fri, 2 Jan 2026 14:53:33 -0500
Subject: [PATCH] MLE-26427 Initial exclusion support for JSON

This isn't quite done - I want to do a PR for excluding XML next, and
then refactor the code, likely moving the tests into a new test class.
But this pushes things forward a bit with exclusions.
---
 .../filter/ContentExclusionUtil.java          | 89 +++++++++++++++++++
 .../filter/IncrementalWriteEvalFilter.java    |  6 +-
 .../filter/IncrementalWriteFilter.java        | 22 +++++-
 .../filter/IncrementalWriteOpticFilter.java   |  6 +-
 .../filter/IncrementalWriteTest.java          | 59 ++++++++++++++
 5 files changed, 173 insertions(+), 9 deletions(-)
 create mode 100644 marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java

diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java
new file mode 100644
index 000000000..0510cc0b0
--- /dev/null
+++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.client.datamovement.filter;
+
+import com.fasterxml.jackson.core.JsonPointer;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility class for applying content exclusions to documents before hash calculation.
+ * Currently supports removing properties from JSON documents using RFC 6901 JSON Pointer expressions;
+ * XML exclusions using XPath expressions are planned but not implemented yet.
+ *
+ * @since 8.1.0
+ */
+public final class ContentExclusionUtil {
+
+	private static final Logger logger = LoggerFactory.getLogger(ContentExclusionUtil.class);
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	private ContentExclusionUtil() {
+		// Static utility class; not meant to be instantiated.
+	}
+
+	/**
+	 * Applies JSON Pointer exclusions to JSON content by removing the specified paths.
+	 *
+	 * @param uri          the document URI (used for logging purposes)
+	 * @param jsonContent  the JSON content as a string
+	 * @param jsonPointers array of RFC 6901 JSON Pointer expressions identifying properties to exclude
+	 * @return the JSON content re-serialized with the specified paths removed; the original content is returned
+	 * unchanged when no pointers are given or the content is empty
+	 * @throws JsonProcessingException  if the JSON content cannot be parsed or serialized
+	 * @throws IllegalArgumentException if a pointer is not a valid JSON Pointer expression
+	 */
+	public static String applyJsonExclusions(String uri, String jsonContent, String[] jsonPointers) throws JsonProcessingException {
+		if (jsonPointers == null || jsonPointers.length == 0) {
+			return jsonContent;
+		}
+
+		JsonNode rootNode = OBJECT_MAPPER.readTree(jsonContent);
+		if (rootNode == null || rootNode.isMissingNode()) {
+			// Empty content yields no tree to prune, so there is nothing to exclude.
+			return jsonContent;
+		}
+		for (String jsonPointer : jsonPointers) {
+			removeNodeAtPointer(uri, rootNode, jsonPointer);
+		}
+		return OBJECT_MAPPER.writeValueAsString(rootNode);
+	}
+
+	/**
+	 * Removes a node at the specified JSON Pointer path from the given root node. Pointers that address the
+	 * document root, match nothing, or address an array element are logged and skipped.
+	 *
+	 * @param uri         the document URI (used for logging purposes)
+	 * @param rootNode    the root JSON node; modified in place
+	 * @param jsonPointer the JSON Pointer expression identifying the node to remove
+	 */
+	private static void removeNodeAtPointer(String uri, JsonNode rootNode, String jsonPointer) {
+		JsonPointer pointer = JsonPointer.compile(jsonPointer);
+		if (pointer.matches()) {
+			// An empty (or null) expression addresses the root, which has no parent; head() would be null below.
+			logger.warn("JSONPointer '{}' refers to the document root and cannot be excluded for document {}, skipping", jsonPointer, uri);
+			return;
+		}
+
+		JsonNode targetNode = rootNode.at(pointer);
+		if (targetNode.isMissingNode()) {
+			logger.debug("JSONPointer '{}' does not exist in document {}, skipping", jsonPointer, uri);
+			return;
+		}
+
+		// Use Jackson's JsonPointer API to get parent and field name
+		JsonNode parentNode = rootNode.at(pointer.head());
+		if (parentNode.isObject()) {
+			String fieldName = pointer.last().getMatchingProperty();
+			((ObjectNode) parentNode).remove(fieldName);
+		} else if (parentNode.isArray()) {
+			logger.warn("Array element exclusion not supported for JSONPointer '{}'. " +
+				"Consider excluding the entire array property instead.", jsonPointer);
+		}
+	}
+}
diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
index aa725eee8..54343d80e 100644
--- a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
+++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ * Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/ package com.marklogic.client.datamovement.filter; @@ -31,8 +31,8 @@ class IncrementalWriteEvalFilter extends IncrementalWriteFilter { """; IncrementalWriteEvalFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson, - Consumer skippedDocumentsConsumer) { - super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer); + Consumer skippedDocumentsConsumer, String[] jsonExclusions) { + super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions); } @Override diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java index b87f29ccb..86cc14e62 100644 --- a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java +++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java @@ -45,6 +45,7 @@ public static class Builder { private boolean canonicalizeJson = true; private boolean useEvalQuery = false; private Consumer skippedDocumentsConsumer; + private String[] jsonExclusions; /** * @param keyName the name of the MarkLogic metadata key that will hold the hash value; defaults to "incrementalWriteHash". @@ -93,11 +94,20 @@ public Builder onDocumentsSkipped(Consumer skippedDocu return this; } + /** + * @param jsonPointers JSON Pointer expressions (RFC 6901) identifying JSON properties to exclude from hash calculation. + * For example, "/metadata/timestamp" or "/user/lastModified". + */ + public Builder jsonExclusions(String... 
jsonPointers) { + this.jsonExclusions = jsonPointers; + return this; + } + public IncrementalWriteFilter build() { if (useEvalQuery) { - return new IncrementalWriteEvalFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer); + return new IncrementalWriteEvalFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions); } - return new IncrementalWriteOpticFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer); + return new IncrementalWriteOpticFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions); } } @@ -105,16 +115,18 @@ public IncrementalWriteFilter build() { private final String timestampKeyName; private final boolean canonicalizeJson; private final Consumer skippedDocumentsConsumer; + private final String[] jsonExclusions; // Hardcoding this for now, with a good general purpose hashing function. // See https://xxhash.com for benchmarks. private final LongHashFunction hashFunction = LongHashFunction.xx3(); - public IncrementalWriteFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson, Consumer skippedDocumentsConsumer) { + public IncrementalWriteFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson, Consumer skippedDocumentsConsumer, String[] jsonExclusions) { this.hashKeyName = hashKeyName; this.timestampKeyName = timestampKeyName; this.canonicalizeJson = canonicalizeJson; this.skippedDocumentsConsumer = skippedDocumentsConsumer; + this.jsonExclusions = jsonExclusions; } protected final DocumentWriteSet filterDocuments(Context context, Function hashRetriever) { @@ -165,6 +177,10 @@ private String serializeContent(DocumentWriteOperation doc) { if (canonicalizeJson && (Format.JSON.equals(format) || isPossiblyJsonContent(content))) { JsonCanonicalizer jc; try { + if (jsonExclusions != null && jsonExclusions.length > 0) { + // TBD on error handling here, want to get XML supported first. 
+ content = ContentExclusionUtil.applyJsonExclusions(doc.getUri(), content, jsonExclusions); + } jc = new JsonCanonicalizer(content); return jc.getEncodedString(); } catch (IOException e) { diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java index 5d6d57642..3cb8f44e0 100644 --- a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java +++ b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved. + * Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved. */ package com.marklogic.client.datamovement.filter; @@ -20,8 +20,8 @@ class IncrementalWriteOpticFilter extends IncrementalWriteFilter { IncrementalWriteOpticFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson, - Consumer skippedDocumentsConsumer) { - super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer); + Consumer skippedDocumentsConsumer, String[] jsonExclusions) { + super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions); } @Override diff --git a/marklogic-client-api/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteTest.java b/marklogic-client-api/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteTest.java index 95bda1326..9929fdc98 100644 --- a/marklogic-client-api/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteTest.java +++ b/marklogic-client-api/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteTest.java @@ -218,6 +218,65 @@ void nullIsIgnoredForKeyNames() 
{ assertNotNull(metadata.getMetadataValues().get("incrementalWriteTimestamp")); } + @Test + void jsonExclusions() { + filter = IncrementalWriteFilter.newBuilder() + .jsonExclusions("/timestamp", "/metadata/lastModified") + .onDocumentsSkipped(docs -> skippedCount.addAndGet(docs.length)) + .build(); + + // Write initial documents with three keys + docs = new ArrayList<>(); + for (int i = 1; i <= 5; i++) { + ObjectNode doc = objectMapper.createObjectNode(); + doc.put("id", i); + doc.put("name", "Document " + i); + doc.put("timestamp", "2025-01-01T10:00:00Z"); + doc.putObject("metadata") + .put("lastModified", "2025-01-01T10:00:00Z") + .put("author", "Test User"); + docs.add(new DocumentWriteOperationImpl("/incremental/test/json-doc-" + i + ".json", METADATA, new JacksonHandle(doc))); + } + + writeDocs(docs); + assertEquals(5, writtenCount.get()); + assertEquals(0, skippedCount.get()); + + // Write again with different values for excluded fields - should be skipped + docs = new ArrayList<>(); + for (int i = 1; i <= 5; i++) { + ObjectNode doc = objectMapper.createObjectNode(); + doc.put("id", i); + doc.put("name", "Document " + i); + doc.put("timestamp", "2026-01-02T15:30:00Z"); // Changed + doc.putObject("metadata") + .put("lastModified", "2026-01-02T15:30:00Z") // Changed + .put("author", "Test User"); + docs.add(new DocumentWriteOperationImpl("/incremental/test/json-doc-" + i + ".json", METADATA, new JacksonHandle(doc))); + } + + writeDocs(docs); + assertEquals(5, writtenCount.get(), "Documents should be skipped since only excluded fields changed"); + assertEquals(5, skippedCount.get()); + + // Write again with actual content change - should NOT be skipped + docs = new ArrayList<>(); + for (int i = 1; i <= 5; i++) { + ObjectNode doc = objectMapper.createObjectNode(); + doc.put("id", i); + doc.put("name", "Modified Document " + i); // Changed + doc.put("timestamp", "2026-01-02T16:00:00Z"); + doc.putObject("metadata") + .put("lastModified", "2026-01-02T16:00:00Z") + 
.put("author", "Test User"); + docs.add(new DocumentWriteOperationImpl("/incremental/test/json-doc-" + i + ".json", METADATA, new JacksonHandle(doc))); + } + + writeDocs(docs); + assertEquals(10, writtenCount.get(), "Documents should be written since non-excluded content changed"); + assertEquals(5, skippedCount.get(), "Skip count should remain at 5"); + } + private void verifyIncrementalWriteWorks() { writeTenDocuments(); verifyDocumentsHasHashInMetadataKey();