From c9f728bc4bfa941e3d0d59d86ae1523daf6d9608 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:03 -0500 Subject: [PATCH 01/39] add checksum URI values and methods --- .../edu/harvard/iq/dataverse/DataFile.java | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFile.java b/src/main/java/edu/harvard/iq/dataverse/DataFile.java index 45604a5472b..8a08cd15029 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFile.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFile.java @@ -109,18 +109,22 @@ public class DataFile extends DvObject implements Comparable { * The list of types should be limited to the list above in the technote * because the string gets passed into MessageDigest.getInstance() and you * can't just pass in any old string. + * + * The URIs are used in the OAI_ORE export. They are taken from the associated XML Digital Signature standards. */ public enum ChecksumType { - MD5("MD5"), - SHA1("SHA-1"), - SHA256("SHA-256"), - SHA512("SHA-512"); + MD5("MD5", "http://www.w3.org/2001/04/xmldsig-more#md5"), + SHA1("SHA-1", "http://www.w3.org/2000/09/xmldsig#sha1"), + SHA256("SHA-256", "http://www.w3.org/2001/04/xmlenc#sha256"), + SHA512("SHA-512", "http://www.w3.org/2001/04/xmlenc#sha512"); private final String text; + private final String uri; - private ChecksumType(final String text) { + private ChecksumType(final String text, final String uri) { this.text = text; + this.uri = uri; } public static ChecksumType fromString(String text) { @@ -131,13 +135,30 @@ public static ChecksumType fromString(String text) { } } } - throw new IllegalArgumentException("ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); + throw new IllegalArgumentException( + "ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); + } + + public static ChecksumType fromUri(String uri) { + if (uri != null) { + for (ChecksumType checksumType : ChecksumType.values()) { + if (uri.equals(checksumType.uri)) { + return checksumType; + } + } + } + throw new IllegalArgumentException( + "ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); } @Override public String toString() { return text; } + + public String toUri() { + return uri; + } } //@Expose From a25e47b12cdd4fcb0050a69f0119e9abf4c59183 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:24 -0500 Subject: [PATCH 02/39] update version and use checksum URIs --- src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index 4cbc2aa7b9a..aa011e2c70a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -49,7 +49,7 @@ public class OREMap { public static final String NAME = "OREMap"; //NOTE: Update this value whenever the output of this class is changed - private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.1"; + private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.2"; //v1.0.1 - added versionNote private static final String DATAVERSE_SOFTWARE_NAME = "Dataverse"; private static final String DATAVERSE_SOFTWARE_URL = "https://github.com/iqss/dataverse"; @@ -280,7 +280,7 @@ public 
JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) { JsonObject checksum = null; // Add checksum. RDA recommends SHA-512 if (df.getChecksumType() != null && df.getChecksumValue() != null) { - checksum = Json.createObjectBuilder().add("@type", df.getChecksumType().toString()) + checksum = Json.createObjectBuilder().add("@type", df.getChecksumType().toUri()) .add("@value", df.getChecksumValue()).build(); aggRes.add(JsonLDTerm.checksum.getLabel(), checksum); } From 6c0cb49513f7748cf6cf026d0b9892005820fbb5 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sat, 6 Dec 2025 18:26:48 -0500 Subject: [PATCH 03/39] handle multiline descriptions and org names --- .../iq/dataverse/util/bagit/BagGenerator.java | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f24ebdb8655..69e9c686133 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -548,7 +548,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromString( + ChecksumType childHashType = ChecksumType.fromUri( child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { //If one wasn't set as a default, pick up what the first child with one uses @@ -828,7 +828,7 @@ private String generateInfoFile() { // ToDo - make configurable info.append(CRLF); - info.append("Organization-Address: " + WordUtils.wrap(orgAddress, 78, CRLF + " ", true)); + info.append("Organization-Address: " + multilineWrap(orgAddress)); info.append(CRLF); @@ -846,10 +846,8 @@ private String generateInfoFile() { if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append( - // FixMe - handle description having subfields better - WordUtils.wrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), - descriptionTextTerm.getLabel()), 78, CRLF + " ", true)); + info.append(multilineWrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), + descriptionTextTerm.getLabel()))); info.append(CRLF); } @@ -883,6 +881,20 @@ private String generateInfoFile() { } + private String multilineWrap(String value) { + // Normalize line breaks and ensure all lines after the first are indented + String[] lines =value.split("\\r?\\n"); + StringBuilder wrappedValue = new StringBuilder(); + for (int i = 0; i < lines.length; i++) { + String wrapped = WordUtils.wrap(lines[i].trim(), 78, CRLF + " ", true); + wrappedValue.append(wrapped); + if (i < lines.length - 1) { + wrappedValue.append(CRLF).append(" "); + } + } + return wrappedValue.toString(); + } + /** * Kludge - compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. 
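A minimal sketch of the round trip that the checksum URI methods in PATCH 01 enable (the ChecksumUriSketch class and its main() are illustrative only and not part of the patch series; the enum, fromUri(), toUri(), and the SHA-256 URI are exactly as added above):

import java.security.MessageDigest;

import edu.harvard.iq.dataverse.DataFile;

public class ChecksumUriSketch {
    public static void main(String[] args) throws Exception {
        // A checksum "@type" value as written into the OAI-ORE export by PATCH 02
        String uri = "http://www.w3.org/2001/04/xmlenc#sha256";
        // fromUri() maps the XML Digital Signature identifier back to the enum constant
        DataFile.ChecksumType type = DataFile.ChecksumType.fromUri(uri);
        // toString() still returns the MessageDigest-compatible name ("SHA-256"),
        // so a checksum read back from an export can be re-verified with the standard JDK API
        MessageDigest digest = MessageDigest.getInstance(type.toString());
        System.out.println(type.toUri() + " -> " + digest.getAlgorithm());
    }
}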
From 7a34db8078b4f1605968163bf839267bdd9e5d19 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:01:09 -0500 Subject: [PATCH 04/39] drop blank lines in multiline values Spec doesn't allow empty lines; dropping whitespace-only lines seems reasonable as well (users can't see from the Dataverse display whether an empty line would appear in bag-info.txt or not if we allow whitespace-only lines (or whitespace beyond the 78 char wrap limit)) --- .../iq/dataverse/util/bagit/BagGenerator.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 69e9c686133..cf5bea08d99 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -886,10 +886,15 @@ private String multilineWrap(String value) { String[] lines =value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); for (int i = 0; i < lines.length; i++) { - String wrapped = WordUtils.wrap(lines[i].trim(), 78, CRLF + " ", true); - wrappedValue.append(wrapped); - if (i < lines.length - 1) { - wrappedValue.append(CRLF).append(" "); + // Skip empty lines - RFC8493 (section 7.3) doesn't allow truly empty lines, + // While trailing whitespace or whitespace-only lines appear to be allowed, it's not clear that handling them adds value (visually identical entries in Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt file + String line = lines[i].trim(); + if (line.length() > 0) { + String wrapped = WordUtils.wrap(line, 78, CRLF + " ", true); + wrappedValue.append(wrapped); + if (i < lines.length - 1) { + wrappedValue.append(CRLF).append(" "); + } } } return wrappedValue.toString(); From b0daad7393a5663b5244ac89e04b0de9c630f9bf Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:02:01 -0500 Subject: [PATCH 05/39] remove title as a folder affects manifest and pid-mapping files as well as data file placement --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index cf5bea08d99..31ae06677c3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -482,14 +482,6 @@ public static String getValidName(String bagName) { private void processContainer(JsonObject item, String currentPath) throws IOException { JsonArray children = getChildren(item); HashSet<String> titles = new HashSet<String>(); - String title = null; - if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { - title = item.get("Title").getAsString(); - } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { - title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); - } - logger.fine("Adding " + title + "/ to path " + currentPath); - currentPath = currentPath + title + "/"; int containerIndex = -1; try { createDir(currentPath); From e5457a8026f4e2e311b2ef84bea7d60f9f8020b4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 9 Dec 2025 10:02:19 -0500 Subject: [PATCH 06/39] handle null deaccession reason --- src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index aa011e2c70a..426d5c9aa5f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -130,7 +130,8 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) { if(vs.equals(VersionState.DEACCESSIONED)) { JsonObjectBuilder deaccBuilder = Json.createObjectBuilder(); deaccBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), vs.name()); - deaccBuilder.add(JsonLDTerm.DVCore("reason").getLabel(), version.getDeaccessionNote()); + // Reason is supposed to not be null, but historically this has not been enforced (in the API) + addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("reason"), version.getDeaccessionNote()); addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("forwardUrl"), version.getDeaccessionLink()); aggBuilder.add(JsonLDTerm.schemaOrg("creativeWorkStatus").getLabel(), deaccBuilder); From 10b0556e1de1c52a9a9cf9a32c9a3c07582ce60a Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 10 Dec 2025 09:55:50 -0500 Subject: [PATCH 07/39] use static to simplify testing --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 31ae06677c3..4f3d0e00280 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -35,7 +35,6 @@ import java.util.logging.Logger; import java.util.zip.ZipEntry; -import edu.harvard.iq.dataverse.util.BundleUtil; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.compress.archivers.zip.ParallelScatterZipCreator; import org.apache.commons.compress.archivers.zip.ScatterZipOutputStream; @@ -77,7 +76,6 @@ import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; -import java.util.Optional; public class BagGenerator { @@ -873,7 +871,7 @@ private String generateInfoFile() { } - private String multilineWrap(String value) { + static private String multilineWrap(String value) { // Normalize line breaks and ensure all lines after the first are indented String[] lines =value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); From 6d241851d8860ddde6d6b1aac952c12ea426eb62 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 10 Dec 2025 13:49:17 -0500 Subject: [PATCH 08/39] Sanitize/split multiline catalog entry, add Dataverse-Bag-Version --- .../iq/dataverse/util/bagit/BagGenerator.java | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 4f3d0e00280..122ca0b6aba 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -77,6 +77,15 @@ import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; +/** + * Creates an archival zipped Bag for long-term storage. 
It is intended to + * include all the information needed to reconstruct the dataset version in a + * new Dataverse instance. + * + * Note that the Dataverse-Bag-Version written in the generateInfoFile() method + * should be updated any time the content/structure of the bag is changed. + * + */ public class BagGenerator { private static final Logger logger = Logger.getLogger(BagGenerator.class.getCanonicalName()); @@ -864,9 +873,13 @@ private String generateInfoFile() { if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString()); + catalog=catalog.trim().replaceAll("[\\r\\n:]","_"); + info.append(catalog + ":" + multilineWrap(aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); + //Add a version number for our bag type - should be updated with any change to the bag content/structure + info.append("Dataverse-Bag-Version: 1.0"); + info.append(CRLF); return info.toString(); } From c4daf28099d4f91705edbe94efcaeecf229ff274 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:00:07 +0100 Subject: [PATCH 09/39] Added unit tests for multilineWrap --- .../bagit/BagGeneratorMultilineWrapTest.java | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java new file mode 100644 index 00000000000..39a713c14e4 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -0,0 +1,102 @@ +package edu.harvard.iq.dataverse.util.bagit; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * Tests adapted for DD-2093: verify the behavior of BagGenerator.multilineWrap. 
+ */ +public class BagGeneratorMultilineWrapTest { + + private static Method multilineWrap; + + @BeforeAll + static void setUp() throws NoSuchMethodException { + // Access the private static method via reflection + multilineWrap = BagGenerator.class.getDeclaredMethod("multilineWrap", String.class); + multilineWrap.setAccessible(true); + } + + private String callMultilineWrap(String input) { + try { + return (String) multilineWrap.invoke(null, input); + } catch (IllegalAccessException | InvocationTargetException e) { + throw new RuntimeException(e); + } + } + + @Test + void shortLine_noWrap() { + String input = "Hello world"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo("Hello world"); + } + + @Test + void exactBoundary_78chars_noWrap() { + String input = repeat('a', 78); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(input); + } + + @Test + void longSingleWord_wrapsAt78WithIndent() { + String input = repeat('a', 100); + String expected = repeat('a', 78) + "\r\n " + repeat('a', 22); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_input_indentsSecondAndSubsequentOriginalLines() { + String input = "Line1\nLine2"; + String expected = "Line1\r\n Line2"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withCRLF_normalizedAndIndented() { + String input = "First line\r\nSecond line"; + String expected = "First line\r\n Second line"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void emptyLines_trimmedAndSkipped() { + String input = "Line1\n\nLine3"; + String expected = "Line1\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void whitespaceOnlyLines_ignored() { + String input = "Line1\n \n\t\t\nLine3"; + String expected = "Line1\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void longSecondLine_preservesIndentOnWraps() { + String line1 = "Header"; + String line2 = repeat('b', 90); + String input = line1 + "\n" + line2; + String expected = "Header\r\n " + repeat('b', 78) + "\r\n " + repeat('b', 12); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + private static String repeat(char c, int n) { + StringBuilder sb = new StringBuilder(n); + for (int i = 0; i < n; i++) sb.append(c); + return sb.toString(); + } +} From e76bc9135fabbbdd4cb79f8fea7ed98e518f57f8 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:09:00 +0100 Subject: [PATCH 10/39] Removed unnecessary repeat helper method --- .../bagit/BagGeneratorMultilineWrapTest.java | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index 39a713c14e4..a212cac6316 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -39,15 +39,15 @@ void shortLine_noWrap() { @Test void exactBoundary_78chars_noWrap() { - String input = repeat('a', 78); + String input = "a".repeat(78); String out = callMultilineWrap(input); assertThat(out).isEqualTo(input); } @Test void longSingleWord_wrapsAt78WithIndent() { - String input = repeat('a', 100); - 
String expected = repeat('a', 78) + "\r\n " + repeat('a', 22); + String input = "a".repeat(100); + String expected = "a".repeat(78) + "\r\n " + "a".repeat(22); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -87,16 +87,10 @@ void whitespaceOnlyLines_ignored() { @Test void longSecondLine_preservesIndentOnWraps() { String line1 = "Header"; - String line2 = repeat('b', 90); + String line2 = "b".repeat(90); String input = line1 + "\n" + line2; - String expected = "Header\r\n " + repeat('b', 78) + "\r\n " + repeat('b', 12); + String expected = "Header\r\n " + "b".repeat(78) + "\r\n " + "b".repeat(12); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } - - private static String repeat(char c, int n) { - StringBuilder sb = new StringBuilder(n); - for (int i = 0; i < n; i++) sb.append(c); - return sb.toString(); - } } From 108c912ee037d23456650e6d5c49c5a943d5ef42 Mon Sep 17 00:00:00 2001 From: Jan van Mansum Date: Thu, 11 Dec 2025 09:17:42 +0100 Subject: [PATCH 11/39] Aligned test names with actual tests being done --- .../util/bagit/BagGeneratorMultilineWrapTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index a212cac6316..71ceec61adf 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -54,15 +54,15 @@ void longSingleWord_wrapsAt78WithIndent() { @Test void multiline_input_indentsSecondAndSubsequentOriginalLines() { - String input = "Line1\nLine2"; - String expected = "Line1\r\n Line2"; + String input = "Line1\nLine2\nLine3"; + String expected = "Line1\r\n Line2\r\n Line3"; String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @Test - void multiline_withCRLF_normalizedAndIndented() { - String input = "First line\r\nSecond line"; + void multiline_withLF_normalizedAndIndented() { + String input = "First line\nSecond line"; String expected = "First line\r\n Second line"; String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); From 884b81b2f0f4aa951d38b18ce8f832643275c542 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Tue, 16 Dec 2025 09:25:50 -0500 Subject: [PATCH 12/39] DD-2098 - allow archivalstatus calls on deaccessioned versions --- src/main/java/edu/harvard/iq/dataverse/api/Datasets.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 2378388c540..12dd984775d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -5006,7 +5006,7 @@ public Response getDatasetVersionArchivalStatus(@Context ContainerRequestContext } DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers); + headers, true); if (dsv.getArchivalCopyLocation() == null) { return error(Status.NOT_FOUND, "This dataset version has not been archived"); @@ -5048,7 +5048,7 @@ public Response setDatasetVersionArchivalStatus(@Context ContainerRequestContext DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, 
findDatasetOrDie(datasetId), - uriInfo, headers); + uriInfo, headers, true); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); @@ -5095,7 +5095,7 @@ public Response deleteDatasetVersionArchivalStatus(@Context ContainerRequestCont DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers); + headers, true); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); } From 3076d69b2074326aee55d5d050b8c7628bdaee92 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 17 Dec 2025 15:36:16 -0500 Subject: [PATCH 13/39] set array properly --- .../java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 122ca0b6aba..473e2bab034 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -763,7 +763,6 @@ private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); - JsonArray contactsArray = new JsonArray(); /* Contact, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change * so we need to find the labels used. */ @@ -775,6 +774,7 @@ private String generateInfoFile() { JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); if (contacts.isJsonArray()) { + JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { info.append("Contact-Name: "); JsonElement person = contactsArray.get(i); From 1a7dafa9bb71412361890d519af21a9549b7f4da Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 09:59:26 -0500 Subject: [PATCH 14/39] DD-2212 - use configured checksum when no files are present --- .../iq/dataverse/util/bagit/BagGenerator.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 473e2bab034..b9de58dce90 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -75,7 +75,10 @@ import edu.harvard.iq.dataverse.pidproviders.PidUtil; import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; + +import edu.harvard.iq.dataverse.util.SystemConfig; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; +import jakarta.enterprise.inject.spi.CDI; /** * Creates an archival zipped Bag for long-term storage. 
It is intended to @@ -153,7 +156,6 @@ public class BagGenerator { public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { this.oremap = oreMap; this.oremapObject = oreMap.getOREMap(); - //(JsonObject) new JsonParser().parse(oreMap.getOREMap().toString()); this.dataciteXml = dataciteXml; try { @@ -189,10 +191,6 @@ public void setIgnoreHashes(boolean val) { ignorehashes = val; } - public void setDefaultCheckSumType(ChecksumType type) { - hashtype=type; - } - public static void println(String s) { System.out.println(s); System.out.flush(); @@ -278,6 +276,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception { String path = sha1Entry.getKey(); sha1StringBuffer.append(sha1Entry.getValue() + " " + path); } + if(hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. + try { + //Use the current type if we can retrieve it + hashtype = CDI.current().select(SystemConfig.class).get().getFileFixityChecksumAlgorithm(); + } catch (Exception e) { + // Default to MD5 if we can't + hashtype=DataFile.ChecksumType.MD5; + } + } if (!(hashtype == null)) { String manifestName = "manifest-"; if (hashtype.equals(DataFile.ChecksumType.SHA1)) { From 7eea57c648f462e58fe1d776dfa7fdcee6c3dc68 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 10:37:37 -0500 Subject: [PATCH 15/39] Revert "DD-2098 - allow archivalstatus calls on deaccessioned versions" This reverts commit 884b81b2f0f4aa951d38b18ce8f832643275c542. --- src/main/java/edu/harvard/iq/dataverse/api/Datasets.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java index 12dd984775d..2378388c540 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Datasets.java @@ -5006,7 +5006,7 @@ public Response getDatasetVersionArchivalStatus(@Context ContainerRequestContext } DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers, true); + headers); if (dsv.getArchivalCopyLocation() == null) { return error(Status.NOT_FOUND, "This dataset version has not been archived"); @@ -5048,7 +5048,7 @@ public Response setDatasetVersionArchivalStatus(@Context ContainerRequestContext DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), - uriInfo, headers, true); + uriInfo, headers); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); @@ -5095,7 +5095,7 @@ public Response deleteDatasetVersionArchivalStatus(@Context ContainerRequestCont DataverseRequest req = createDataverseRequest(au); DatasetVersion dsv = getDatasetVersionOrDie(req, versionNumber, findDatasetOrDie(datasetId), uriInfo, - headers, true); + headers); if (dsv == null) { return error(Status.NOT_FOUND, "Dataset version not found"); } From 2477cf97a2232ca68f8702dcc3706d25fa7216ec Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 12:01:50 -0500 Subject: [PATCH 16/39] add Source-Org as a potential multiline case, remove change to Int Id --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java 
b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b9de58dce90..e78d1f3edf7 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -830,7 +830,7 @@ private String generateInfoFile() { String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append("Source-Organization: " + orgName); + info.append("Source-Organization: " + multilineWrap(orgName)); // ToDo - make configurable info.append(CRLF); @@ -880,8 +880,7 @@ private String generateInfoFile() { if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - catalog=catalog.trim().replaceAll("[\\r\\n:]","_"); - info.append(catalog + ":" + multilineWrap(aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); + info.append(multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); //Add a version number for our bag type - should be updated with any change to the bag content/structure From 3f3908f7ccaed5c961b6bcce057b71f4208bc656 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 12:08:05 -0500 Subject: [PATCH 17/39] release note --- doc/release-notes/12063-ORE-and-Bag-updates.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 doc/release-notes/12063-ORE-and-Bag-updates.md diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md new file mode 100644 index 00000000000..e276232f33a --- /dev/null +++ b/doc/release-notes/12063-ORE-and-Bag-updates.md @@ -0,0 +1,13 @@ +This release contains multiple updates to the OAI-ORE metadata export and archival Bag output: + +OAI-ORE +- now uses URIs for checksum algorithms +- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API) has been fixed +- the "https://schema.org/additionalType" value is updated to "Dataverse OREMap Format v1.0.2" to indicate that the output has changed + +Archival Bag +- for dataset versions with no files, the (empty) manifest-<algorithm>.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5" +- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed +- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). 
+- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) +- a new key, "Dataverse-Bag-Version", has been added to bag-info.txt with a value of "1.0", allowing tracking of changes to Dataverse's archival bag generation \ No newline at end of file From aa44c0895f4cba1dbc6b145b721f2d8b79406440 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 13:58:34 -0500 Subject: [PATCH 18/39] use constants, pass labelLength to wrapping, start custom lineWrap --- .../iq/dataverse/util/bagit/BagGenerator.java | 284 +++++++++++++----- 1 file changed, 205 insertions(+), 79 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index e78d1f3edf7..b253f961b8c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -33,6 +33,8 @@ import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.zip.ZipEntry; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.compress.archivers.zip.ParallelScatterZipCreator; import org.apache.commons.compress.archivers.zip.ScatterZipOutputStream; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; -import org.apache.commons.text.WordUtils; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; @@ -137,6 +138,20 @@ public class BagGenerator { static PrintWriter pw = null; + // Bag-info.txt field labels + private static final String CONTACT_NAME = "Contact-Name: "; + private static final String CONTACT_EMAIL = "Contact-Email: "; + private static final String SOURCE_ORGANIZATION = "Source-Organization: "; + private static final String ORGANIZATION_ADDRESS = "Organization-Address: "; + private static final String ORGANIZATION_EMAIL = "Organization-Email: "; + private static final String EXTERNAL_DESCRIPTION = "External-Description: "; + private static final String BAGGING_DATE = "Bagging-Date: "; + private static final String EXTERNAL_IDENTIFIER = "External-Identifier: "; + private static final String BAG_SIZE = "Bag-Size: "; + private static final String PAYLOAD_OXUM = "Payload-Oxum: "; + private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; + private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; + /** * This BagGenerator creates a BagIt version 1.0 * (https://tools.ietf.org/html/draft-kunze-bagit-16) compliant bag that is also * and zipping are done in parallel, using a connection pool. The required space * on disk is ~ n+1/n of the final bag size, e.g. 125% of the bag size for a * 4-way parallel zip operation. 
- * @throws Exception - * @throws JsonSyntaxException + * + * @throws Exception + * @throws JsonSyntaxException */ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { @@ -159,8 +175,13 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio this.dataciteXml = dataciteXml; try { - // Using Dataverse, all the URLs to be retrieved should be on the current server, so allowing self-signed certs and not verifying hostnames are useful in testing and - // shouldn't be a significant security issue. This should not be allowed for arbitrary OREMap sources. + /* + * Using Dataverse, all the URLs to be retrieved should be on the current + * server, so allowing self-signed certs and not verifying hostnames are useful + * in testing and shouldn't be a significant security issue. This should not be + * allowed for arbitrary OREMap sources. + * + */ SSLContextBuilder builder = new SSLContextBuilder(); try { builder.loadTrustMaterial(null, new TrustSelfSignedStrategy()); @@ -168,10 +189,11 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio e.printStackTrace(); } - SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), NoopHostnameVerifier.INSTANCE); + SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), + NoopHostnameVerifier.INSTANCE); Registry registry = RegistryBuilder.create() - .register("http", PlainConnectionSocketFactory.getSocketFactory()) + .register("http", PlainConnectionSocketFactory.getSocketFactory()) .register("https", sslConnectionFactory).build(); cm = new PoolingHttpClientConnectionManager(registry); @@ -190,7 +212,7 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio public void setIgnoreHashes(boolean val) { ignorehashes = val; } - + public static void println(String s) { System.out.println(s); System.out.flush(); @@ -208,18 +230,18 @@ public static void println(String s) { * @return success true/false */ public boolean generateBag(OutputStream outputStream) throws Exception { - File tmp = File.createTempFile("qdr-scatter-dirs", "tmp"); dirs = ScatterZipOutputStream.fileBased(tmp); - // The oremapObject is javax.json.JsonObject and we need com.google.gson.JsonObject for the aggregation object - aggregation = (JsonObject) new JsonParser().parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); + // The oremapObject is javax.json.JsonObject and we need + // com.google.gson.JsonObject for the aggregation object + aggregation = (JsonObject) new JsonParser() + .parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); String pidUrlString = aggregation.get("@id").getAsString(); - String pidString=PidUtil.parseAsGlobalID(pidUrlString).asString(); - bagID = pidString + "v." - + aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString(); - + String pidString = PidUtil.parseAsGlobalID(pidUrlString).asString(); + bagID = pidString + "v." + aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString(); + logger.info("Generating Bag: " + bagID); try { // Create valid filename from identifier and extend path with @@ -278,11 +300,11 @@ public boolean generateBag(OutputStream outputStream) throws Exception { } if(hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. 
try { - //Use the current type if we can retrieve it + // Use the current type if we can retrieve it hashtype = CDI.current().select(SystemConfig.class).get().getFileFixityChecksumAlgorithm(); } catch (Exception e) { // Default to MD5 if we can't - hashtype=DataFile.ChecksumType.MD5; + hashtype = DataFile.ChecksumType.MD5; } } if (!(hashtype == null)) { @@ -300,7 +322,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { } createFileFromString(manifestName, sha1StringBuffer.toString()); } else { - logger.warning("No Hash values (no files?) sending empty manifest to nominally comply with BagIT specification requirement"); + logger.warning("No Hash value defined sending empty manifest-md5 to nominally comply with BagIT specification requirement"); createFileFromString("manifest-md5.txt", ""); } // bagit.txt - Required by spec @@ -383,7 +405,7 @@ public boolean generateBag(String bagName, boolean temp) { // Create an output stream backed by the file bagFileOS = new FileOutputStream(bagFile); if (generateBag(bagFileOS)) { - //The generateBag call sets this.bagName to the correct value + // The generateBag call sets this.bagName to the correct value validateBagFile(bagFile); if (usetemp) { logger.fine("Moving tmp zip"); @@ -395,7 +417,7 @@ public boolean generateBag(String bagName, boolean temp) { return false; } } catch (Exception e) { - logger.log(Level.SEVERE,"Bag Exception: ", e); + logger.log(Level.SEVERE, "Bag Exception: ", e); e.printStackTrace(); logger.warning("Failure: Processing failure during Bagit file creation"); return false; @@ -452,9 +474,9 @@ public void validateBag(String bagId) { logger.info("HashMap Map contains: " + checksumMap.size() + " entries"); checkFiles(checksumMap, bagFile); } catch (IOException io) { - logger.log(Level.SEVERE,"Could not validate Hashes", io); + logger.log(Level.SEVERE, "Could not validate Hashes", io); } catch (Exception e) { - logger.log(Level.SEVERE,"Could not validate Hashes", e); + logger.log(Level.SEVERE, "Could not validate Hashes", e); } finally { IOUtils.closeQuietly(zf); } @@ -479,7 +501,7 @@ public File getBagFile(String bagID) throws Exception { private void validateBagFile(File bagFile) throws IOException { // Run a confirmation test - should verify all files and hashes - + // Check files calculates the hashes and file sizes and reports on // whether hashes are correct checkFiles(checksumMap, bagFile); @@ -547,28 +569,27 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } String childPath = currentPath + childTitle; JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if(directoryLabel!=null) { - childPath=currentPath + directoryLabel.getAsString() + "/" + childTitle; + if (directoryLabel != null) { + childPath = currentPath + directoryLabel.getAsString() + "/" + childTitle; } - String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromUri( - child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + ChecksumType childHashType = ChecksumType + .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { - //If one wasn't set as a default, pick up what the first child with one uses + // If one wasn't set as a default, pick up what the first child with one uses hashtype = childHashType; } if (hashtype != null && !hashtype.equals(childHashType)) { logger.warning("Multiple hash values in use - will 
calculate " + hashtype.toString() - + " hashes for " + childTitle); + + " hashes for " + childTitle); } else { childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); if (checksumMap.containsValue(childHash)) { // Something else has this hash logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: " - + childHash + " in: " + bagID); + + childHash + " in: " + bagID); } logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); checksumMap.put(childPath, childHash); @@ -736,7 +757,7 @@ private void checkFiles(HashMap shaMap, File bagFile) { } } catch (InterruptedException e) { logger.log(Level.SEVERE, "Hash Calculations interrupted", e); - } + } } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); @@ -770,39 +791,41 @@ private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); - /* Contact, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change - * so we need to find the labels used. - */ + /* + * Contact, and it's subfields, are terms from citation.tsv whose mapping to a + * formal vocabulary and label in the oremap may change so we need to find the + * labels used. + */ JsonLDTerm contactTerm = oremap.getContactTerm(); if ((contactTerm != null) && aggregation.has(contactTerm.getLabel())) { JsonElement contacts = aggregation.get(contactTerm.getLabel()); JsonLDTerm contactNameTerm = oremap.getContactNameTerm(); JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); - + if (contacts.isJsonArray()) { JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { - info.append("Contact-Name: "); + info.append(CONTACT_NAME); JsonElement person = contactsArray.get(i); if (person.isJsonPrimitive()) { info.append(person.getAsString()); info.append(CRLF); } else { - if(contactNameTerm != null) { - info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); - info.append(CRLF); + if (contactNameTerm != null) { + info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); + info.append(CRLF); } - if ((contactEmailTerm!=null) &&((JsonObject) person).has(contactEmailTerm.getLabel())) { - info.append("Contact-Email: "); + if ((contactEmailTerm != null) && ((JsonObject) person).has(contactEmailTerm.getLabel())) { + info.append(CONTACT_EMAIL); info.append(((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString()); info.append(CRLF); } } } } else { - info.append("Contact-Name: "); + info.append(CONTACT_NAME); if (contacts.isJsonPrimitive()) { info.append((String) contacts.getAsString()); @@ -810,12 +833,12 @@ private String generateInfoFile() { } else { JsonObject person = contacts.getAsJsonObject(); - if(contactNameTerm != null) { - info.append(person.get(contactNameTerm.getLabel()).getAsString()); - info.append(CRLF); + if (contactNameTerm != null) { + info.append(person.get(contactNameTerm.getLabel()).getAsString()); + info.append(CRLF); } - if ((contactEmailTerm!=null) && (person.has(contactEmailTerm.getLabel()))) { - info.append("Contact-Email: "); + if ((contactEmailTerm != null) && (person.has(contactEmailTerm.getLabel()))) { + info.append(CONTACT_EMAIL); info.append(person.get(contactEmailTerm.getLabel()).getAsString()); info.append(CRLF); } @@ -826,80 +849,92 @@ private String generateInfoFile() { logger.warning("No contact info available for BagIt Info 
file"); } - String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class).orElse("Dataverse Installation ()"); + String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class) + .orElse("Dataverse Installation ()"); String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append("Source-Organization: " + multilineWrap(orgName)); + info.append(SOURCE_ORGANIZATION + multilineWrap(orgName, SOURCE_ORGANIZATION.length())); // ToDo - make configurable info.append(CRLF); - info.append("Organization-Address: " + multilineWrap(orgAddress)); + info.append(ORGANIZATION_ADDRESS + multilineWrap(orgAddress, ORGANIZATION_ADDRESS.length())); info.append(CRLF); // Not a BagIt standard name - info.append("Organization-Email: " + orgEmail); + info.append(ORGANIZATION_EMAIL + multilineWrap(orgEmail, ORGANIZATION_EMAIL.length())); info.append(CRLF); - info.append("External-Description: "); - - /* Description, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change - * so we need to find the labels used. + info.append(EXTERNAL_DESCRIPTION); + + /* + * Description, and it's subfields, are terms from citation.tsv whose mapping to + * a formal vocabulary and label in the oremap may change so we need to find the + * labels used. */ JsonLDTerm descriptionTerm = oremap.getDescriptionTerm(); JsonLDTerm descriptionTextTerm = oremap.getDescriptionTextTerm(); if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append(multilineWrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), - descriptionTextTerm.getLabel()))); + info.append(multilineWrap( + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()), + EXTERNAL_DESCRIPTION.length())); info.append(CRLF); } - info.append("Bagging-Date: "); + info.append(BAGGING_DATE); info.append((new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime()))); info.append(CRLF); - info.append("External-Identifier: "); + info.append(EXTERNAL_IDENTIFIER); info.append(aggregation.get("@id").getAsString()); info.append(CRLF); - info.append("Bag-Size: "); + info.append(BAG_SIZE); info.append(byteCountToDisplaySize(totalDataSize)); info.append(CRLF); - info.append("Payload-Oxum: "); + info.append(PAYLOAD_OXUM); info.append(Long.toString(totalDataSize)); info.append("."); info.append(Long.toString(dataCount)); info.append(CRLF); - info.append("Internal-Sender-Identifier: "); + info.append(INTERNAL_SENDER_IDENTIFIER); String catalog = orgName + " Catalog"; if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append(multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); + info.append( + multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(), + INTERNAL_SENDER_IDENTIFIER.length())); info.append(CRLF); - //Add a version number for our bag type - should be updated with any change to the bag content/structure - info.append("Dataverse-Bag-Version: 1.0"); + // Add a version number for our bag type - should be updated with any change to + // the bag content/structure + info.append(DATAVERSE_BAG_VERSION + "1.0"); 
info.append(CRLF); return info.toString(); } - static private String multilineWrap(String value) { + static private String multilineWrap(String value, int labelLength) { // Normalize line breaks and ensure all lines after the first are indented - String[] lines =value.split("\\r?\\n"); + String[] lines = value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); for (int i = 0; i < lines.length; i++) { // Skip empty lines - RFC8493 (section 7.3) doesn't allow truly empty lines, - // While trailing whitespace or whitespace-only lines appear to be allowed, it's not clear that handling them adds value (visually identical entries in Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt file + // While trailing whitespace or whitespace-only lines appear to be allowed, it's + // not clear that handling them adds value (visually identical entries in + // Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt + // file String line = lines[i].trim(); if (line.length() > 0) { - String wrapped = WordUtils.wrap(line, 78, CRLF + " ", true); + // Recommended line length, including the label or indents is 79, so we'll wrap + // at 78 to assure subsequent lines with a space are still < 79 total + String wrapped = lineWrap(line, 79, CRLF + " ", true); wrappedValue.append(wrapped); if (i < lines.length - 1) { wrappedValue.append(CRLF).append(" "); @@ -909,25 +944,117 @@ static private String multilineWrap(String value) { return wrappedValue.toString(); } + public static String lineWrap(final String str, int wrapLength, String newLineStr, final boolean wrapLongWords) { + if (str == null) { + return null; + } + if (newLineStr == null) { + newLineStr = System.lineSeparator(); + } + if (wrapLength < 1) { + wrapLength = 1; + } + String wrapOn = " "; + final Pattern patternToWrapOn = Pattern.compile(wrapOn); + final int inputLineLength = str.length(); + int offset = 0; + final StringBuilder wrappedLine = new StringBuilder(inputLineLength + 32); + int matcherSize = -1; + + while (offset < inputLineLength) { + int spaceToWrapAt = -1; + Matcher matcher = patternToWrapOn.matcher(str.substring(offset, + Math.min((int) Math.min(Integer.MAX_VALUE, offset + wrapLength + 1L), inputLineLength))); + if (matcher.find()) { + if (matcher.start() == 0) { + matcherSize = matcher.end(); + if (matcherSize != 0) { + offset += matcher.end(); + continue; + } + offset += 1; + } + spaceToWrapAt = matcher.start() + offset; + } + + // only last line without leading spaces is left + if (inputLineLength - offset <= wrapLength) { + break; + } + + while (matcher.find()) { + spaceToWrapAt = matcher.start() + offset; + } + + if (spaceToWrapAt >= offset) { + // normal case + wrappedLine.append(str, offset, spaceToWrapAt); + wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + + } else // really long word or URL + if (wrapLongWords) { + if (matcherSize == 0) { + offset--; + } + // wrap really long word one line at a time + wrappedLine.append(str, offset, wrapLength + offset); + wrappedLine.append(newLineStr); + offset += wrapLength; + matcherSize = -1; + } else { + // do not wrap really long word, just extend beyond limit + matcher = patternToWrapOn.matcher(str.substring(offset + wrapLength)); + if (matcher.find()) { + matcherSize = matcher.end() - matcher.start(); + spaceToWrapAt = matcher.start() + offset + wrapLength; + } + + if (spaceToWrapAt >= 0) { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, spaceToWrapAt); + 
wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + } else { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, str.length()); + offset = inputLineLength; + matcherSize = -1; + } + } + } + + if (matcherSize == 0 && offset < inputLineLength) { + offset--; + } + + // Whatever is left in line is short enough to just pass through + wrappedLine.append(str, offset, str.length()); + + return wrappedLine.toString(); + } + /** * Kludge - compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. * For cases where multiple values are sent, create a concatenated string so * that information is not lost. * - * @param jsonElement - * - the root json object - * @param key - * - the key to find a value(s) for + * @param jsonElement - the root json object + * @param key - the key to find a value(s) for * @return - a single string */ String getSingleValue(JsonElement jsonElement, String key) { String val = ""; - if(jsonElement.isJsonObject()) { - JsonObject jsonObject=jsonElement.getAsJsonObject(); + if (jsonElement.isJsonObject()) { + JsonObject jsonObject = jsonElement.getAsJsonObject(); val = jsonObject.get(key).getAsString(); } else if (jsonElement.isJsonArray()) { - + Iterator iter = jsonElement.getAsJsonArray().iterator(); ArrayList stringArray = new ArrayList(); while (iter.hasNext()) { @@ -1127,8 +1254,7 @@ public InputStream get() { * Returns a human-readable version of the file size, where the input represents * a specific number of bytes. * - * @param size - * the number of bytes + * @param size the number of bytes * @return a human-readable display value (includes units) */ public static String byteCountToDisplaySize(long size) { From 8227edff5601ec95ea4f8f2851d630265f23cfd4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 14:28:19 -0500 Subject: [PATCH 19/39] update to handle overall 79 char length --- .../iq/dataverse/util/bagit/BagGenerator.java | 53 +++++++------ .../bagit/BagGeneratorMultilineWrapTest.java | 74 +++++++++++++++++-- 2 files changed, 101 insertions(+), 26 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b253f961b8c..847bcc08141 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -854,20 +854,18 @@ private String generateInfoFile() { String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append(SOURCE_ORGANIZATION + multilineWrap(orgName, SOURCE_ORGANIZATION.length())); + info.append(multilineWrap(SOURCE_ORGANIZATION + orgName)); // ToDo - make configurable info.append(CRLF); - info.append(ORGANIZATION_ADDRESS + multilineWrap(orgAddress, ORGANIZATION_ADDRESS.length())); + info.append(multilineWrap(ORGANIZATION_ADDRESS + orgAddress)); info.append(CRLF); // Not a BagIt standard name - info.append(ORGANIZATION_EMAIL + multilineWrap(orgEmail, ORGANIZATION_EMAIL.length())); + info.append(multilineWrap(ORGANIZATION_EMAIL + orgEmail)); info.append(CRLF); - info.append(EXTERNAL_DESCRIPTION); - /* * Description, and it's subfields, are terms from citation.tsv whose mapping to * a formal vocabulary and label in the oremap may change so we need to find the @@ -878,9 +876,8 @@ 
private String generateInfoFile() { if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append(multilineWrap( - getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()), - EXTERNAL_DESCRIPTION.length())); + info.append(multilineWrap(EXTERNAL_DESCRIPTION + + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()))); info.append(CRLF); } @@ -902,14 +899,12 @@ private String generateInfoFile() { info.append(Long.toString(dataCount)); info.append(CRLF); - info.append(INTERNAL_SENDER_IDENTIFIER); String catalog = orgName + " Catalog"; if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append( - multilineWrap(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(), - INTERNAL_SENDER_IDENTIFIER.length())); + info.append(multilineWrap(INTERNAL_SENDER_IDENTIFIER + catalog + ":" + + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); // Add a version number for our bag type - should be updated with any change to @@ -920,7 +915,7 @@ private String generateInfoFile() { } - static private String multilineWrap(String value, int labelLength) { + static private String multilineWrap(String value) { // Normalize line breaks and ensure all lines after the first are indented String[] lines = value.split("\\r?\\n"); StringBuilder wrappedValue = new StringBuilder(); @@ -932,8 +927,7 @@ static private String multilineWrap(String value, int labelLength) { // file String line = lines[i].trim(); if (line.length() > 0) { - // Recommended line length, including the label or indents is 79, so we'll wrap - // at 78 to assure subsequent lines with a space are still < 79 total + // Recommended line length, including the label or indents is 79 String wrapped = lineWrap(line, 79, CRLF + " ", true); wrappedValue.append(wrapped); if (i < lines.length - 1) { @@ -944,6 +938,7 @@ static private String multilineWrap(String value, int labelLength) { return wrappedValue.toString(); } + /** Adapted from Apache WordUtils.wrap() - make subsequent lines shorter by the length of any spaces in newLineStr*/ public static String lineWrap(final String str, int wrapLength, String newLineStr, final boolean wrapLongWords) { if (str == null) { return null; @@ -954,17 +949,30 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt if (wrapLength < 1) { wrapLength = 1; } + + // Calculate the indent length (characters after CRLF in newLineStr) + int indentLength = 0; + int crlfIndex = newLineStr.lastIndexOf("\n"); + if (crlfIndex != -1) { + indentLength = newLineStr.length() - crlfIndex -1; + } + String wrapOn = " "; final Pattern patternToWrapOn = Pattern.compile(wrapOn); final int inputLineLength = str.length(); int offset = 0; final StringBuilder wrappedLine = new StringBuilder(inputLineLength + 32); int matcherSize = -1; + boolean isFirstLine = true; while (offset < inputLineLength) { + // Adjust wrap length based on whether this is the first line or subsequent + // lines + int currentWrapLength = isFirstLine ? 
wrapLength : (wrapLength - indentLength); + int spaceToWrapAt = -1; Matcher matcher = patternToWrapOn.matcher(str.substring(offset, - Math.min((int) Math.min(Integer.MAX_VALUE, offset + wrapLength + 1L), inputLineLength))); + Math.min((int) Math.min(Integer.MAX_VALUE, offset + currentWrapLength + 1L), inputLineLength))); if (matcher.find()) { if (matcher.start() == 0) { matcherSize = matcher.end(); @@ -978,7 +986,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt } // only last line without leading spaces is left - if (inputLineLength - offset <= wrapLength) { + if (inputLineLength - offset <= currentWrapLength) { break; } @@ -991,6 +999,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt wrappedLine.append(str, offset, spaceToWrapAt); wrappedLine.append(newLineStr); offset = spaceToWrapAt + 1; + isFirstLine = false; } else // really long word or URL if (wrapLongWords) { @@ -998,16 +1007,17 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt offset--; } // wrap really long word one line at a time - wrappedLine.append(str, offset, wrapLength + offset); + wrappedLine.append(str, offset, currentWrapLength + offset); wrappedLine.append(newLineStr); - offset += wrapLength; + offset += currentWrapLength; matcherSize = -1; + isFirstLine = false; } else { // do not wrap really long word, just extend beyond limit - matcher = patternToWrapOn.matcher(str.substring(offset + wrapLength)); + matcher = patternToWrapOn.matcher(str.substring(offset + currentWrapLength)); if (matcher.find()) { matcherSize = matcher.end() - matcher.start(); - spaceToWrapAt = matcher.start() + offset + wrapLength; + spaceToWrapAt = matcher.start() + offset + currentWrapLength; } if (spaceToWrapAt >= 0) { @@ -1017,6 +1027,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt wrappedLine.append(str, offset, spaceToWrapAt); wrappedLine.append(newLineStr); offset = spaceToWrapAt + 1; + isFirstLine = false; } else { if (matcherSize == 0 && offset != 0) { offset--; diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java index 71ceec61adf..19d478f4b0d 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -1,3 +1,4 @@ + package edu.harvard.iq.dataverse.util.bagit; import static org.assertj.core.api.Assertions.assertThat; @@ -47,7 +48,7 @@ void exactBoundary_78chars_noWrap() { @Test void longSingleWord_wrapsAt78WithIndent() { String input = "a".repeat(100); - String expected = "a".repeat(78) + "\r\n " + "a".repeat(22); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -62,8 +63,8 @@ void multiline_input_indentsSecondAndSubsequentOriginalLines() { @Test void multiline_withLF_normalizedAndIndented() { - String input = "First line\nSecond line"; - String expected = "First line\r\n Second line"; + String input = "a".repeat(200); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(78) + "\r\n " + "a".repeat(43); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } @@ -89,8 +90,71 @@ void longSecondLine_preservesIndentOnWraps() { String line1 = "Header"; String line2 = "b".repeat(90); String input = line1 + "\n" + line2; - 
String expected = "Header\r\n " + "b".repeat(78) + "\r\n " + "b".repeat(12); + String expected = "Header\r\n " + "b".repeat(79) + "\r\n " + "b".repeat(11); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_reducesFirstLineMaxLength() { + // With a label of length 20, first line should wrap at 78-20=58 chars + String label = "l".repeat(20); + String input = label + "a".repeat(150); + // First line: 58 chars, subsequent lines: 78 + String expected = label + "a".repeat(59) + "\r\n " + "a".repeat(78) + "\r\n " + "a".repeat(13); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_zero_behavesAsDefault() { + String input = "a".repeat(100); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_withMultipleLines_onlyAffectsFirstLine() { + String label = "l".repeat(15); + String input = label + "a".repeat(100) + "\nSecond line content"; + // First line wraps at 79-15=64, then continues at 78 per line + // Second line starts fresh and wraps normally + String expected = label + "a".repeat(64) + "\r\n " + "a".repeat(36) + "\r\n Second line content"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_notMidWord() { + // Create a string with a word boundary at position 75 + // "a" repeated 75 times, then a space, then more characters + String input = "a".repeat(75) + " " + "b".repeat(20); + // Should wrap at the space (position 75), not at position 79 + String expected = "a".repeat(75) + "\r\n " + "b".repeat(20); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_multipleSpaces() { + // Test with word boundary closer to the limit + String input = "a".repeat(70) + " word " + "b".repeat(20); + // Should wrap after "word" (at position 76) + String expected = "a".repeat(70) + " word\r\n " + "b".repeat(20); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_withLabelLength() { + String label = "l".repeat(20); + // With label length=20, first line wraps at 78-20=58 + // Create string with word boundary at position 55 + String input = label + "a".repeat(55) + " " + "b".repeat(30); + // Should wrap at the space (position 55) + String expected = label + "a".repeat(55) + "\r\n " + "b".repeat(30); String out = callMultilineWrap(input); assertThat(out).isEqualTo(expected); } -} +} \ No newline at end of file From d0749fcd39abefcf0ee13c6fcb042d235f6119dd Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 14:33:41 -0500 Subject: [PATCH 20/39] wrap any other potentially long values --- .../iq/dataverse/util/bagit/BagGenerator.java | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 847bcc08141..b4a80d4d9a9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -806,40 +806,36 @@ private String generateInfoFile() { if (contacts.isJsonArray()) { JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { - info.append(CONTACT_NAME); + 
JsonElement person = contactsArray.get(i); if (person.isJsonPrimitive()) { - info.append(person.getAsString()); + info.append(multilineWrap(CONTACT_NAME + person.getAsString())); info.append(CRLF); } else { if (contactNameTerm != null) { - info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_NAME + ((JsonObject) person).get(contactNameTerm.getLabel()).getAsString())); info.append(CRLF); } if ((contactEmailTerm != null) && ((JsonObject) person).has(contactEmailTerm.getLabel())) { - info.append(CONTACT_EMAIL); - info.append(((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_EMAIL + ((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString())); info.append(CRLF); } } } } else { - info.append(CONTACT_NAME); - if (contacts.isJsonPrimitive()) { - info.append((String) contacts.getAsString()); + info.append(multilineWrap(CONTACT_NAME + (String) contacts.getAsString())); info.append(CRLF); } else { JsonObject person = contacts.getAsJsonObject(); if (contactNameTerm != null) { - info.append(person.get(contactNameTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_NAME + person.get(contactNameTerm.getLabel()).getAsString())); info.append(CRLF); } if ((contactEmailTerm != null) && (person.has(contactEmailTerm.getLabel()))) { - info.append(CONTACT_EMAIL); - info.append(person.get(contactEmailTerm.getLabel()).getAsString()); + info.append(multilineWrap(CONTACT_EMAIL + person.get(contactEmailTerm.getLabel()).getAsString())); info.append(CRLF); } } @@ -885,8 +881,7 @@ private String generateInfoFile() { info.append((new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime()))); info.append(CRLF); - info.append(EXTERNAL_IDENTIFIER); - info.append(aggregation.get("@id").getAsString()); + info.append(multilineWrap(EXTERNAL_IDENTIFIER + aggregation.get("@id").getAsString())); info.append(CRLF); info.append(BAG_SIZE); From 24a625f187ecb662b242d613e3fe8d48dd9a9e92 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 19 Dec 2025 15:03:14 -0500 Subject: [PATCH 21/39] cleanup deprecated code, auto-gen comments --- .../iq/dataverse/util/bagit/BagGenerator.java | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index b4a80d4d9a9..adca7dd40c3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -235,8 +235,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception { dirs = ScatterZipOutputStream.fileBased(tmp); // The oremapObject is javax.json.JsonObject and we need // com.google.gson.JsonObject for the aggregation object - aggregation = (JsonObject) new JsonParser() - .parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); + aggregation = (JsonObject) JsonParser + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); String pidUrlString = aggregation.get("@id").getAsString(); String pidString = PidUtil.parseAsGlobalID(pidUrlString).asString(); @@ -394,7 +394,6 @@ public boolean generateBag(OutputStream outputStream) throws Exception { public boolean generateBag(String bagName, boolean temp) { usetemp = temp; - FileOutputStream bagFileOS = null; try { File origBagFile = getBagFile(bagName); File 
bagFile = origBagFile; @@ -403,36 +402,36 @@ public boolean generateBag(String bagName, boolean temp) { logger.fine("Writing to: " + bagFile.getAbsolutePath()); } // Create an output stream backed by the file - bagFileOS = new FileOutputStream(bagFile); - if (generateBag(bagFileOS)) { - // The generateBag call sets this.bagName to the correct value - validateBagFile(bagFile); - if (usetemp) { - logger.fine("Moving tmp zip"); - origBagFile.delete(); - bagFile.renameTo(origBagFile); + try (FileOutputStream bagFileOS = new FileOutputStream(bagFile)) { + if (generateBag(bagFileOS)) { + // The generateBag call sets this.bagName to the correct value + validateBagFile(bagFile); + if (usetemp) { + logger.fine("Moving tmp zip"); + origBagFile.delete(); + bagFile.renameTo(origBagFile); + } + return true; + } else { + return false; } - return true; - } else { - return false; } } catch (Exception e) { logger.log(Level.SEVERE, "Bag Exception: ", e); e.printStackTrace(); logger.warning("Failure: Processing failure during Bagit file creation"); return false; - } finally { - IOUtils.closeQuietly(bagFileOS); } } + @SuppressWarnings("deprecation") public void validateBag(String bagId) { logger.info("Validating Bag"); ZipFile zf = null; InputStream is = null; try { File bagFile = getBagFile(bagId); - zf = new ZipFile(bagFile); + zf = ZipFile.builder().setFile(bagFile).get(); ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); if (entry != null) { logger.info("SHA1 hashes used"); @@ -602,9 +601,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce try { if ((childHash == null) | ignorehashes) { // Generate missing hashInputStream inputStream = null; - InputStream inputStream = null; - try { - inputStream = getInputStreamSupplier(dataUrl).get(); + try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()) { if (hashtype != null) { if (hashtype.equals(DataFile.ChecksumType.SHA1)) { @@ -621,8 +618,6 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } catch (IOException e) { logger.severe("Failed to read " + childPath); throw e; - } finally { - IOUtils.closeQuietly(inputStream); } if (childHash != null) { JsonObject childHashObject = new JsonObject(); @@ -732,9 +727,7 @@ private void createFileFromURL(final String relPath, final String uri) private void checkFiles(HashMap shaMap, File bagFile) { ExecutorService executor = Executors.newFixedThreadPool(numConnections); - ZipFile zf = null; - try { - zf = new ZipFile(bagFile); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { BagValidationJob.setZipFile(zf); BagValidationJob.setBagGenerator(this); @@ -759,10 +752,7 @@ private void checkFiles(HashMap shaMap, File bagFile) { logger.log(Level.SEVERE, "Hash Calculations interrupted", e); } } catch (IOException e1) { - // TODO Auto-generated catch block e1.printStackTrace(); - } finally { - IOUtils.closeQuietly(zf); } logger.fine("Hash Validations Completed"); @@ -1153,10 +1143,8 @@ private HttpGet createNewGetRequest(URI url, String returnType) { urlString = urlString + ((urlString.indexOf('?') != -1) ? 
"&key=" : "?key=") + apiKey; request = new HttpGet(new URI(urlString)); } catch (MalformedURLException e) { - // TODO Auto-generated catch block e.printStackTrace(); } catch (URISyntaxException e) { - // TODO Auto-generated catch block e.printStackTrace(); } } else { @@ -1211,7 +1199,6 @@ public InputStream get() { } } catch (ClientProtocolException e) { tries += 5; - // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // Retry if this is a potentially temporary error such @@ -1228,7 +1215,6 @@ public InputStream get() { } } catch (URISyntaxException e) { - // TODO Auto-generated catch block e.printStackTrace(); } logger.severe("Could not read: " + uriString); From bf036f3f85066a6a148af9fff3119d8156e63d0b Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 16:26:17 -0500 Subject: [PATCH 22/39] update comment --- .../java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index adca7dd40c3..3c82a9719d3 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -1035,7 +1035,7 @@ public static String lineWrap(final String str, int wrapLength, String newLineSt } /** - * Kludge - compound values (e.g. for descriptions) are sent as an array of + * Compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. * For cases where multiple values are sent, create a concatenated string so * that information is not lost. From be65611fb9578c96ed4a1aa28e730a693b85f437 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 16:26:54 -0500 Subject: [PATCH 23/39] add tests --- .../util/bagit/BagGeneratorInfoFileTest.java | 295 ++++++++++++++++++ 1 file changed, 295 insertions(+) create mode 100644 src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java new file mode 100644 index 00000000000..dbbf3241318 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java @@ -0,0 +1,295 @@ + +package edu.harvard.iq.dataverse.util.bagit; + +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import com.google.gson.JsonParser; + +import jakarta.json.Json; +import jakarta.json.JsonArrayBuilder; +import jakarta.json.JsonObject; +import jakarta.json.JsonObjectBuilder; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +public class BagGeneratorInfoFileTest { + + private BagGenerator bagGenerator; + private JsonObjectBuilder testAggregationBuilder; + + @Mock + private OREMap mockOreMap; + + @BeforeEach + public void setUp() throws Exception { + MockitoAnnotations.openMocks(this); + + // Create base test aggregation builder with required fields + testAggregationBuilder = Json.createObjectBuilder(); + testAggregationBuilder.add("@id", "doi:10.5072/FK2/TEST123"); + 
testAggregationBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), "Test Dataset"); + testAggregationBuilder.add(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel(), "Test Catalog"); + } + + /** + * Helper method to finalize the aggregation and create the BagGenerator + */ + private void initializeBagGenerator() throws Exception { + JsonObject testAggregation = testAggregationBuilder.build(); + + JsonObjectBuilder oremapJsonBuilder = Json.createObjectBuilder(); + oremapJsonBuilder.add(JsonLDTerm.ore("describes").getLabel(), testAggregation); + JsonObject oremapObject = oremapJsonBuilder.build(); + // Mock the OREMap.getOREMap() method to return the built JSON + when(mockOreMap.getOREMap()).thenReturn(oremapObject); + + // Initialize BagGenerator with test data + bagGenerator = new BagGenerator(mockOreMap, ""); + setPrivateField(bagGenerator, "aggregation", (com.google.gson.JsonObject) JsonParser + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString())); + setPrivateField(bagGenerator, "totalDataSize", 1024000L); + setPrivateField(bagGenerator, "dataCount", 10L); + } + + @Test + public void testGenerateInfoFileWithSingleContact() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm contactEmailTerm = JsonLDTerm.schemaOrg("email"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(contactEmailTerm); + + JsonObjectBuilder contactBuilder = Json.createObjectBuilder(); + contactBuilder.add(contactNameTerm.getLabel(), "John Doe"); + contactBuilder.add(contactEmailTerm.getLabel(), "john.doe@example.com"); + testAggregationBuilder.add(contactTerm.getLabel(), contactBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: John Doe")); + assertTrue(infoFile.contains("Contact-Email: john.doe@example.com")); + } + + @Test + public void testGenerateInfoFileWithMultipleContacts() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm contactEmailTerm = JsonLDTerm.schemaOrg("email"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(contactEmailTerm); + + JsonArrayBuilder contactsBuilder = Json.createArrayBuilder(); + + JsonObjectBuilder contact1 = Json.createObjectBuilder(); + contact1.add(contactNameTerm.getLabel(), "John Doe"); + contact1.add(contactEmailTerm.getLabel(), "john.doe@example.com"); + + JsonObjectBuilder contact2 = Json.createObjectBuilder(); + contact2.add(contactNameTerm.getLabel(), "Jane Smith"); + contact2.add(contactEmailTerm.getLabel(), "jane.smith@example.com"); + + JsonObjectBuilder contact3 = Json.createObjectBuilder(); + contact3.add(contactNameTerm.getLabel(), "Bob Johnson"); + contact3.add(contactEmailTerm.getLabel(), "bob.johnson@example.com"); + + contactsBuilder.add(contact1); + contactsBuilder.add(contact2); + contactsBuilder.add(contact3); + + testAggregationBuilder.add(contactTerm.getLabel(), contactsBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + 
assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: John Doe")); + assertTrue(infoFile.contains("Contact-Email: john.doe@example.com")); + assertTrue(infoFile.contains("Contact-Name: Jane Smith")); + assertTrue(infoFile.contains("Contact-Email: jane.smith@example.com")); + assertTrue(infoFile.contains("Contact-Name: Bob Johnson")); + assertTrue(infoFile.contains("Contact-Email: bob.johnson@example.com")); + } + + @Test + public void testGenerateInfoFileWithSingleDescription() throws Exception { + // Arrange + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonObjectBuilder descriptionBuilder = Json.createObjectBuilder(); + descriptionBuilder.add(descriptionTextTerm.getLabel(), "This is a test dataset description."); + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("External-Description: This is a test dataset description.")); + } + + @Test + public void testGenerateInfoFileWithMultipleDescriptions() throws Exception { + // Arrange + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonArrayBuilder descriptionsBuilder = Json.createArrayBuilder(); + + JsonObjectBuilder desc1 = Json.createObjectBuilder(); + desc1.add(descriptionTextTerm.getLabel(), "First description of the dataset."); + + JsonObjectBuilder desc2 = Json.createObjectBuilder(); + desc2.add(descriptionTextTerm.getLabel(), "Second description with additional details."); + + JsonObjectBuilder desc3 = Json.createObjectBuilder(); + desc3.add(descriptionTextTerm.getLabel(), "Third description for completeness."); + + descriptionsBuilder.add(desc1); + descriptionsBuilder.add(desc2); + descriptionsBuilder.add(desc3); + + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionsBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + // Assert + assertNotNull(infoFile); + // Multiple descriptions should be concatenated with commas as per getSingleValue method + assertTrue(infoFile.contains("External-Description: First description of the dataset.,Second description with\r\n additional details.,Third description for completeness.")); + } + + @Test + public void testGenerateInfoFileWithRequiredFields() throws Exception { + // Arrange - minimal setup with required fields already in setUp() + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(null); + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonObjectBuilder contactBuilder = 
Json.createObjectBuilder(); + contactBuilder.add(contactNameTerm.getLabel(), "Test Contact"); + testAggregationBuilder.add(contactTerm.getLabel(), contactBuilder); + + JsonObjectBuilder descriptionBuilder = Json.createObjectBuilder(); + descriptionBuilder.add(descriptionTextTerm.getLabel(), "Test description"); + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: Test Contact")); + assertTrue(infoFile.contains("External-Description: Test description")); + assertTrue(infoFile.contains("Source-Organization:")); + assertTrue(infoFile.contains("Organization-Address:")); + assertTrue(infoFile.contains("Organization-Email:")); + assertTrue(infoFile.contains("Bagging-Date:")); + assertTrue(infoFile.contains("External-Identifier: doi:10.5072/FK2/TEST123")); + assertTrue(infoFile.contains("Bag-Size:")); + assertTrue(infoFile.contains("Payload-Oxum: 1024000.10")); + assertTrue(infoFile.contains("Internal-Sender-Identifier: Test Catalog:Test Dataset")); + } + + @Test + public void testGenerateInfoFileWithDifferentBagSizes() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(null); + when(mockOreMap.getContactEmailTerm()).thenReturn(null); + when(mockOreMap.getDescriptionTerm()).thenReturn(null); + + initializeBagGenerator(); + + // Test with bytes + setPrivateField(bagGenerator, "totalDataSize", 512L); + setPrivateField(bagGenerator, "dataCount", 5L); + String infoFile1 = invokeGenerateInfoFile(); + assertTrue(infoFile1.contains("Bag-Size: 512 bytes")); + assertTrue(infoFile1.contains("Payload-Oxum: 512.5")); + + // Test with KB + setPrivateField(bagGenerator, "totalDataSize", 2048L); + setPrivateField(bagGenerator, "dataCount", 3L); + String infoFile2 = invokeGenerateInfoFile(); + assertTrue(infoFile2.contains("Bag-Size: 2.05 KB")); + assertTrue(infoFile2.contains("Payload-Oxum: 2048.3")); + + // Test with MB + setPrivateField(bagGenerator, "totalDataSize", 5242880L); + setPrivateField(bagGenerator, "dataCount", 100L); + String infoFile3 = invokeGenerateInfoFile(); + assertTrue(infoFile3.contains("Bag-Size: 5.24 MB")); + assertTrue(infoFile3.contains("Payload-Oxum: 5242880.100")); + + // Test with GB + setPrivateField(bagGenerator, "totalDataSize", 2147483648L); + setPrivateField(bagGenerator, "dataCount", 1000L); + + String infoFile4 = invokeGenerateInfoFile(); + assertTrue(infoFile4.contains("Bag-Size: 2.15 GB")); + assertTrue(infoFile4.contains("Payload-Oxum: 2147483648.1000")); + } + + // Helper methods + + /** + * Invokes the private generateInfoFile method using reflection + */ + private String invokeGenerateInfoFile() throws Exception { + Method method = BagGenerator.class.getDeclaredMethod("generateInfoFile"); + method.setAccessible(true); + return (String) method.invoke(bagGenerator); + } + + /** + * Sets a private field value using reflection + */ + private void setPrivateField(Object target, String fieldName, Object value) throws Exception { + Field field = BagGenerator.class.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } +} \ No newline at end of file From 24d098a0f70dff33c6ca48049ed0e668e8809792 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 17:00:19 -0500 Subject: [PATCH 24/39] QDR 
updates to apache 5, better fault tolerance for file retrieval --- .../iq/dataverse/util/bagit/BagGenerator.java | 172 +++++++++++------- 1 file changed, 111 insertions(+), 61 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 3c82a9719d3..5c5b88a521b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -4,12 +4,15 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; +import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.InterruptedIOException; import java.io.OutputStream; import java.io.PrintWriter; import java.net.MalformedURLException; +import java.net.SocketTimeoutException; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; @@ -46,23 +49,24 @@ import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; -import org.apache.http.client.ClientProtocolException; -import org.apache.http.client.config.CookieSpecs; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.config.Registry; -import org.apache.http.config.RegistryBuilder; -import org.apache.http.conn.socket.ConnectionSocketFactory; -import org.apache.http.conn.socket.PlainConnectionSocketFactory; -import org.apache.http.conn.ssl.NoopHostnameVerifier; -import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.conn.ssl.TrustSelfSignedStrategy; -import org.apache.http.ssl.SSLContextBuilder; -import org.apache.http.util.EntityUtils; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.ClientProtocolException; +import org.apache.hc.client5.http.classic.methods.HttpGet; +import org.apache.hc.client5.http.config.RequestConfig; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; +import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.protocol.HttpClientContext; +import org.apache.hc.client5.http.socket.ConnectionSocketFactory; +import org.apache.hc.client5.http.socket.PlainConnectionSocketFactory; +import org.apache.hc.client5.http.ssl.NoopHostnameVerifier; +import org.apache.hc.client5.http.ssl.SSLConnectionSocketFactory; +import org.apache.hc.client5.http.ssl.TrustSelfSignedStrategy; +import org.apache.hc.core5.http.HttpEntity; +import org.apache.hc.core5.http.config.Registry; +import org.apache.hc.core5.http.config.RegistryBuilder; +import org.apache.hc.core5.ssl.SSLContextBuilder; +import org.apache.hc.core5.util.Timeout; import org.json.JSONArray; import com.google.gson.JsonArray; import com.google.gson.JsonElement; @@ -103,10 +107,11 @@ public class BagGenerator { private HashMap pidMap = new LinkedHashMap(); private HashMap checksumMap = new LinkedHashMap(); - private int timeout = 60; - private RequestConfig config = 
RequestConfig.custom().setConnectTimeout(timeout * 1000) - .setConnectionRequestTimeout(timeout * 1000).setSocketTimeout(timeout * 1000) - .setCookieSpec(CookieSpecs.STANDARD).build(); + private int timeout = 300; + private RequestConfig config = RequestConfig.custom() + .setConnectionRequestTimeout(Timeout.ofSeconds(timeout)) + .setResponseTimeout(Timeout.ofSeconds(timeout)) + .build(); protected CloseableHttpClient client; private PoolingHttpClientConnectionManager cm = null; @@ -131,7 +136,7 @@ public class BagGenerator { private boolean usetemp = false; - private int numConnections = 8; + private static int numConnections = 2; public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); private OREMap oremap; @@ -152,6 +157,11 @@ public class BagGenerator { private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; + // Implement exponential backoff with jitter + static final long baseWaitTimeMs = 1000; // Start with 1 second + static final long maxWaitTimeMs = 30000; // Cap at 30 seconds + + /** * This BagGenerator creates a BagIt version 1.0 * (https://tools.ietf.org/html/draft-kunze-bagit-16) compliant bag that is also @@ -189,8 +199,10 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio e.printStackTrace(); } - SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), - NoopHostnameVerifier.INSTANCE); + SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory( + builder.build(), + NoopHostnameVerifier.INSTANCE + ); Registry registry = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.getSocketFactory()) @@ -200,11 +212,14 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio cm.setDefaultMaxPerRoute(numConnections); cm.setMaxTotal(numConnections > 20 ? numConnections : 20); - client = HttpClients.custom().setConnectionManager(cm).setDefaultRequestConfig(config).build(); + client = HttpClients.custom() + .setConnectionManager(cm) + .setDefaultRequestConfig(config) + .build(); scatterZipCreator = new ParallelScatterZipCreator(Executors.newFixedThreadPool(numConnections)); } catch (NoSuchAlgorithmException | KeyManagementException e) { - logger.warning("Aint gonna work"); + logger.warning("Failed to initialize HTTP client"); e.printStackTrace(); } } @@ -424,7 +439,6 @@ public boolean generateBag(String bagName, boolean temp) { } } - @SuppressWarnings("deprecation") public void validateBag(String bagId) { logger.info("Validating Bag"); ZipFile zf = null; @@ -1156,6 +1170,10 @@ private HttpGet createNewGetRequest(URI url, String returnType) { return request; } + /** Get a stream supplier for the given URI. + * + * Caller must close the stream when done. + */ InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { @@ -1168,56 +1186,88 @@ public InputStream get() { logger.fine("Get # " + tries + " for " + uriString); HttpGet getFile = createNewGetRequest(uri, null); - logger.finest("Retrieving " + tries + ": " + uriString); - CloseableHttpResponse response = null; + try { - response = client.execute(getFile); - // Note - if we ever need to pass an HttpClientContext, we need a new one per - // thread. 
- int statusCode = response.getStatusLine().getStatusCode(); + // Execute the request directly and keep the response open + final CloseableHttpResponse response = (CloseableHttpResponse) client.executeOpen(null, getFile, HttpClientContext.create()); + int statusCode = response.getCode(); + if (statusCode == 200) { logger.finest("Retrieved: " + uri); - return response.getEntity().getContent(); - } - logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString - + " : " + statusCode); - if (statusCode < 500) { - logger.fine("Will not retry for 40x errors"); - tries += 5; + // Return a wrapped stream that will close the response when the stream is closed + final HttpEntity entity = response.getEntity(); + if (entity != null) { + // Create a wrapper stream that closes the response when the stream is closed + return new FilterInputStream(entity.getContent()) { + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + response.close(); + } + } + }; + } else { + response.close(); + logger.warning("No content in response for: " + uriString); + return null; + } } else { + // Close the response for non-200 responses + response.close(); + + logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString + + " : " + statusCode); tries++; - } - // Error handling - if (response != null) { try { - EntityUtils.consumeQuietly(response.getEntity()); - response.close(); - } catch (IOException io) { - logger.warning( - "Exception closing response after status: " + statusCode + " on " + uri); + // Calculate exponential backoff: 2^tries * baseWaitTimeMs (1 sec) + long waitTime = (long) (Math.pow(2, tries) * baseWaitTimeMs); + + // Add jitter: random value between 0-30% of the wait time + long jitter = (long) (waitTime * 0.3 * Math.random()); + waitTime = waitTime + jitter; + + // Cap the wait time at maxWaitTimeMs (30 seconds) + waitTime = Math.min(waitTime, maxWaitTimeMs); + + logger.fine("Sleeping for " + waitTime + "ms before retry attempt " + tries); + Thread.sleep(waitTime); + } catch (InterruptedException ie) { + logger.log(Level.SEVERE, "InterruptedException during retry delay for file: " + uriString, ie); + Thread.currentThread().interrupt(); // Restore interrupt status + tries += 5; // Skip remaining attempts } } } catch (ClientProtocolException e) { tries += 5; - e.printStackTrace(); + logger.log(Level.SEVERE, "ClientProtocolException when retrieving file: " + uriString + " (attempt " + tries + ")", e); + } catch (SocketTimeoutException e) { + // Specific handling for timeout exceptions + tries++; + logger.log(Level.SEVERE, "SocketTimeoutException when retrieving file: " + uriString + " (attempt " + tries + " of 5) - Request exceeded timeout", e); + if (tries == 5) { + logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries due to timeouts: " + uriString, e); + } + } catch (InterruptedIOException e) { + // Catches interruptions during I/O operations + tries += 5; + logger.log(Level.SEVERE, "InterruptedIOException when retrieving file: " + uriString + " - Operation was interrupted", e); + Thread.currentThread().interrupt(); // Restore interrupt status } catch (IOException e) { - // Retry if this is a potentially temporary error such - // as a timeout + // Retry if this is a potentially temporary error such as a timeout tries++; - logger.log(Level.WARNING, "Attempt# " + tries + " : Unable to retrieve file: " + uriString, - e); + logger.log(Level.WARNING, "IOException when retrieving 
file: " + uriString + " (attempt " + tries + " of 5)", e); if (tries == 5) { - logger.severe("Final attempt failed for " + uriString); + logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries: " + uriString, e); } - e.printStackTrace(); } } - } catch (URISyntaxException e) { - e.printStackTrace(); + logger.log(Level.SEVERE, "URISyntaxException for file: " + uriString + " - Invalid URI format", e); } - logger.severe("Could not read: " + uriString); + logger.severe("FAILED TO RETRIEVE FILE after all retries: " + uriString); return null; } }; @@ -1268,9 +1318,9 @@ public void setAuthenticationKey(String tokenString) { apiKey = tokenString; } - public void setNumConnections(int numConnections) { - this.numConnections = numConnections; - logger.fine("BagGenerator will use " + numConnections + " threads"); + public static void setNumConnections(int numConnections) { + BagGenerator.numConnections = numConnections; + logger.fine("All BagGenerators will use " + numConnections + " threads"); } } \ No newline at end of file From b4a3799ca82aa48e299e8d5a4351da62b4cad29c Mon Sep 17 00:00:00 2001 From: qqmyers Date: Mon, 22 Dec 2025 17:06:56 -0500 Subject: [PATCH 25/39] release note update --- doc/release-notes/12063-ORE-and-Bag-updates.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md index e276232f33a..b2926f40c96 100644 --- a/doc/release-notes/12063-ORE-and-Bag-updates.md +++ b/doc/release-notes/12063-ORE-and-Bag-updates.md @@ -10,4 +10,5 @@ Archival Bag - a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed - values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). - the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping) -- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation \ No newline at end of file +- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's arhival bag generation +- improvements to file retrieval w.r.t. retries on errors or throttling \ No newline at end of file From 1b429780634c9cb0140fdc2a4cd5475320559cd4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 28 Jan 2026 15:51:35 -0500 Subject: [PATCH 26/39] suppress counting file retrieval to bag as a download in gb table --- doc/release-notes/12063-ORE-and-Bag-updates.md | 3 ++- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md index b2926f40c96..bbc22b22182 100644 --- a/doc/release-notes/12063-ORE-and-Bag-updates.md +++ b/doc/release-notes/12063-ORE-and-Bag-updates.md @@ -11,4 +11,5 @@ Archival Bag - values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address). 
 - the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping)
-- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's archival bag generation
-- improvements to file retrieval w.r.t. retries on errors or throttling
\ No newline at end of file
+- a new key, "Dataverse-Bag-Version" has been added to bag-info.txt with a value "1.0", allowing tracking of changes to Dataverse's archival bag generation
+- improvements to file retrieval w.r.t. retries on errors or throttling
+- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse
\ No newline at end of file
diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
index 5c5b88a521b..1864361d755 100644
--- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
@@ -1179,12 +1179,14 @@ InputStreamSupplier getInputStreamSupplier(final String uriString) {
         return new InputStreamSupplier() {
             public InputStream get() {
                 try {
-                    URI uri = new URI(uriString);
-
+                    // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use)
+                    String modifiedUriString = uriString + (uriString.contains("?") ? "&" : "?") + "gbrecs=true";
+                    URI uri = new URI(modifiedUriString);
+                    logger.finest("Final URI used (with gbrecs param): " + modifiedUriString);
                     int tries = 0;
                     while (tries < 5) {
-                        logger.fine("Get # " + tries + " for " + uriString);
+                        logger.finest("Get # " + tries + " for " + uriString);
                         HttpGet getFile = createNewGetRequest(uri, null);
 
                         try {
From 49f4818c4954f7caf677e7def281437b0b3a9ba5 Mon Sep 17 00:00:00 2001
From: qqmyers
Date: Fri, 30 Jan 2026 11:17:15 -0500
Subject: [PATCH 27/39] basic fetch

---
 .../iq/dataverse/settings/JvmSettings.java    |  4 +
 .../iq/dataverse/util/bagit/BagGenerator.java | 88 +++++++++++++++++--
 2 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java
index 05390ba8a8c..b32b7a8d77d 100644
--- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java
+++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java
@@ -276,6 +276,10 @@ public enum JvmSettings {
     BAGIT_SOURCE_ORG_NAME(SCOPE_BAGIT_SOURCEORG, "name"),
     BAGIT_SOURCEORG_ADDRESS(SCOPE_BAGIT_SOURCEORG, "address"),
     BAGIT_SOURCEORG_EMAIL(SCOPE_BAGIT_SOURCEORG, "email"),
+    SCOPE_BAGIT_HOLEY(SCOPE_BAGIT, "holey"),
+    BAGIT_HOLEY_MAX_FILE_SIZE(SCOPE_BAGIT_HOLEY, "max-file-size"),
+    BAGIT_HOLEY_MAX_DATA_SIZE(SCOPE_BAGIT_HOLEY, "max-data-size"),
+
 
     // STORAGE USE SETTINGS
     SCOPE_STORAGEUSE(PREFIX, "storageuse"),
diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
index 1864361d755..e61ba6b7b0e 100644
--- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
@@ -142,6 +142,13 @@ public class BagGenerator {
     private OREMap oremap;
 
     static PrintWriter pw = null;
+    
+    //Holey Bags
+    private long maxDataFileSize = Long.MAX_VALUE;
+    private long maxTotalDataSize = Long.MAX_VALUE;
+    private long currentBagDataSize = 0;
+    private StringBuilder fetchFileContent = new StringBuilder();
+    private boolean usingFetchFile = false;
 
     // Bag-info.txt field labels
     private static final String 
CONTACT_NAME = "Contact-Name: "; @@ -222,6 +229,13 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio logger.warning("Failed to initialize HTTP client"); e.printStackTrace(); } + initializeHoleyBagLimits(); + } + + private void initializeHoleyBagLimits() { + this.maxDataFileSize = JvmSettings.BAGIT_HOLEY_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.maxTotalDataSize = JvmSettings.BAGIT_HOLEY_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + ", maxTotalDataSize: " + maxTotalDataSize); } public void setIgnoreHashes(boolean val) { @@ -363,6 +377,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception { logger.fine("Creating bag: " + bagName); + writeFetchFile(); + ZipArchiveOutputStream zipArchiveOutputStream = new ZipArchiveOutputStream(outputStream); /* @@ -570,7 +586,6 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } else { resourceUsed[index] = true; // add item - // ToDo String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); logger.fine("File url: " + dataUrl); String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); @@ -585,6 +600,15 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce if (directoryLabel != null) { childPath = currentPath + directoryLabel.getAsString() + "/" + childTitle; } + // Get file size + Long fileSize = null; + if (child.has(JsonLDTerm.filesize.getLabel())) { + fileSize = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); + } + if(fileSize == null) { + logger.severe("File size missing for " + childPath); + throw new IOException("Unable to create bag due to missing file size"); + } String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { @@ -614,7 +638,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } try { if ((childHash == null) | ignorehashes) { - // Generate missing hashInputStream inputStream = null; + // Generate missing hash try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()) { if (hashtype != null) { @@ -644,17 +668,30 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } } - logger.fine("Requesting: " + childPath + " from " + dataUrl); - createFileFromURL(childPath, dataUrl); + + // Add file to bag or fetch file + if (shouldAddToFetchFile(fileSize)) { + // Add to fetch file instead of including in bag + logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl); + addToFetchFile(dataUrl, fileSize, childPath); + usingFetchFile = true; + } else { + // Add file to bag as before + logger.fine("Requesting: " + childPath + " from " + dataUrl); + createFileFromURL(childPath, dataUrl); + if (fileSize != null) { + currentBagDataSize += fileSize; + } + } + dataCount++; if (dataCount % 1000 == 0) { logger.info("Retrieval in progress: " + dataCount + " files retrieved"); } - if (child.has(JsonLDTerm.filesize.getLabel())) { - Long size = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); - totalDataSize += size; - if (size > maxFileSize) { - maxFileSize = size; + if (fileSize != null) { + totalDataSize += fileSize; + if (fileSize > maxFileSize) { + maxFileSize = fileSize; } } if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { @@ -674,6 +711,39 @@ private void 
processContainer(JsonObject item, String currentPath) throws IOExce
 }
 }
 
+    // Helper method to determine if file should go to fetch file
+    private boolean shouldAddToFetchFile(long fileSize) {
+
+        // Check individual file size limit
+        if (fileSize > maxDataFileSize) {
+            logger.fine("File size " + fileSize + " exceeds max data file size " + maxDataFileSize);
+            return true;
+        }
+
+        // Check total bag size limit
+        if (currentBagDataSize + fileSize > maxTotalDataSize) {
+            logger.fine("Adding file would exceed max total data size. Current: " + currentBagDataSize
+                    + ", File: " + fileSize + ", Max: " + maxTotalDataSize);
+            return true;
+        }
+
+        return false;
+    }
+
+    // Method to append to fetch file content
+    private void addToFetchFile(String url, long size, String filename) {
+        // Format: URL size filename
+        fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\n");
+    }
+
+    // Method to write fetch file to bag (call this before finalizing the bag)
+    private void writeFetchFile() throws IOException, ExecutionException, InterruptedException {
+        if (usingFetchFile && fetchFileContent.length() > 0) {
+            logger.info("Creating fetch.txt file for holey bag");
+            createFileFromString("fetch.txt", fetchFileContent.toString());
+        }
+    }
+
     private int getUnusedIndexOf(String childId) {
         int index = resourceIndex.indexOf(childId);
         if (resourceUsed[index] != null) {
From 7f5179f82535d997f68396b791bc283d1808d527 Mon Sep 17 00:00:00 2001
From: qqmyers
Date: Fri, 30 Jan 2026 13:52:18 -0500
Subject: [PATCH 28/39] order by file size

---
 .../iq/dataverse/util/bagit/BagGenerator.java | 290 +++++++++++-------
 1 file changed, 171 insertions(+), 119 deletions(-)

diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
index e61ba6b7b0e..4b94fa44bbd 100644
--- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java
@@ -23,10 +23,11 @@
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
+import java.util.Collections;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
+import java.util.List;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.Map.Entry;
@@ -296,7 +297,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception {
             resourceUsed = new Boolean[aggregates.size() + 1];
             // Process current container (the aggregation itself) and its
             // children
-            processContainer(aggregation, currentPath);
+            // Recursively collect all files from the entire tree, starting with an empty file list
+            List<FileEntry> allFiles = new ArrayList<>();
+            collectAllFiles(aggregation, currentPath, allFiles);
+
+            // Sort files by size (smallest first)
+            Collections.sort(allFiles);
+
+            // Process all files in sorted order
+            processAllFiles(allFiles);
         }
         // Create manifest files
         // pid-mapping.txt - a DataOne recommendation to connect ids and
@@ -545,17 +554,31 @@ public static String getValidName(String bagName) {
     }
 
     private void processContainer(JsonObject item, String currentPath) throws IOException {
+    // Collect all files recursively and process containers to create dirs in the zip
+    private void collectAllFiles(JsonObject item, String currentPath, List<FileEntry> allFiles)
+            throws IOException {
         JsonArray children = getChildren(item);
         HashSet titles = new HashSet();
+
+        String title = null;
+        if 
(item.has(JsonLDTerm.dcTerms("Title").getLabel())) {
+            title = item.get("Title").getAsString();
+        } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) {
+            title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString();
+        }
+        logger.fine("Collecting files from " + title + "/ at path " + currentPath);
+        currentPath = currentPath + title + "/";
+
+        // Mark this container as processed
+        String containerId = item.get("@id").getAsString();
+
+        // Create directory and update tracking for this container
         int containerIndex = -1;
         try {
             createDir(currentPath);
-            // Add containers to pid map and mark as 'used', but no sha1 hash
-            // value
-            containerIndex = getUnusedIndexOf(item.get("@id").getAsString());
+            containerIndex = getUnusedIndexOf(containerId);
             resourceUsed[containerIndex] = true;
-            pidMap.put(item.get("@id").getAsString(), currentPath);
-
+            pidMap.put(containerId, currentPath);
         } catch (InterruptedException | IOException | ExecutionException e) {
             e.printStackTrace();
             logger.severe(e.getMessage());
@@ -563,14 +586,14 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce
             resourceUsed[containerIndex] = false;
             }
             throw new IOException("Unable to create bag");
-        }
+
 
         for (int i = 0; i < children.size(); i++) {
             // Find the ith child in the overall array of aggregated
             // resources
             String childId = children.get(i).getAsString();
-            logger.fine("Processing: " + childId);
+            logger.fine("Examining: " + childId);
             int index = getUnusedIndexOf(childId);
             if (resourceUsed[index] != null) {
                 System.out.println("Warning: reusing resource " + index);
@@ -580,137 +603,147 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce
             // entries
             JsonObject child = aggregates.get(index - 1).getAsJsonObject();
             if (childIsContainer(child)) {
-                // create dir and process children
-                // processContainer will mark this item as used
-                processContainer(child, currentPath);
+                // Recursively collect files from this container
+                collectAllFiles(child, currentPath, allFiles);
             } else {
-                resourceUsed[index] = true;
-                // add item
-                String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString();
-                logger.fine("File url: " + dataUrl);
-                String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString();
-                if (titles.contains(childTitle)) {
-                    logger.warning("**** Multiple items with the same title in: " + currentPath);
-                    logger.warning("**** Will cause failure in hash and size validation in: " + bagID);
-                } else {
-                    titles.add(childTitle);
-                }
-                String childPath = currentPath + childTitle;
-                JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel());
-                if (directoryLabel != null) {
-                    childPath = currentPath + directoryLabel.getAsString() + "/" + childTitle;
-                }
                 // Get file size
                 Long fileSize = null;
                 if (child.has(JsonLDTerm.filesize.getLabel())) {
                     fileSize = child.get(JsonLDTerm.filesize.getLabel()).getAsLong();
                 }
                 if (fileSize == null) {
                     logger.severe("File size missing for child: " + childId);
                     throw new IOException("Unable to create bag due to missing file size");
                 }
+                // Store minimal info for sorting - JsonObject is just a reference
+                allFiles.add(new FileEntry(fileSize, child, currentPath, index));
+            }
+        }
+    }
+
+
+    // Process all files in sorted order
+    private void processAllFiles(List<FileEntry> sortedFiles)
+            throws IOException, ExecutionException, InterruptedException {
+
+        if ((hashtype == null) | ignorehashes) {
+            hashtype = DataFile.ChecksumType.SHA512;
+        }
+
+        for (FileEntry entry : sortedFiles) {
+            // Extract all needed information from the JsonObject reference
+            JsonObject child = entry.jsonObject;
+            String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString();
+            String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString();
+
+            // Build full path using stored currentPath
+            String childPath = entry.currentPath + childTitle;
+            JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel());
+            if (directoryLabel != null) {
+                childPath = entry.currentPath + directoryLabel.getAsString() + "/" + childTitle;
+            }
+
+            // Get hash if exists
+            String childHash = null;
+            if (child.has(JsonLDTerm.checksum.getLabel())) {
+                ChecksumType childHashType = ChecksumType.fromUri(
+                        child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString());
+                if (hashtype == null) {
+                    hashtype = childHashType;
                }
+                if 
(hashtype != null && !hashtype.equals(childHashType)) { - logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() - + " hashes for " + childTitle); - } else { - childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); - if (checksumMap.containsValue(childHash)) { - // Something else has this hash - logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: " - + childHash + " in: " + bagID); - } - logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); - checksumMap.put(childPath, childHash); - } + // Store minimal info for sorting - JsonObject is just a reference + allFiles.add(new FileEntry(fileSize, child, currentPath, index)); + } + } + } + + + // Process all files in sorted order + private void processAllFiles(List sortedFiles) + throws IOException, ExecutionException, InterruptedException { + + if ((hashtype == null) | ignorehashes) { + hashtype = DataFile.ChecksumType.SHA512; + } + + for (FileEntry entry : sortedFiles) { + // Extract all needed information from the JsonObject reference + JsonObject child = entry.jsonObject; + String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); + String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + + // Build full path using stored currentPath + String childPath = entry.currentPath + childTitle; + JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); + if (directoryLabel != null) { + childPath = entry.currentPath + directoryLabel.getAsString() + "/" + childTitle; + } + + // Get hash if exists + String childHash = null; + if (child.has(JsonLDTerm.checksum.getLabel())) { + ChecksumType childHashType = ChecksumType.fromString( + child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + if (hashtype == null) { + hashtype = childHashType; } - if ((hashtype == null) | ignorehashes) { - // Pick sha512 when ignoring hashes or none exist - hashtype = DataFile.ChecksumType.SHA512; + if (hashtype != null && !hashtype.equals(childHashType)) { + logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() + + " hashes for " + childTitle); + } else { + childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); } - try { - if ((childHash == null) | ignorehashes) { - // Generate missing hash - try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()) { - - if (hashtype != null) { - if (hashtype.equals(DataFile.ChecksumType.SHA1)) { - childHash = DigestUtils.sha1Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA256)) { - childHash = DigestUtils.sha256Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { - childHash = DigestUtils.sha512Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { - childHash = DigestUtils.md5Hex(inputStream); - } + } + + resourceUsed[entry.resourceIndex] = true; + + try { + if ((childHash == null) | ignorehashes) { + // Generate missing hash + InputStream inputStream = null; + try { + inputStream = getInputStreamSupplier(dataUrl).get(); + + if (hashtype != null) { + if (hashtype.equals(DataFile.ChecksumType.SHA1)) { + childHash = DigestUtils.sha1Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.SHA256)) { + childHash = DigestUtils.sha256Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { + childHash = 
DigestUtils.sha512Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { + childHash = DigestUtils.md5Hex(inputStream); } - - } catch (IOException e) { - logger.severe("Failed to read " + childPath); - throw e; - } - if (childHash != null) { - JsonObject childHashObject = new JsonObject(); - childHashObject.addProperty("@type", hashtype.toString()); - childHashObject.addProperty("@value", childHash); - child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); - - checksumMap.put(childPath, childHash); - } else { - logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } + + } catch (IOException e) { + logger.severe("Failed to read " + childPath); + throw e; + } finally { + IOUtils.closeQuietly(inputStream); } - - // Add file to bag or fetch file - if (shouldAddToFetchFile(fileSize)) { - // Add to fetch file instead of including in bag - logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl); - addToFetchFile(dataUrl, fileSize, childPath); - usingFetchFile = true; + if (childHash != null) { + JsonObject childHashObject = new JsonObject(); + childHashObject.addProperty("@type", hashtype.toString()); + childHashObject.addProperty("@value", childHash); + child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); + + checksumMap.put(childPath, childHash); } else { - // Add file to bag as before - logger.fine("Requesting: " + childPath + " from " + dataUrl); - createFileFromURL(childPath, dataUrl); - if (fileSize != null) { - currentBagDataSize += fileSize; - } + logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } - - dataCount++; - if (dataCount % 1000 == 0) { - logger.info("Retrieval in progress: " + dataCount + " files retrieved"); - } - if (fileSize != null) { - totalDataSize += fileSize; - if (fileSize > maxFileSize) { - maxFileSize = fileSize; - } - } - if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { - mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); + } else { + // Hash already exists, add to checksumMap + if (checksumMap.containsValue(childHash)) { + logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + + " has hash: " + childHash + " in: " + bagID); } - - } catch (Exception e) { - resourceUsed[index] = false; - e.printStackTrace(); - throw new IOException("Unable to create bag"); + logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); + checksumMap.put(childPath, childHash); + } + + // Add file to bag or fetch file + if (shouldAddToFetchFile(entry.size)) { + logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + addToFetchFile(dataUrl, entry.size, childPath); + usingFetchFile = true; + } else { + logger.fine("Requesting: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + createFileFromURL(childPath, dataUrl); + currentBagDataSize += entry.size; + } + + dataCount++; + if (dataCount % 1000 == 0) { + logger.info("Retrieval in progress: " + dataCount + " files retrieved"); + } + + totalDataSize += entry.size; + if (entry.size > maxFileSize) { + maxFileSize = entry.size; + } + + if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { + mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); } - // Check for nulls! 
- pidMap.put(child.get("@id").getAsString(), childPath); - + } catch (Exception e) { + resourceUsed[entry.resourceIndex] = false; + e.printStackTrace(); + throw new IOException("Unable to create bag"); } + + pidMap.put(child.get("@id").getAsString(), childPath); } } - + // Helper method to determine if file should go to fetch file private boolean shouldAddToFetchFile(long fileSize) { @@ -1394,5 +1427,24 @@ public static void setNumConnections(int numConnections) { BagGenerator.numConnections = numConnections; logger.fine("All BagGenerators will use " + numConnections + " threads"); } - + + // Inner class to hold file information before processing + private static class FileEntry implements Comparable { + final long size; + final JsonObject jsonObject; // Direct reference, not a copy + final String currentPath; // Parent directory path + final int resourceIndex; // Still need this for resourceUsed tracking + + FileEntry(long size, JsonObject jsonObject, String currentPath, int resourceIndex) { + this.size = size; + this.jsonObject = jsonObject; + this.currentPath = currentPath; + this.resourceIndex = resourceIndex; + } + + @Override + public int compareTo(FileEntry other) { + return Long.compare(this.size, other.size); + } + } } \ No newline at end of file From bc63285cb16a4215fefbc8a1e48bb12b8f60fdfe Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 15:10:28 -0500 Subject: [PATCH 29/39] only add subcollection folders (if they exist) --- .../iq/dataverse/util/bagit/BagGenerator.java | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 4b94fa44bbd..6de7d970605 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -299,7 +299,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { // children // Recursively collect all files from the entire tree, start with an empty set of processedContainers List allFiles = new ArrayList<>(); - collectAllFiles(aggregation, currentPath, allFiles); + collectAllFiles(aggregation, currentPath, allFiles, false); // Sort files by size (smallest first) Collections.sort(allFiles); @@ -555,20 +555,21 @@ public static String getValidName(String bagName) { private void processContainer(JsonObject item, String currentPath) throws IOException { // Collect all files recursively and process containers to create dirs in the zip - private void collectAllFiles(JsonObject item, String currentPath, List allFiles) + private void collectAllFiles(JsonObject item, String currentPath, List allFiles, boolean addTitle) throws IOException { JsonArray children = getChildren(item); HashSet titles = new HashSet(); - String title = null; - if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { - title = item.get("Title").getAsString(); - } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { - title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + if (addTitle) { + String title = null; + if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { + title = item.get("Title").getAsString(); + } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { + title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + logger.fine("Collecting files from " + title + "/ at path " + currentPath); + currentPath = currentPath + title + "/"; } - 
logger.fine("Collecting files from " + title + "/ at path " + currentPath); - currentPath = currentPath + title + "/"; - // Mark this container as processed String containerId = item.get("@id").getAsString(); @@ -602,9 +603,10 @@ private void collectAllFiles(JsonObject item, String currentPath, List Date: Fri, 30 Jan 2026 15:22:28 -0500 Subject: [PATCH 30/39] replace deprecated constructs --- .../iq/dataverse/util/bagit/BagGenerator.java | 74 +++++++++---------- 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 6de7d970605..bd65bd35340 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -48,6 +48,7 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveEntryRequest; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.commons.compress.archivers.zip.ZipFile.Builder; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; import org.apache.hc.client5.http.ClientProtocolException; @@ -466,57 +467,54 @@ public boolean generateBag(String bagName, boolean temp) { public void validateBag(String bagId) { logger.info("Validating Bag"); - ZipFile zf = null; - InputStream is = null; try { File bagFile = getBagFile(bagId); - zf = ZipFile.builder().setFile(bagFile).get(); - ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); - if (entry != null) { - logger.info("SHA1 hashes used"); - hashtype = DataFile.ChecksumType.SHA1; - } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { + ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); if (entry != null) { - logger.info("SHA512 hashes used"); - hashtype = DataFile.ChecksumType.SHA512; + logger.info("SHA1 hashes used"); + hashtype = DataFile.ChecksumType.SHA1; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); if (entry != null) { - logger.info("SHA256 hashes used"); - hashtype = DataFile.ChecksumType.SHA256; + logger.info("SHA512 hashes used"); + hashtype = DataFile.ChecksumType.SHA512; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); if (entry != null) { - logger.info("MD5 hashes used"); - hashtype = DataFile.ChecksumType.MD5; + logger.info("SHA256 hashes used"); + hashtype = DataFile.ChecksumType.SHA256; + } else { + entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + if (entry != null) { + logger.info("MD5 hashes used"); + hashtype = DataFile.ChecksumType.MD5; + } } } } + if (entry == null) + throw new IOException("No manifest file found"); + try (InputStream is = zf.getInputStream(entry)) { + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String line = br.readLine(); + while (line != null) { + logger.fine("Hash entry: " + line); + int breakIndex = line.indexOf(' '); + String hash = line.substring(0, breakIndex); + String path = line.substring(breakIndex + 1); + logger.fine("Adding: " + path + " with hash: " + hash); + checksumMap.put(path, hash); + line = br.readLine(); 
+ } + } } - if (entry == null) - throw new IOException("No manifest file found"); - is = zf.getInputStream(entry); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - String line = br.readLine(); - while (line != null) { - logger.fine("Hash entry: " + line); - int breakIndex = line.indexOf(' '); - String hash = line.substring(0, breakIndex); - String path = line.substring(breakIndex + 1); - logger.fine("Adding: " + path + " with hash: " + hash); - checksumMap.put(path, hash); - line = br.readLine(); - } - IOUtils.closeQuietly(is); logger.info("HashMap Map contains: " + checksumMap.size() + " entries"); checkFiles(checksumMap, bagFile); } catch (IOException io) { logger.log(Level.SEVERE, "Could not validate Hashes", io); } catch (Exception e) { logger.log(Level.SEVERE, "Could not validate Hashes", e); - } finally { - IOUtils.closeQuietly(zf); } return; } @@ -667,10 +665,8 @@ private void processAllFiles(List sortedFiles) try { if ((childHash == null) | ignorehashes) { // Generate missing hash - InputStream inputStream = null; - try { - inputStream = getInputStreamSupplier(dataUrl).get(); - + + try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()){ if (hashtype != null) { if (hashtype.equals(DataFile.ChecksumType.SHA1)) { childHash = DigestUtils.sha1Hex(inputStream); @@ -686,8 +682,6 @@ private void processAllFiles(List sortedFiles) } catch (IOException e) { logger.severe("Failed to read " + childPath); throw e; - } finally { - IOUtils.closeQuietly(inputStream); } if (childHash != null) { JsonObject childHashObject = new JsonObject(); From 69c9a0d822dc8bc5904b5f08d0ff6e8516194979 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 16:35:36 -0500 Subject: [PATCH 31/39] restore name collision check --- .../iq/dataverse/util/bagit/BagGenerator.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index bd65bd35340..63969a21c5b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -25,6 +25,7 @@ import java.util.Calendar; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; @@ -151,7 +152,7 @@ public class BagGenerator { private long currentBagDataSize = 0; private StringBuilder fetchFileContent = new StringBuilder(); private boolean usingFetchFile = false; - + // Bag-info.txt field labels private static final String CONTACT_NAME = "Contact-Name: "; private static final String CONTACT_EMAIL = "Contact-Email: "; @@ -627,6 +628,9 @@ private void collectAllFiles(JsonObject item, String currentPath, List sortedFiles) throws IOException, ExecutionException, InterruptedException { + // Track titles to detect duplicates + Set titles = new HashSet<>(); + if ((hashtype == null) | ignorehashes) { hashtype = DataFile.ChecksumType.SHA512; } @@ -637,6 +641,14 @@ private void processAllFiles(List sortedFiles) String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + // Check for duplicate titles + if (titles.contains(childTitle)) { + logger.warning("**** Multiple items with the same title in: " + entry.currentPath); + logger.warning("**** Will cause failure in hash and size 
validation in: " + bagID); + } else { + titles.add(childTitle); + } + // Build full path using stored currentPath String childPath = entry.currentPath + childTitle; JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); From 422435a22c97b55b5d51aca13a287a77d0821022 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 16:35:53 -0500 Subject: [PATCH 32/39] add null check to quiet log/avoid exception --- .../harvard/iq/dataverse/util/bagit/OREMap.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index 426d5c9aa5f..0d99a5bddd1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -506,11 +506,16 @@ private static void addCvocValue(String val, JsonArrayBuilder vals, JsonObject c for (String prefix : context.keySet()) { localContext.putIfAbsent(prefix, context.getString(prefix)); } - JsonObjectBuilder job = Json.createObjectBuilder(datasetFieldService.getExternalVocabularyValue(val)); - job.add("@id", val); - JsonObject extVal = job.build(); - logger.fine("Adding: " + extVal); - vals.add(extVal); + JsonObject cachedValue = datasetFieldService.getExternalVocabularyValue(val); + if (cachedValue != null) { + JsonObjectBuilder job = Json.createObjectBuilder(cachedValue); + job.add("@id", val); + JsonObject extVal = job.build(); + logger.fine("Adding: " + extVal); + vals.add(extVal); + } else { + vals.add(val); + } } else { vals.add(val); } From d9cfe1df63dd6be3677c603a5aa3339a0dfb4284 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 17:39:17 -0500 Subject: [PATCH 33/39] cleanup - checksum change --- .../edu/harvard/iq/dataverse/util/bagit/BagGenerator.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 63969a21c5b..f23df2947bd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -49,7 +49,6 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveEntryRequest; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; -import org.apache.commons.compress.archivers.zip.ZipFile.Builder; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; import org.apache.hc.client5.http.ClientProtocolException; @@ -593,7 +592,7 @@ private void collectAllFiles(JsonObject item, String currentPath, List sortedFiles) // Get hash if exists String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromString( - child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + ChecksumType childHashType = ChecksumType + .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { hashtype = childHashType; } From 4895f80b6530489988828e72aa1149e984864c7c Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 18:09:17 -0500 Subject: [PATCH 34/39] cleanup, suppress downloads with gbrec for fetch file --- .../iq/dataverse/util/bagit/BagGenerator.java | 13 ++++++++----- 1 file 
changed, 8 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f23df2947bd..56116976e18 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -50,7 +50,6 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; -import org.apache.commons.compress.utils.IOUtils; import org.apache.hc.client5.http.ClientProtocolException; import org.apache.hc.client5.http.classic.methods.HttpGet; import org.apache.hc.client5.http.config.RequestConfig; @@ -551,14 +550,12 @@ public static String getValidName(String bagName) { return bagName.replaceAll("\\W", "-"); } - private void processContainer(JsonObject item, String currentPath) throws IOException { // Collect all files recursively and process containers to create dirs in the zip private void collectAllFiles(JsonObject item, String currentPath, List allFiles, boolean addTitle) throws IOException { JsonArray children = getChildren(item); - HashSet titles = new HashSet(); - if (addTitle) { + if (addTitle) { //For any sub-collections (non-Dataverse) String title = null; if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { title = item.get("Title").getAsString(); @@ -716,6 +713,7 @@ private void processAllFiles(List sortedFiles) // Add file to bag or fetch file if (shouldAddToFetchFile(entry.size)) { + dataUrl = suppressDownloadCounts(dataUrl); logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + " (size: " + entry.size + " bytes)"); addToFetchFile(dataUrl, entry.size, childPath); @@ -1291,7 +1289,7 @@ InputStreamSupplier getInputStreamSupplier(final String uriString) { public InputStream get() { try { // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use) - String modifiedUriString = uriString + (uriString.contains("?") ? "&" : "?") + "gbrecs=true"; + String modifiedUriString = suppressDownloadCounts(uriString); URI uri = new URI(modifiedUriString); logger.finest("Final URI used (with gbrecs param): " + modifiedUriString); int tries = 0; @@ -1386,6 +1384,11 @@ public void close() throws IOException { }; } + private String suppressDownloadCounts(String uriString ) { + // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use) + return uriString + (uriString.contains("?") ? 
"&" : "?") + "gbrecs=true"; + } + /** * Adapted from org/apache/commons/io/FileUtils.java change to SI - add 2 digits * of precision From 62a03b2f097860ba1c81fbd85d7bfd15e7dd9b31 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Sun, 1 Feb 2026 14:21:52 -0500 Subject: [PATCH 35/39] add setting, refactor, for non-holey option --- .../iq/dataverse/settings/JvmSettings.java | 7 ++-- .../iq/dataverse/util/bagit/BagGenerator.java | 36 +++++++++++++------ 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index b32b7a8d77d..086ed7929aa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -276,9 +276,10 @@ public enum JvmSettings { BAGIT_SOURCE_ORG_NAME(SCOPE_BAGIT_SOURCEORG, "name"), BAGIT_SOURCEORG_ADDRESS(SCOPE_BAGIT_SOURCEORG, "address"), BAGIT_SOURCEORG_EMAIL(SCOPE_BAGIT_SOURCEORG, "email"), - SCOPE_BAGIT_HOLEY(SCOPE_BAGIT, "holey"), - BAGIT_HOLEY_MAX_FILE_SIZE(SCOPE_BAGIT_HOLEY, "max-file-size"), - BAGIT_HOLEY_MAX_DATA_SIZE(SCOPE_BAGIT_HOLEY, "max-data-size"), + SCOPE_BAGIT_ZIP(SCOPE_BAGIT, "zip"), + BAGIT_ZIP_MAX_FILE_SIZE(SCOPE_BAGIT_ZIP, "max-file-size"), + BAGIT_ZIP_MAX_DATA_SIZE(SCOPE_BAGIT_ZIP, "max-data-size"), + BAGIT_ZIP_HOLEY(SCOPE_BAGIT_ZIP, "holey"), // STORAGE USE SETTINGS diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 56116976e18..2ca833ba839 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -144,12 +144,14 @@ public class BagGenerator { static PrintWriter pw = null; - //Holey Bags + // Size limits and holey Bags private long maxDataFileSize = Long.MAX_VALUE; private long maxTotalDataSize = Long.MAX_VALUE; private long currentBagDataSize = 0; private StringBuilder fetchFileContent = new StringBuilder(); private boolean usingFetchFile = false; + private boolean createHoleyBag = false; + private List oversizedFiles = new ArrayList<>(); // Bag-info.txt field labels private static final String CONTACT_NAME = "Contact-Name: "; @@ -234,9 +236,12 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio } private void initializeHoleyBagLimits() { - this.maxDataFileSize = JvmSettings.BAGIT_HOLEY_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); - this.maxTotalDataSize = JvmSettings.BAGIT_HOLEY_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); - logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + ", maxTotalDataSize: " + maxTotalDataSize); + this.maxDataFileSize = JvmSettings.BAGIT_ZIP_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.maxTotalDataSize = JvmSettings.BAGIT_ZIP_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.createHoleyBag = JvmSettings.BAGIT_ZIP_HOLEY.lookupOptional(Boolean.class).orElse(false); + logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + + ", maxTotalDataSize: " + maxTotalDataSize + + ", createHoleyBag: " + createHoleyBag); } public void setIgnoreHashes(boolean val) { @@ -603,6 +608,7 @@ private void collectAllFiles(JsonObject item, String currentPath, List sortedFiles) } // Add file to bag or fetch file - if (shouldAddToFetchFile(entry.size)) { + if 
(!addToZip(entry.size)) { + if(createHoleyBag) { dataUrl = suppressDownloadCounts(dataUrl); logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + " (size: " + entry.size + " bytes)"); addToFetchFile(dataUrl, entry.size, childPath); usingFetchFile = true; + } else { + // Add to list for archiver to retrieve + oversizedFiles.add(entry); + logger.fine("Adding " + childPath + " to oversized files list for archiver"); + } } else { logger.fine("Requesting: " + childPath + " from " + dataUrl + " (size: " + entry.size + " bytes)"); @@ -750,28 +762,28 @@ private void processAllFiles(List sortedFiles) } // Helper method to determine if file should go to fetch file - private boolean shouldAddToFetchFile(long fileSize) { + private boolean addToZip(long fileSize) { // Check individual file size limit if (fileSize > maxDataFileSize) { logger.fine("File size " + fileSize + " exceeds max data file size " + maxDataFileSize); - return true; + return false; } // Check total bag size limit if (currentBagDataSize + fileSize > maxTotalDataSize) { logger.fine("Adding file would exceed max total data size. Current: " + currentBagDataSize + ", File: " + fileSize + ", Max: " + maxTotalDataSize); - return true; + return false; } - return false; + return true; } // Method to append to fetch file content private void addToFetchFile(String url, long size, String filename) { // Format: URL size filename - fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\n"); + fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\r\n"); } // Method to write fetch file to bag (call this before finalizing the bag) @@ -1389,6 +1401,10 @@ private String suppressDownloadCounts(String uriString ) { return uriString + (uriString.contains("?") ? 
"&" : "?") + "gbrecs=true"; } + public List getOversizedFiles() { + return oversizedFiles; + } + /** * Adapted from org/apache/commons/io/FileUtils.java change to SI - add 2 digits * of precision From 637b2e30e25b57d7fa87d5d1b2b70eeb08ac5ad5 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 4 Feb 2026 11:00:21 -0500 Subject: [PATCH 36/39] Update to track non-zipped files, add method --- .../iq/dataverse/util/bagit/BagGenerator.java | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 2ca833ba839..60cabc9ac99 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -640,8 +640,8 @@ private void processAllFiles(List sortedFiles) for (FileEntry entry : sortedFiles) { // Extract all needed information from the JsonObject reference JsonObject child = entry.jsonObject; - String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); - String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + + String childTitle = entry.getChildTitle(); // Check for duplicate titles if (titles.contains(childTitle)) { @@ -651,12 +651,7 @@ private void processAllFiles(List sortedFiles) titles.add(childTitle); } - // Build full path using stored currentPath - String childPath = entry.currentPath + childTitle; - JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if (directoryLabel != null) { - childPath = entry.currentPath + directoryLabel.getAsString() + "/" + childTitle; - } + String childPath= entry.getChildPath(childTitle); // Get hash if exists String childHash = null; @@ -675,6 +670,7 @@ private void processAllFiles(List sortedFiles) } resourceUsed[entry.resourceIndex] = true; + String dataUrl = entry.getDataUrl(); try { if ((childHash == null) | ignorehashes) { @@ -716,11 +712,9 @@ private void processAllFiles(List sortedFiles) logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); checksumMap.put(childPath, childHash); } - // Add file to bag or fetch file if (!addToZip(entry.size)) { if(createHoleyBag) { - dataUrl = suppressDownloadCounts(dataUrl); logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + " (size: " + entry.size + " bytes)"); addToFetchFile(dataUrl, entry.size, childPath); @@ -1300,10 +1294,7 @@ InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { public InputStream get() { try { - // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use) - String modifiedUriString = suppressDownloadCounts(uriString); - URI uri = new URI(modifiedUriString); - logger.finest("Final URI used (with gbrecs param): " + modifiedUriString); + URI uri = new URI(uriString); int tries = 0; while (tries < 5) { @@ -1396,10 +1387,7 @@ public void close() throws IOException { }; } - private String suppressDownloadCounts(String uriString ) { - // Adding gbrecs to suppress counting this access as a download (archiving is not a download indicating scientific use) - return uriString + (uriString.contains("?") ? 
"&" : "?") + "gbrecs=true"; - } + public List getOversizedFiles() { return oversizedFiles; @@ -1456,7 +1444,7 @@ public static void setNumConnections(int numConnections) { } // Inner class to hold file information before processing - private static class FileEntry implements Comparable { + public static class FileEntry implements Comparable { final long size; final JsonObject jsonObject; // Direct reference, not a copy final String currentPath; // Parent directory path @@ -1469,6 +1457,30 @@ private static class FileEntry implements Comparable { this.resourceIndex = resourceIndex; } + public String getDataUrl() { + return suppressDownloadCounts(jsonObject.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString()); + } + + public String getChildTitle() { + return jsonObject.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + + public String getChildPath(String title) { + // Build full path using stored currentPath + String childPath = currentPath + title; + JsonElement directoryLabel = jsonObject.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); + if (directoryLabel != null) { + childPath = currentPath + directoryLabel.getAsString() + "/" + title; + } + return childPath; + } + + private String suppressDownloadCounts(String uriString) { + // Adding gbrecs to suppress counting this access as a download (archiving is + // not a download indicating scientific use) + return uriString + (uriString.contains("?") ? "&" : "?") + "gbrecs=true"; + } + @Override public int compareTo(FileEntry other) { return Long.compare(this.size, other.size); From a6b05056401065e356dd2dfed13a4aa080702a7a Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 4 Feb 2026 14:20:28 -0500 Subject: [PATCH 37/39] reuse stream supplier, update archivers to send oversized files --- .../impl/DuraCloudSubmitToArchiveCommand.java | 65 +++++-- .../GoogleCloudSubmitToArchiveCommand.java | 177 +++++++++++++----- .../impl/LocalSubmitToArchiveCommand.java | 14 +- .../impl/S3SubmitToArchiveCommand.java | 47 ++++- .../iq/dataverse/util/bagit/BagGenerator.java | 6 +- 5 files changed, 244 insertions(+), 65 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index fe4a25091d7..b65f39fa484 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -7,16 +7,24 @@ import edu.harvard.iq.dataverse.authorization.users.ApiToken; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; + import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudContext; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudHost; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudPort; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import 
java.nio.file.Path;
 import java.security.DigestInputStream;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
@@ -96,6 +104,8 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t
         statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE);
         statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred");
 
+        Path tempBagFile = null;
+
         try {
             /*
              * If there is a failure in creating a space, it is likely that a prior version
@@ -161,20 +171,38 @@ public void run() {
                     // Add BagIt ZIP file
                     // Although DuraCloud uses SHA-256 internally, it's API uses MD5 to verify the
                     // transfer
-                    messageDigest = MessageDigest.getInstance("MD5");
-                    try (PipedInputStream in = new PipedInputStream(100000);
-                            DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) {
-                        Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token);
-                        checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null);
-                        bagThread.join();
-                        if (success) {
-                            logger.fine("Content: " + fileName + " added with checksum: " + checksum);
-                            localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest());
+                    tempBagFile = Files.createTempFile("dataverse-bag-", ".zip");
+                    logger.fine("Creating bag in temporary file: " + tempBagFile.toString());
+
+                    BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
+                    bagger.setAuthenticationKey(token.getTokenString());
+                    // Generate the bag into the temporary file
+                    try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) {
+                        if (!bagger.generateBag(fos)) {
+                            throw new IOException("Bag generation failed");
                         }
-                    }
+                    }
+
+                    // Store BagIt file
+                    long bagSize = Files.size(tempBagFile);
+                    logger.fine("Bag created successfully, size: " + bagSize + " bytes");
+
+                    // Now upload the bag file
+                    messageDigest = MessageDigest.getInstance("MD5");
+                    try (InputStream is = Files.newInputStream(tempBagFile);
+                            DigestInputStream bagDigestInputStream = new DigestInputStream(is, messageDigest)) {
+                        checksum = store.addContent(spaceName, fileName, bagDigestInputStream, bagSize, "application/zip", null, null);
+                        localchecksum = Hex.encodeHexString(bagDigestInputStream.getMessageDigest().digest());
+
+                        if (checksum != null && checksum.equals(localchecksum)) {
+                            logger.fine("Content: " + fileName + " added with checksum: " + checksum);
+                            success = true;
+                        } else {
                             logger.severe("Failure on " + fileName);
-                            logger.severe(success ? 
checksum + " not equal to " + localchecksum : "failed to transfer to DuraCloud"); + logger.severe(checksum + " not equal to " + localchecksum); try { store.deleteContent(spaceName, fileName); store.deleteContent(spaceName, baseFileName + "_datacite.xml"); @@ -185,9 +213,6 @@ public void run() { "DuraCloud Submission Failure: incomplete archive transfer"); } } - - logger.fine("DuraCloud Submission step: Content Transferred"); - // Document the location of dataset archival copy location (actually the URL // where you can // view it as an admin) @@ -223,8 +248,20 @@ public void run() { return new Failure("Unable to create DuraCloud space with name: " + baseFileName, mesg); } catch (NoSuchAlgorithmException e) { logger.severe("MD5 MessageDigest not available!"); + } catch (Exception e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); + return new Failure("Error in transferring file to DuraCloud", + "DuraCloud Submission Failure: internal error"); } finally { + if (tempBagFile != null) { + try { + Files.deleteIfExists(tempBagFile); + } catch (IOException e) { + logger.warning("Failed to delete temporary bag file: " + tempBagFile + " : " + e.getMessage()); + } + } dv.setArchivalCopyLocation(statusObject.build().toString()); } } else { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 7dfb9f07e19..21038a1eab6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -14,20 +14,29 @@ import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.settings.JvmSettings; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; + import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudBucket; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudProject; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; import org.apache.commons.codec.binary.Hex; +import org.apache.commons.compress.parallel.InputStreamSupplier; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; import java.io.File; import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.security.DigestInputStream; import java.security.MessageDigest; import java.util.Map; @@ -45,26 +54,28 @@ public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersi } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, + Map requestedSettings) { logger.fine("In GoogleCloudSubmitToArchiveCommand..."); String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); logger.fine("Project: " + 
projectName + " Bucket: " + bucketName); if (bucketName != null && projectName != null) { Storage storage; - //Set a failure status that will be updated if we succeed + // Set a failure status that will be updated if we succeed JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - + String cloudKeyFile = JvmSettings.FILES_DIRECTORY.lookup() + File.separator + "googlecloudkey.json"; - + + // Create temporary file for bag + Path tempBagFile = null; + try (FileInputStream cloudKeyStream = new FileInputStream(cloudKeyFile)) { storage = StorageOptions.newBuilder() - .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream)) - .setProjectId(projectName) - .build() - .getService(); + .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream)).setProjectId(projectName) + .build().getService(); Bucket bucket = storage.get(bucketName); Dataset dataset = dv.getDataset(); @@ -72,6 +83,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') .replace('.', '-').toLowerCase(); + String bagFileName = spaceName + "/" + spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; String dataciteXml = getDataCiteXml(dv); MessageDigest messageDigest = MessageDigest.getInstance("MD5"); @@ -102,7 +114,8 @@ public void run() { Thread.sleep(10); i++; } - Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", + digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); dcThread.join(); String checksum = dcXml.getMd5ToHexString(); @@ -110,7 +123,8 @@ public void run() { String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); if (!success || !checksum.equals(localchecksum)) { logger.severe("Failure on " + spaceName); - logger.severe(success ? checksum + " not equal to " + localchecksum : "datacite.xml transfer did not succeed"); + logger.severe(success ? 
checksum + " not equal to " + localchecksum + : "datacite.xml transfer did not succeed"); try { dcXml.delete(Blob.BlobSourceOption.generationMatch()); } catch (StorageException se) { @@ -119,55 +133,112 @@ public void run() { return new Failure("Error in transferring DataCite.xml file to GoogleCloud", "GoogleCloud Submission Failure: incomplete metadata transfer"); } + } + + tempBagFile = Files.createTempFile("dataverse-bag-", ".zip"); + logger.fine("Creating bag in temporary file: " + tempBagFile.toString()); + + BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setAuthenticationKey(token.getTokenString()); + // Generate bag to temporary file using the provided ore JsonObject + try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) { + if (!bagger.generateBag(fos)) { + throw new IOException("Bag generation failed"); + } + } + + // Store BagIt file + long bagSize = Files.size(tempBagFile); + logger.fine("Bag created successfully, size: " + bagSize + " bytes"); + + if (bagSize == 0) { + throw new IOException("Generated bag file is empty"); + } + + // Upload bag file and calculate checksum during upload + messageDigest = MessageDigest.getInstance("MD5"); + String localChecksum; + + try (FileInputStream fis = new FileInputStream(tempBagFile.toFile()); + DigestInputStream dis = new DigestInputStream(fis, messageDigest)) { + + logger.fine("Uploading bag to GoogleCloud: " + bagFileName); + + Blob bag = bucket.create(bagFileName, dis, "application/zip", + Bucket.BlobWriteOption.doesNotExist()); + + if (bag.getSize() == 0) { + throw new IOException("Uploaded bag has zero size"); + } + + // Get checksum after upload completes + localChecksum = Hex.encodeHexString(dis.getMessageDigest().digest()); + String remoteChecksum = bag.getMd5ToHexString(); + + logger.fine("Bag: " + bagFileName + " uploaded"); + logger.fine("Local checksum: " + localChecksum); + logger.fine("Remote checksum: " + remoteChecksum); - // Store BagIt file - success = false; - String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + if (!localChecksum.equals(remoteChecksum)) { + logger.severe("Bag checksum mismatch!"); + logger.severe("Local: " + localChecksum + " != Remote: " + remoteChecksum); + try { + bag.delete(Blob.BlobSourceOption.generationMatch()); + } catch (StorageException se) { + logger.warning(se.getMessage()); + } + return new Failure("Error in transferring Zip file to GoogleCloud", + "GoogleCloud Submission Failure: bag checksum mismatch"); + } + } - // Add BagIt ZIP file - // Google uses MD5 as one way to verify the - // transfer + logger.fine("GoogleCloud Submission step: Content Transferred Successfully"); + + // Now upload any files that were too large for the bag + for (FileEntry entry : bagger.getOversizedFiles()) { + String childPath = entry.getChildPath(entry.getChildTitle()); + String fileKey = spaceName + "/" + childPath; + logger.fine("Uploading oversized file to GoogleCloud: " + fileKey); messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", - Bucket.BlobWriteOption.doesNotExist()); - if (bag.getSize() == 0) { - throw new IOException("Empty Bag"); + InputStreamSupplier supplier = 
bagger.getInputStreamSupplier(entry.getDataUrl()); + try (InputStream is = supplier.get(); + DigestInputStream dis = new DigestInputStream(is, messageDigest)) { + Blob oversizedFileBlob = bucket.create(fileKey, dis, Bucket.BlobWriteOption.doesNotExist()); + if (oversizedFileBlob.getSize() == 0) { + throw new IOException("Uploaded oversized file has zero size: " + fileKey); } - bagThread.join(); - - checksum = bag.getMd5ToHexString(); - logger.fine("Bag: " + fileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); - if (!success || !checksum.equals(localchecksum)) { - logger.severe(success ? checksum + " not equal to " + localchecksum - : "bag transfer did not succeed"); + localChecksum = Hex.encodeHexString(dis.getMessageDigest().digest()); + String remoteChecksum = oversizedFileBlob.getMd5ToHexString(); + logger.fine("Oversized file: " + fileKey + " uploaded"); + logger.fine("Local checksum: " + localChecksum); + logger.fine("Remote checksum: " + remoteChecksum); + if (!localChecksum.equals(remoteChecksum)) { + logger.severe("Oversized file checksum mismatch!"); + logger.severe("Local: " + localChecksum + " != Remote: " + remoteChecksum); try { - bag.delete(Blob.BlobSourceOption.generationMatch()); + oversizedFileBlob.delete(Blob.BlobSourceOption.generationMatch()); } catch (StorageException se) { logger.warning(se.getMessage()); } - return new Failure("Error in transferring Zip file to GoogleCloud", - "GoogleCloud Submission Failure: incomplete archive transfer"); + return new Failure("Error in transferring oversized file to GoogleCloud", + "GoogleCloud Submission Failure: oversized file transfer incomplete"); } + } catch (IOException e) { + logger.warning("Failed to upload oversized file: " + childPath + " : " + e.getMessage()); + return new Failure("Error uploading oversized file to Google Cloud: " + childPath); } + } - logger.fine("GoogleCloud Submission step: Content Transferred"); - - // Document the location of dataset archival copy location (actually the URL - // where you can view it as an admin) - // Changed to point at bucket where the zip and datacite.xml are visible + // Document the location of dataset archival copy location (actually the URL + // to the bucket). 
+                statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS);
+                statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE,
+                        String.format("https://storage.cloud.google.com/%s/%s", bucketName, spaceName));
-                    StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/");
-                    sb.append(bucketName + "/" + spaceName);
-                    statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS);
-                    statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString());
-
-                }
             } else {
-                logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister");
+                logger.warning(
+                        "GoogleCloud Archiver Submission Workflow aborted: Dataset locked for publication/pidRegister");
+                return new Failure("Dataset locked");
             }
         } catch (Exception e) {
@@ -177,11 +248,21 @@ public void run() {
                     e.getLocalizedMessage() + ": check log for details");
         } finally {
+            if (tempBagFile != null) {
+                try {
+                    Files.deleteIfExists(tempBagFile);
+                } catch (IOException e) {
+                    logger.warning("Failed to delete temporary bag file: " + tempBagFile + " : " + e.getMessage());
+                }
+            }
             dv.setArchivalCopyLocation(statusObject.build().toString());
         }
         return WorkflowStepResult.OK;
-    } else {
-        return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\".");
+    } else
+
+    {
+        return new Failure(
+                "GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\".");
     }
 }

diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java
index 462879f2ec9..76d7ae87f38 100644
--- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java
+++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java
@@ -10,6 +10,7 @@
 import edu.harvard.iq.dataverse.engine.command.RequiredPermissions;
 import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagItLocalPath;
 import edu.harvard.iq.dataverse.util.bagit.BagGenerator;
+import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry;
 import edu.harvard.iq.dataverse.util.bagit.OREMap;
 import edu.harvard.iq.dataverse.workflow.step.Failure;
 import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult;
@@ -23,6 +24,7 @@
 import java.io.File;
 import java.io.FileOutputStream;
+import java.io.InputStream;
 
 import org.apache.commons.io.FileUtils;
 
@@ -63,12 +65,22 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t
                     new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"),
                     dataciteXml, StandardCharsets.UTF_8);
             BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml);
-            bagger.setNumConnections(getNumberOfBagGeneratorThreads());
             bagger.setAuthenticationKey(token.getTokenString());
             zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip";
             //ToDo: generateBag(File f, true) seems to do the same thing (with a .tmp extension) - since we don't have to use a stream here, could probably just reuse the existing code?
            bagger.generateBag(new FileOutputStream(zipName + ".partial"));
+            // Now download any files that were too large for the bag
+            for (FileEntry entry : bagger.getOversizedFiles()) {
+                String childPath = entry.getChildPath(entry.getChildTitle());
+                File destFile = new File(localPath, spaceName + "v" + dv.getFriendlyVersionNumber() + "/" + childPath);
+                logger.fine("Downloading oversized file to " + destFile.getAbsolutePath());
+                destFile.getParentFile().mkdirs();
+                try (InputStream is = bagger.getInputStreamSupplier(entry.getDataUrl()).get()) {
+                    FileUtils.copyInputStreamToFile(is, destFile);
+                }
+            }
+
             File srcFile = new File(zipName + ".partial");
             File destFile = new File(zipName);

diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java
index 65531d775c8..072fd0edb48 100644
--- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java
+++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java
@@ -9,6 +9,7 @@
 import edu.harvard.iq.dataverse.engine.command.RequiredPermissions;
 import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.S3ArchiverConfig;
 import edu.harvard.iq.dataverse.util.bagit.BagGenerator;
+import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry;
 import edu.harvard.iq.dataverse.util.bagit.OREMap;
 import edu.harvard.iq.dataverse.util.json.JsonUtil;
 import edu.harvard.iq.dataverse.workflow.step.Failure;
@@ -17,9 +18,15 @@
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
+import java.util.List;
 import java.util.Map;
 import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.logging.Level;
 import java.util.logging.Logger;
 
 import jakarta.annotation.Resource;
@@ -28,6 +35,7 @@
 import jakarta.json.JsonObject;
 import jakarta.json.JsonObjectBuilder;
 
+import org.apache.commons.compress.parallel.InputStreamSupplier;
 import org.eclipse.microprofile.config.Config;
 import org.eclipse.microprofile.config.ConfigProvider;
@@ -55,8 +63,11 @@
 import software.amazon.awssdk.utils.StringUtils;
 import software.amazon.awssdk.transfer.s3.S3TransferManager;
 import software.amazon.awssdk.transfer.s3.model.CompletedFileUpload;
+import software.amazon.awssdk.transfer.s3.model.CompletedUpload;
 import software.amazon.awssdk.transfer.s3.model.FileUpload;
+import software.amazon.awssdk.transfer.s3.model.Upload;
 import software.amazon.awssdk.transfer.s3.model.UploadFileRequest;
+import software.amazon.awssdk.transfer.s3.model.UploadRequest;
 
 @RequiredPermissions(Permission.PublishDataset)
 public class S3SubmitToArchiveCommand extends AbstractSubmitToArchiveCommand {
@@ -98,7 +109,8 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t
         JsonObjectBuilder statusObject = Json.createObjectBuilder();
         statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE);
         statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred");
-
+        ExecutorService executor = Executors.newCachedThreadPool();
+
         try {
             Dataset dataset = dv.getDataset();
 
@@ -150,7 +162,39 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t
                 if 
(uploadResult.response().sdkHttpResponse().isSuccessful()) { logger.fine("S3 Submission step: Content Transferred"); + List bigFiles = bagger.getOversizedFiles(); + + for (FileEntry entry : bigFiles) { + String childPath = entry.getChildPath(entry.getChildTitle()); + String fileKey = spaceName + "/" + childPath; + InputStreamSupplier supplier = bagger.getInputStreamSupplier(entry.getDataUrl()); + try (InputStream is = supplier.get()) { + + PutObjectRequest filePutRequest = PutObjectRequest.builder().bucket(bucketName) + .key(fileKey).build(); + + UploadRequest uploadRequest = UploadRequest.builder() + .putObjectRequest(filePutRequest) + .requestBody(AsyncRequestBody.fromInputStream(is, entry.getSize(), executor)) + .build(); + + Upload upload = tm.upload(uploadRequest); + CompletedUpload completedUpload = upload.completionFuture().join(); + + if (completedUpload.response().sdkHttpResponse().isSuccessful()) { + logger.fine("Successfully uploaded oversized file: " + fileKey); + } else { + logger.warning("Failed to upload oversized file: " + fileKey); + return new Failure("Error uploading oversized file to S3: " + fileKey); + } + } catch (IOException e) { + logger.log(Level.WARNING, + "Failed to get input stream for oversized file: " + fileKey, e); + return new Failure("Error getting input stream for oversized file: " + fileKey); + } + } statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, String.format("https://%s.s3.amazonaws.com/%s", bucketName, bagKey)); } else { @@ -175,6 +219,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t e.getLocalizedMessage() + ": check log for details"); } finally { + executor.shutdown(); if (tm != null) { tm.close(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 60cabc9ac99..55235f85491 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -1289,7 +1289,7 @@ private HttpGet createNewGetRequest(URI url, String returnType) { * * Caller must close the stream when done. 
 */
-    InputStreamSupplier getInputStreamSupplier(final String uriString) {
+    public InputStreamSupplier getInputStreamSupplier(final String uriString) {
         return new InputStreamSupplier() {
             public InputStream get() {
@@ -1485,5 +1485,9 @@ private String suppressDownloadCounts(String uriString) {
         public int compareTo(FileEntry other) {
             return Long.compare(this.size, other.size);
         }
+
+        public long getSize() {
+            return size;
+        }
     }
 }
\ No newline at end of file

From 5739e3521fefd90c4f7a5c1c7940f25acd670294 Mon Sep 17 00:00:00 2001
From: qqmyers
Date: Wed, 4 Feb 2026 15:08:58 -0500
Subject: [PATCH 38/39] docs, release note update

---
 doc/release-notes/12144-un-holey-bags.md      | 21 +++++++++++++++++++
 .../source/admin/big-data-administration.rst  |  1 +
 .../source/installation/config.rst            | 17 +++++++++++++++
 3 files changed, 39 insertions(+)
 create mode 100644 doc/release-notes/12144-un-holey-bags.md

diff --git a/doc/release-notes/12144-un-holey-bags.md b/doc/release-notes/12144-un-holey-bags.md
new file mode 100644
index 00000000000..3c9c632eb6c
--- /dev/null
+++ b/doc/release-notes/12144-un-holey-bags.md
@@ -0,0 +1,21 @@
+This release contains multiple updates to the OAI-ORE metadata export and archival Bag output:
+
+OAI-ORE
+- now uses URIs for checksum algorithms
+- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (null has been allowed via the API) has been fixed
+- the "https://schema.org/additionalType" value is updated to "Dataverse OREMap Format v1.0.2" to indicate that the output has changed
+
+Archival Bag
+- for dataset versions with no files, the (empty) manifest-<algorithm>.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5"
+- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed
+- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address) - see the folding example after this list
+- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping)
+- a new key, "Dataverse-Bag-Version", has been added to bag-info.txt with a value of "1.0", allowing tracking of changes to Dataverse's archival bag generation
+- improvements to file retrieval w.r.t. retries on errors or throttling
+- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse
+- the size of data files and total dataset size that will be included in an archival bag can now be limited. Admins can choose whether files above these limits are transferred along with the zipped bag (creating a complete archival copy) or are just referenced (using the concept of a "holey" bag that lists the oversized files and the Dataverse URLs from which they can be retrieved). In the holey bag case, an active service on the archiving platform must retrieve the oversized files (using appropriate credentials as needed) to make a complete copy.
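+
+For illustration (the address below is invented), a multi-line value in bag-info.txt is folded per the BagIt specification by indenting each continuation line:
+
+    Organization-Address: 1 Example Plaza,
+      Suite 100, Anytown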
+ +### New JVM Options (MicroProfile Config Settings) +dataverse.bagit.zip.holey +dataverse.bagit.zip.max-data-size +dataverse.bagit.zip.max-file-size \ No newline at end of file diff --git a/doc/sphinx-guides/source/admin/big-data-administration.rst index c4a98a6987a..c1d2a02c4a2 100644 --- a/doc/sphinx-guides/source/admin/big-data-administration.rst +++ b/doc/sphinx-guides/source/admin/big-data-administration.rst @@ -302,6 +302,7 @@ There are a broad range of options (that are not turned on by default) for impro - :ref:`:DisableSolrFacetsWithoutJsession` - disables facets for users who have disabled cookies (e.g. for bots) - :ref:`:DisableUncheckedTypesFacet` - only disables the facet showing the number of collections, datasets, files matching the query (this facet is potentially less useful than others) - :ref:`:StoreIngestedTabularFilesWithVarHeaders` - by default, Dataverse stores ingested files without headers and dynamically adds them back at download time. Once this setting is enabled, Dataverse will leave the headers in place (for newly ingested files), reducing the cost of downloads +- :ref:`dataverse.bagit.zip.max-file-size`, :ref:`dataverse.bagit.zip.max-data-size`, and :ref:`dataverse.bagit.zip.holey` - options to control the size and temporary storage requirements when generating archival Bags - see :ref:`BagIt Export` Scaling Infrastructure diff --git a/doc/sphinx-guides/source/installation/config.rst index a9d5c7c0041..fff7a747063 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2259,6 +2259,8 @@ These archival Bags include all of the files and metadata in a given dataset ver The Dataverse Software offers an internal archive workflow which may be configured as a PostPublication workflow via an admin API call to manually submit previously published Datasets and prior versions to a configured archive such as Chronopolis. The workflow creates a `JSON-LD `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse Software web interface. +The size of the zipped archival Bag can now be limited for all archivers. Files that don't fit within that limit can either be transferred separately (placed so that they are correctly positioned according to the BagIt specification when the zipped bag is unzipped in place) or just referenced for later download (using the BagIt concept of a 'holey' bag with a list of files in a ``fetch.txt`` file). These settings allow large datasets to be managed by excluding files over a certain size or total data size, which can be useful for archivers with size limitations or to reduce transfer times. See the :ref:`dataverse.bagit.zip.max-file-size`, :ref:`dataverse.bagit.zip.max-data-size`, and :ref:`dataverse.bagit.zip.holey` JVM options for more details, and the example below. +
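+For example, to produce holey bags that leave any file over roughly 4 GB out of the zip (illustrative values, assuming a Payara installation; any MicroProfile Config source works as well):
+
+.. code-block:: bash
+
+  ./asadmin create-jvm-options "-Ddataverse.bagit.zip.max-file-size=4000000000"
+  ./asadmin create-jvm-options "-Ddataverse.bagit.zip.holey=true"
+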
At present, archiving classes include the DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, GoogleCloudSubmitToArchiveCommand, and S3SubmitToArchiveCommand, which all extend the AbstractSubmitToArchiveCommand and use the configurable mechanisms discussed below. (A DRSSubmitToArchiveCommand, which works with Harvard's DRS, also exists and, while specific to DRS, is a useful example of how Archivers can support single-version-only semantics and archiving only from specified collections (with collection-specific parameters).) All current options support the :ref:`Archival Status API` calls, and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). @@ -3868,6 +3870,21 @@ This can instead be restricted to only superusers who can publish the dataset us Example: ``dataverse.coar-notify.relationship-announcement.notify-superusers-only=true`` +.. _dataverse.bagit.zip.holey: + +``dataverse.bagit.zip.holey`` + A boolean that, if true, will cause the BagIt archiver to create a "holey" bag. In a holey bag, files that are not included in the bag are listed in the ``fetch.txt`` file with a URL from which they can be downloaded. This is used in conjunction with ``dataverse.bagit.zip.max-file-size`` and/or ``dataverse.bagit.zip.max-data-size``. Default: false. + +.. _dataverse.bagit.zip.max-data-size: + +``dataverse.bagit.zip.max-data-size`` + The maximum total (uncompressed) size of data files (in bytes) to include in a BagIt zip archive. If the total size of the dataset files exceeds this limit, files will be excluded from the zipped bag (starting from the largest) until the total size is under the limit. Excluded files are handled as defined by ``dataverse.bagit.zip.holey``: listed in ``fetch.txt`` if that setting is true, or transferred separately and placed next to the zipped bag otherwise. When not set, there is no limit. + +.. _dataverse.bagit.zip.max-file-size: + +``dataverse.bagit.zip.max-file-size`` + The maximum (uncompressed) size of a single file (in bytes) to include in a BagIt zip archive. Any file larger than this will be excluded. Excluded files are handled as defined by ``dataverse.bagit.zip.holey``: listed in ``fetch.txt`` if that setting is true, or transferred separately and placed next to the zipped bag otherwise. When not set, there is no limit. + .. _feature-flags: Feature Flags From 5c82ab8504579b6204105485dce96c12dea6fe89 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 4 Feb 2026 15:09:53 -0500 Subject: [PATCH 39/39] style fix --- .../command/impl/GoogleCloudSubmitToArchiveCommand.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 21038a1eab6..17e7b641cf6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -258,9 +258,7 @@ public void run() { dv.setArchivalCopyLocation(statusObject.build().toString()); } return WorkflowStepResult.OK; - } else - - { + } else { return new Failure( "GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); }
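For context, the largest-first exclusion described for dataverse.bagit.zip.max-data-size can be sketched as below. This is a minimal illustration of the documented behavior, not the BagGenerator code from these patches: the Entry record and selectOversized method are hypothetical, and the settings are read via the standard MicroProfile Config API.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.eclipse.microprofile.config.Config;
import org.eclipse.microprofile.config.ConfigProvider;

public class OversizeSelectionSketch {

    // Hypothetical stand-in for BagGenerator.FileEntry; only the size matters here.
    record Entry(String path, long size) {}

    // Returns the files to leave out of the zipped bag: first anything over
    // max-file-size, then the largest remaining files until the total fits
    // under max-data-size (the documented largest-first rule).
    static List<Entry> selectOversized(List<Entry> files) {
        Config config = ConfigProvider.getConfig();
        long maxFile = config.getOptionalValue("dataverse.bagit.zip.max-file-size", Long.class)
                .orElse(Long.MAX_VALUE);
        long maxData = config.getOptionalValue("dataverse.bagit.zip.max-data-size", Long.class)
                .orElse(Long.MAX_VALUE);

        List<Entry> oversized = new ArrayList<>();
        List<Entry> candidates = new ArrayList<>();
        for (Entry e : files) {
            // Per-file limit: anything larger is excluded outright.
            (e.size() > maxFile ? oversized : candidates).add(e);
        }
        // Total limit: drop the largest candidates until the rest fits.
        candidates.sort(Comparator.comparingLong(Entry::size).reversed());
        long total = candidates.stream().mapToLong(Entry::size).sum();
        for (Entry e : candidates) {
            if (total <= maxData) {
                break; // everything that remains goes into the zipped bag
            }
            oversized.add(e); // fetched via fetch.txt or shipped beside the bag
            total -= e.size();
        }
        return oversized;
    }
}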