From f8f7739423c1f1af8fa7b1d1092b73523181a285 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Wed, 7 Jan 2026 13:35:19 -0500 Subject: [PATCH 01/14] initial impl --- .../impl/AbstractSubmitToArchiveCommand.java | 83 +++++++++++++++++-- .../settings/SettingsServiceBean.java | 6 ++ .../ArchivalSubmissionWorkflowStep.java | 2 +- 3 files changed, 82 insertions(+), 9 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 29c27d0396d..b4400e7b957 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -15,15 +15,21 @@ import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.pidproviders.doi.datacite.DOIDataCiteRegisterService; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key; +import edu.harvard.iq.dataverse.util.ListSplitUtil; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; import java.io.IOException; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.security.DigestInputStream; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.logging.Logger; @@ -45,14 +51,16 @@ public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion @Override public DatasetVersion execute(CommandContext ctxt) throws CommandException { + + String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); - String[] settingsArray = settings.split(","); - for (String setting : settingsArray) { - setting = setting.trim(); - if (!setting.startsWith(":")) { - logger.warning("Invalid Archiver Setting: " + setting); + List settingsList = ListSplitUtil.split(settings); + for (String settingName : settingsList) { + Key setting = Key.parse(settingName); + if (setting == null) { + logger.warning("Invalid Archiver Setting: " + settingName); } else { - requestedSettings.put(setting, ctxt.settings().get(setting)); + requestedSettings.put(settingName, ctxt.settings().getValueForKey(setting)); } } @@ -62,22 +70,81 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { //No un-expired token token = ctxt.authentication().generateApiTokenForUser(user); } - performArchiveSubmission(version, token, requestedSettings); + runArchivingProcess(version, token, requestedSettings); return ctxt.em().merge(version); } + /** + * Note that this method may be called from the execute method above OR from a + * workflow in which execute() is never called and therefore in which all + * variables must be sent as method parameters. (Nominally version is set in the + * constructor and could be dropped from the parameter list.) 
+ * @param ctxt + * + * @param version - the DatasetVersion to archive + * @param token - an API Token for the user performing this action + * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). + */ + public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken token, Map requestedSetttings) { + // Check if earlier versions must be archived first + String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived.toString()); + boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue); + if (requireEarlierArchived) { + + Dataset dataset = version.getDataset(); + List versions = dataset.getVersions(); + + // Check all earlier versions (those with version numbers less than current) + for (DatasetVersion earlierVersion : versions) { + // Skip the current version and any versions that come after it + if (earlierVersion.getId().equals(version.getId())) { + continue; + } + + // Compare version numbers to ensure we only check earlier versions + if (earlierVersion.getVersionNumber() != null && version.getVersionNumber() != null) { + if (earlierVersion.getVersionNumber() < version.getVersionNumber() + || (earlierVersion.getVersionNumber().equals(version.getVersionNumber()) + && earlierVersion.getMinorVersionNumber() < version.getMinorVersionNumber())) { + + // Check if this earlier version has been successfully archived + String archivalStatus = earlierVersion.getArchivalCopyLocationStatus(); + if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) +// || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE) + ) { + JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + "Successful archiving of earlier versions is required."); + version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); + return new Failure( + "Earlier versions must be successfully archived first", + "Archival prerequisites not met" + ); + } + } + } + } + } + // Delegate to the archiver-specific implementation + return performArchiveSubmission(version, token, requestedSettings); + } + + /** * This method is the only one that should be overwritten by other classes. Note * that this method may be called from the execute method above OR from a * workflow in which execute() is never called and therefore in which all * variables must be sent as method parameters. (Nominally version is set in the * constructor and could be dropped from the parameter list.) + * @param ctxt * * @param version - the DatasetVersion to archive * @param token - an API Token for the user performing this action * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). 
*/ - abstract public WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, Map requestedSetttings); + protected abstract WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, + Map requestedSettings); protected int getNumberOfBagGeneratorThreads() { if (requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS) != null) { diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 37d26995017..d8495a2dc8a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -485,6 +485,12 @@ Whether Harvesting (OAI) service is enabled */ ArchiverClassName, + /* + * Only create an archival Bag for a dataset version if all prior versions have + * been successfully archived + */ + ArchiverOnlyIfEarlierVersionsAreArchived, + /** * Custom settings for each archiver. See list below. */ diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java index b0567bff107..3e3962d0334 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/internalspi/ArchivalSubmissionWorkflowStep.java @@ -49,7 +49,7 @@ public WorkflowStepResult run(WorkflowContext context) { String className = requestedSettings.get(SettingsServiceBean.Key.ArchiverClassName.toString()); AbstractSubmitToArchiveCommand archiveCommand = ArchiverUtil.createSubmitToArchiveCommand(className, dvr, context.getDataset().getReleasedVersion()); if (archiveCommand != null) { - return (archiveCommand.performArchiveSubmission(context.getDataset().getReleasedVersion(), context.getApiToken(), requestedSettings)); + return (archiveCommand.runArchivingProcess(context.getDataset().getReleasedVersion(), context.getApiToken(), requestedSettings)); } else { logger.severe("No Archiver instance could be created for name: " + className); return new Failure("No Archiver", "Could not create instance of class: " + className); From 5bd6f8d92581ed8ee6e65b4cb394d0f67be804cc Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 8 Jan 2026 09:25:41 -0500 Subject: [PATCH 02/14] fix requestedSettings handling --- .../impl/AbstractSubmitToArchiveCommand.java | 15 ++++++--------- .../command/impl/DRSSubmitToArchiveCommand.java | 5 ++--- .../impl/DuraCloudSubmitToArchiveCommand.java | 3 +-- .../impl/GoogleCloudSubmitToArchiveCommand.java | 2 +- .../command/impl/LocalSubmitToArchiveCommand.java | 3 +-- .../command/impl/S3SubmitToArchiveCommand.java | 3 +-- .../dataverse/workflow/WorkflowServiceBean.java | 4 ++-- 7 files changed, 14 insertions(+), 21 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index b4400e7b957..bcb8f37dede 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -3,7 +3,6 @@ import edu.harvard.iq.dataverse.DataCitation; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetVersion; -import 
edu.harvard.iq.dataverse.DvObject; import edu.harvard.iq.dataverse.SettingsWrapper; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.ApiToken; @@ -37,7 +36,7 @@ public abstract class AbstractSubmitToArchiveCommand extends AbstractCommand { private final DatasetVersion version; - private final Map requestedSettings = new HashMap(); + protected final Map requestedSettings = new HashMap(); protected boolean success=false; private static final Logger logger = Logger.getLogger(AbstractSubmitToArchiveCommand.class.getName()); private static final int MAX_ZIP_WAIT = 20000; @@ -50,8 +49,6 @@ public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion @Override public DatasetVersion execute(CommandContext ctxt) throws CommandException { - - String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); List settingsList = ListSplitUtil.split(settings); @@ -85,7 +82,9 @@ public DatasetVersion execute(CommandContext ctxt) throws CommandException { * @param token - an API Token for the user performing this action * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). */ - public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken token, Map requestedSetttings) { + public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken token, Map requestedSettings) { + // this.requestedSettings won't be set yet in the workflow case, so set it now (used in getNumberOfBagGeneratorThreads) + this.requestedSettings.putAll(requestedSettings); // Check if earlier versions must be archived first String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived.toString()); boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue); @@ -127,7 +126,7 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t } } // Delegate to the archiver-specific implementation - return performArchiveSubmission(version, token, requestedSettings); + return performArchiveSubmission(version, token); } @@ -141,10 +140,8 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t * * @param version - the DatasetVersion to archive * @param token - an API Token for the user performing this action - * @param requestedSettings - a map of the names/values for settings required by this archiver (sent because this class is not part of the EJB context (by design) and has no direct access to service beans). 
*/ - protected abstract WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token, - Map requestedSettings); + protected abstract WorkflowStepResult performArchiveSubmission(DatasetVersion version, ApiToken token); protected int getNumberOfBagGeneratorThreads() { if (requestedSettings.get(BagGenerator.BAG_GENERATOR_THREADS) != null) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java index 78e8454255b..01b9b4621e1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DRSSubmitToArchiveCommand.java @@ -82,8 +82,7 @@ public DRSSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion versi } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { logger.fine("In DRSSubmitToArchiveCommand..."); JsonObject drsConfigObject = null; @@ -113,7 +112,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t JsonObject collectionConfig = adminMetadata.getJsonObject(COLLECTIONS).getJsonObject(alias); - WorkflowStepResult s3Result = super.performArchiveSubmission(dv, token, requestedSettings); + WorkflowStepResult s3Result = super.performArchiveSubmission(dv, token); JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index fe4a25091d7..71855abd927 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -49,8 +49,7 @@ public DuraCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { String port = requestedSettings.get(DURACLOUD_PORT) != null ? 
requestedSettings.get(DURACLOUD_PORT) : DEFAULT_PORT; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 7dfb9f07e19..5d27e71583b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -45,7 +45,7 @@ public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersi } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { logger.fine("In GoogleCloudSubmitToArchiveCommand..."); String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 462879f2ec9..d590e605985 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -36,8 +36,7 @@ public LocalSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion ver } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { logger.fine("In LocalCloudSubmitToArchive..."); String localPath = requestedSettings.get(BagItLocalPath.toString()); String zipName = null; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java index 65531d775c8..e3d5a0d8ae0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java @@ -78,8 +78,7 @@ public S3SubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion versio } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, - Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) { logger.fine("In S3SubmitToArchiveCommand..."); JsonObject configObject = null; diff --git a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java index ae1175f0e1d..fce13d1c181 100644 --- a/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/workflow/WorkflowServiceBean.java @@ -180,12 +180,12 @@ private Map retrieveRequestedSettings(Map requir break; } case "boolean": { - retrievedSettings.put(setting, settings.isTrue(settingType, false)); + retrievedSettings.put(setting, settings.isTrue(setting, false)); break; } case "long": { retrievedSettings.put(setting, - settings.getValueForKeyAsLong(SettingsServiceBean.Key.valueOf(setting))); + settings.getValueForKeyAsLong(SettingsServiceBean.Key.parse(setting))); break; } } From 
4aaf6ca3ceff1f772dad5821e3f7a8b76342060d Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 8 Jan 2026 09:26:00 -0500 Subject: [PATCH 03/14] efficiency improvement --- .../iq/dataverse/settings/SettingsServiceBean.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index d8495a2dc8a..1c67cb85060 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -802,16 +802,13 @@ public static SettingsServiceBean.Key parse(String key) { // Cut off the ":" we verified is present before String normalizedKey = key.substring(1); - // Iterate through all the known keys and return on match (case sensitive!) // We are case sensitive here because Dataverse implicitely uses case sensitive keys everywhere! - for (SettingsServiceBean.Key k : SettingsServiceBean.Key.values()) { - if (k.name().equals(normalizedKey)) { - return k; - } + try { + return SettingsServiceBean.Key.valueOf(normalizedKey); + } catch (IllegalArgumentException e) { + // Fall through on no match - return null for invalid keys + return null; } - - // Fall through on no match - return null; } } From 7cdef818079a6c3aa253063e1c68b41a54c4c0ed Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 8 Jan 2026 09:40:53 -0500 Subject: [PATCH 04/14] QDR fixes transx timeout, ignored bag thread setting, add deletable --- .../impl/AbstractSubmitToArchiveCommand.java | 18 +++++++++++++++++- .../iq/dataverse/util/bagit/BagGenerator.java | 8 ++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index bcb8f37dede..98e9dfb68e1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -20,6 +20,9 @@ import edu.harvard.iq.dataverse.util.bagit.OREMap; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; + +import jakarta.ejb.TransactionAttribute; +import jakarta.ejb.TransactionAttributeType; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; @@ -48,6 +51,7 @@ public AbstractSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion } @Override + @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public DatasetVersion execute(CommandContext ctxt) throws CommandException { String settings = ctxt.settings().getValueForKey(SettingsServiceBean.Key.ArchiverSettings); @@ -174,8 +178,8 @@ public Thread startBagThread(DatasetVersion dv, PipedInputStream in, DigestInput public void run() { try (PipedOutputStream out = new PipedOutputStream(in)) { // Generate bag + BagGenerator.setNumConnections(getNumberOfBagGeneratorThreads()); BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setNumConnections(getNumberOfBagGeneratorThreads()); bagger.setAuthenticationKey(token.getTokenString()); bagger.generateBag(out); success = true; @@ -247,4 +251,16 @@ public static boolean isSingleVersion(SettingsWrapper settingsWrapper) { public static boolean isSingleVersion(SettingsServiceBean settingsService) { return 
false; } + + /** Whether the archiver can delete existing archival files (and thus can retry when the existing files are incomplete/obsolete) + * A static version supports calls via reflection while the instance method supports inheritance for use on actual command instances (see DatasetPage for both use cases). + * @return + */ + public static boolean supportsDelete() { + return false; + } + + public boolean canDelete() { + return supportsDelete(); + } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f24ebdb8655..3035694ae3d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -120,7 +120,7 @@ public class BagGenerator { private boolean usetemp = false; - private int numConnections = 8; + private static int numConnections = 2; public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); private OREMap oremap; @@ -1124,9 +1124,9 @@ public void setAuthenticationKey(String tokenString) { apiKey = tokenString; } - public void setNumConnections(int numConnections) { - this.numConnections = numConnections; - logger.fine("BagGenerator will use " + numConnections + " threads"); + public static void setNumConnections(int numConnections) { + BagGenerator.numConnections = numConnections; + logger.fine("All BagGenerators will use " + numConnections + " threads"); } } \ No newline at end of file From 67e01e050d933005f69b4bae93678c865671130a Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 16 Dec 2025 10:35:40 -0500 Subject: [PATCH 05/14] archival submit fix - per version cache --- .../edu/harvard/iq/dataverse/DatasetPage.java | 28 +++++++++++++------ src/main/webapp/dataset-versions.xhtml | 4 +-- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 20617160a1c..8eba6cbeab9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -387,7 +387,7 @@ public void setSelectedHostDataverse(Dataverse selectedHostDataverse) { private boolean showIngestSuccess; private Boolean archivable = null; - private Boolean versionArchivable = null; + private HashMap versionArchivable = new HashMap<>(); private Boolean someVersionArchived = null; public boolean isShowIngestSuccess() { @@ -6147,10 +6147,11 @@ public boolean isArchivable() { return archivable; } - public boolean isVersionArchivable() { - if (versionArchivable == null) { + public boolean isVersionArchivable(Long id) { + Boolean thisVersionArchivable = versionArchivable.get(id); + if (thisVersionArchivable == null) { // If this dataset isn't in an archivable collection return false - versionArchivable = false; + thisVersionArchivable = false; if (isArchivable()) { boolean checkForArchivalCopy = false; // Otherwise, we need to know if the archiver is single-version-only @@ -6167,11 +6168,19 @@ public boolean isVersionArchivable() { if (checkForArchivalCopy) { // If we have to check (single version archiving), we can't allow archiving if // one version is already archived (or attempted - any non-null status) - versionArchivable = !isSomeVersionArchived(); + thisVersionArchivable = !isSomeVersionArchived(); } else { - // If we allow multiple versions or didn't find one that has had archiving run - // on it, we can archive, 
so return true - versionArchivable = true; + // If we didn't find one that has had archiving run + // on it, or archiving per version is supported and either + // the status is null or the archiver can delete prior runs and status isn't success, + // we can archive, so return true + // Find the specific version by id + DatasetVersion targetVersion = dataset.getVersions().stream() + .filter(v -> v.getId().equals(id)) + .findFirst() + .orElse(null); + String status = targetVersion.getArchivalCopyLocationStatus(); + thisVersionArchivable = (status == null) || ((!status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) && (!status.equals(DatasetVersion.ARCHIVAL_STATUS_PENDING)) && supportsDelete)); } } catch (ClassNotFoundException | IllegalAccessException | IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) { @@ -6180,8 +6189,9 @@ public boolean isVersionArchivable() { } } } + versionArchivable.put(id, thisVersionArchivable); } - return versionArchivable; + return thisVersionArchivable; } public boolean isSomeVersionArchived() { diff --git a/src/main/webapp/dataset-versions.xhtml b/src/main/webapp/dataset-versions.xhtml index 9e5f0a9b24d..ee726bb5eee 100644 --- a/src/main/webapp/dataset-versions.xhtml +++ b/src/main/webapp/dataset-versions.xhtml @@ -171,11 +171,11 @@ - - + From 50e8c61a8e1a878a7056f0da980e1a9e5271f957 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 29 Jan 2026 17:20:31 -0500 Subject: [PATCH 06/14] Add check to display submit button only if prior versions are archvd --- .../edu/harvard/iq/dataverse/DatasetPage.java | 34 +++++++++++++------ .../dataverse/FileMetadataVersionsHelper.java | 14 ++------ .../iq/dataverse/dataset/DatasetUtil.java | 17 ++++++++++ .../iq/dataverse/util/ArchiverUtil.java | 11 ++++++ 4 files changed, 53 insertions(+), 23 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 8eba6cbeab9..375489484c0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -387,6 +387,8 @@ public void setSelectedHostDataverse(Dataverse selectedHostDataverse) { private boolean showIngestSuccess; private Boolean archivable = null; + private Boolean checkForArchivalCopy; + private Boolean supportsDelete; private HashMap versionArchivable = new HashMap<>(); private Boolean someVersionArchived = null; @@ -6152,19 +6154,33 @@ public boolean isVersionArchivable(Long id) { if (thisVersionArchivable == null) { // If this dataset isn't in an archivable collection return false thisVersionArchivable = false; + boolean requiresEarlierVersionsToBeArchived = settingsWrapper.isTrueForKey(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived, false); if (isArchivable()) { - boolean checkForArchivalCopy = false; // Otherwise, we need to know if the archiver is single-version-only // If it is, we have to check for an existing archived version to answer the // question String className = settingsWrapper.getValueForKey(SettingsServiceBean.Key.ArchiverClassName, null); if (className != null) { try { - Class clazz = Class.forName(className); - Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class); - Object[] params = { settingsWrapper }; - checkForArchivalCopy = (Boolean) m.invoke(null, params); - + DatasetVersion targetVersion = dataset.getVersions().stream() + .filter(v -> v.getId().equals(id)).findFirst().orElse(null); + if 
(requiresEarlierVersionsToBeArchived) {// Find the specific version by id + DatasetVersion priorVersion = DatasetUtil.getPriorVersion(targetVersion); + + if (priorVersion== null || (isVersionArchivable(priorVersion.getId()) + && ArchiverUtil.isVersionArchived(priorVersion))) { + thisVersionArchivable = true; + } + } + if (checkForArchivalCopy == null) { + //Only check once + Class clazz = Class.forName(className); + Method m = clazz.getMethod("isSingleVersion", SettingsWrapper.class); + Method m2 = clazz.getMethod("supportsDelete"); + Object[] params = { settingsWrapper }; + checkForArchivalCopy = (Boolean) m.invoke(null, params); + supportsDelete = (Boolean) m2.invoke(null); + } if (checkForArchivalCopy) { // If we have to check (single version archiving), we can't allow archiving if // one version is already archived (or attempted - any non-null status) @@ -6175,16 +6191,12 @@ public boolean isVersionArchivable(Long id) { // the status is null or the archiver can delete prior runs and status isn't success, // we can archive, so return true // Find the specific version by id - DatasetVersion targetVersion = dataset.getVersions().stream() - .filter(v -> v.getId().equals(id)) - .findFirst() - .orElse(null); String status = targetVersion.getArchivalCopyLocationStatus(); thisVersionArchivable = (status == null) || ((!status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) && (!status.equals(DatasetVersion.ARCHIVAL_STATUS_PENDING)) && supportsDelete)); } } catch (ClassNotFoundException | IllegalAccessException | IllegalArgumentException | InvocationTargetException | NoSuchMethodException | SecurityException e) { - logger.warning("Failed to call isSingleVersion on configured archiver class: " + className); + logger.warning("Failed to call methods on configured archiver class: " + className); e.printStackTrace(); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java b/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java index 4d408a72c8c..cc632054642 100644 --- a/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/FileMetadataVersionsHelper.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse; import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.dataset.DatasetUtil; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; @@ -95,18 +96,7 @@ private FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata, FileMeta //TODO: this could use some refactoring to cut down on the number of for loops! 
    private FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata, DatasetVersion currentversion) {
        List allfiles = allRelatedFiles(fileMetadata);
-        boolean foundCurrent = false;
-        DatasetVersion priorVersion = null;
-        for (DatasetVersion versionLoop : fileMetadata.getDatasetVersion().getDataset().getVersions()) {
-            if (foundCurrent) {
-                priorVersion = versionLoop;
-                break;
-            }
-            if (versionLoop.equals(currentversion)) {
-                foundCurrent = true;
-            }
-
-        }
+        DatasetVersion priorVersion = DatasetUtil.getPriorVersion(fileMetadata.getDatasetVersion());
        if (priorVersion != null && priorVersion.getFileMetadatasSorted() != null) {
            for (FileMetadata fmdTest : priorVersion.getFileMetadatasSorted()) {
                for (DataFile fileTest : allfiles) {
diff --git a/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java b/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java
index 2ce5471a523..79451a61a84 100644
--- a/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java
+++ b/src/main/java/edu/harvard/iq/dataverse/dataset/DatasetUtil.java
@@ -740,4 +740,21 @@ public static String getLocaleCurationStatusLabelFromString(String label) {
        }
        return localizedName;
    }
+
+    // Find the prior version - relies on version sorting by major/minor numbers
+    public static DatasetVersion getPriorVersion(DatasetVersion version) {
+        boolean foundCurrent = false;
+        DatasetVersion priorVersion = null;
+        for (DatasetVersion versionLoop : version.getDataset().getVersions()) {
+            if (foundCurrent) {
+                priorVersion = versionLoop;
+                break;
+            }
+            if (versionLoop.equals(version)) {
+                foundCurrent = true;
+            }
+
+        }
+        return priorVersion;
+    }
 }
diff --git a/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java
index 18ec6243d5a..7d03004f3f7 100644
--- a/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/ArchiverUtil.java
@@ -71,5 +71,16 @@ public static boolean isSomeVersionArchived(Dataset dataset) {
        return someVersionArchived;
    }
+
+    /**
+     * Checks if a version has been successfully archived.
+     *
+     * @param version the version to check
+     * @return true if the version has been successfully archived, false otherwise
+     */
+    public static boolean isVersionArchived(DatasetVersion version) {
+        String status = version.getArchivalCopyLocationStatus();
+        return status != null && status.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS);
+    }
 }
\ No newline at end of file

From 06428970ac2a7644f7ca7b5d6c929ad639c20c31 Mon Sep 17 00:00:00 2001
From: qqmyers
Date: Thu, 29 Jan 2026 18:06:43 -0500
Subject: [PATCH 07/14] setting name tweak, add docs, release note

---
 doc/release-notes/12122-archiving in sequence.md     |  3 +++
 doc/sphinx-guides/source/installation/config.rst     | 12 ++++++++++++
 .../java/edu/harvard/iq/dataverse/DatasetPage.java   |  2 +-
 .../command/impl/AbstractSubmitToArchiveCommand.java |  2 +-
 .../iq/dataverse/settings/SettingsServiceBean.java   |  2 +-
 5 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 doc/release-notes/12122-archiving in sequence.md

diff --git a/doc/release-notes/12122-archiving in sequence.md b/doc/release-notes/12122-archiving in sequence.md
new file mode 100644
index 00000000000..6f4373a1e31
--- /dev/null
+++ b/doc/release-notes/12122-archiving in sequence.md
@@ -0,0 +1,3 @@
+This release introduces an additional setting related to archival bag creation, ArchiveOnlyIfEarlierVersionsAreArchived (default false).
+If it is true, dataset versions must be archived in order. That is, all prior versions of a dataset must be archived before the latest version can be archived. +This is intended to support use cases where deduplication of files between dataset versions will be done (i.e. by a third-party service running at the archival copy location) and is a step towards supporting the Oxford Common File Layout (OCFL) as an archival format. diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index a9d5c7c0041..e0dffd10ac9 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -2263,6 +2263,13 @@ At present, archiving classes include the DuraCloudSubmitToArchiveCommand, Local All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers). +Two settings that can be used with all current Archivers are: + +- \:BagGeneratorThreads - the number of threads to use when adding data files to the zipped bag. The default is 2. Values of 4 or more may increase performance on larger machines but may cause problems if file access is throttled +- \:ArchiveOnlyIfEarlierVersionsAreArchived - when true, requires dataset versions to be archived in order by confirming that all prior versions have been successfully archived before allowing a new version to be archived. Default is false + +These must be included in the \:ArchiverSettings for the Archiver to work + .. _Duracloud Configuration: Duracloud Configuration @@ -5333,6 +5340,11 @@ This setting specifies which storage system to use by identifying the particular For examples, see the specific configuration above in :ref:`BagIt Export`. +:ArchiveOnlyIfEarlierVersionsAreArchived +++++++++++++++++++++++++++++++++++++++++ + +This setting, if true, only allows creation of an archival Bag for a dataset version if all prior versions have been successfully archived. 
The default is false (any version can be archived independently as long as other settings allow it).
+
 :ArchiverSettings
 +++++++++++++++++
diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
index 375489484c0..5b267007887 100644
--- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
+++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
@@ -6154,7 +6154,7 @@ public boolean isVersionArchivable(Long id) {
        if (thisVersionArchivable == null) {
            // If this dataset isn't in an archivable collection return false
            thisVersionArchivable = false;
-            boolean requiresEarlierVersionsToBeArchived = settingsWrapper.isTrueForKey(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived, false);
+            boolean requiresEarlierVersionsToBeArchived = settingsWrapper.isTrueForKey(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived, false);
            if (isArchivable()) {
                // Otherwise, we need to know if the archiver is single-version-only
                // If it is, we have to check for an existing archived version to answer the
diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java
index 98e9dfb68e1..72f45ab5d2b 100644
--- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java
+++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java
@@ -90,7 +90,7 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t
        // this.requestedSettings won't be set yet in the workflow case, so set it now (used in getNumberOfBagGeneratorThreads)
        this.requestedSettings.putAll(requestedSettings);
        // Check if earlier versions must be archived first
-        String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiverOnlyIfEarlierVersionsAreArchived.toString());
+        String requireEarlierArchivedValue = requestedSettings.get(SettingsServiceBean.Key.ArchiveOnlyIfEarlierVersionsAreArchived.toString());
        boolean requireEarlierArchived = Boolean.parseBoolean(requireEarlierArchivedValue);
        if (requireEarlierArchived) {

diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java
index fac136042ce..1cc9fda7645 100644
--- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java
@@ -489,7 +489,7 @@ Whether Harvesting (OAI) service is enabled
         * Only create an archival Bag for a dataset version if all prior versions have
         * been successfully archived
         */
-        ArchiverOnlyIfEarlierVersionsAreArchived,
+        ArchiveOnlyIfEarlierVersionsAreArchived,

        /**
         * Custom settings for each archiver. See list below.
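Taken together, patches 01, 02 and 07 above change the contract for archiver implementations: the shared prerequisite check (the new :ArchiveOnlyIfEarlierVersionsAreArchived ordering rule) now runs in runArchivingProcess(), settings arrive through the inherited requestedSettings map, and performArchiveSubmission() no longer takes a settings parameter. Below is a minimal sketch of a custom archiver under that contract; the class name, the :ExampleArchiverTarget setting, and the success path returning the usual WorkflowStepResult.OK are illustrative assumptions rather than part of this patch set.

package edu.harvard.iq.dataverse.engine.command.impl;

import java.util.logging.Logger;

import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.authorization.users.ApiToken;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.workflow.step.Failure;
import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult;
import jakarta.json.Json;
import jakarta.json.JsonObjectBuilder;

// Hypothetical archiver; the @RequiredPermissions annotation carried by the real
// archiver commands is omitted here for brevity.
public class ExampleSubmitToArchiveCommand extends AbstractSubmitToArchiveCommand {

    private static final Logger logger = Logger.getLogger(ExampleSubmitToArchiveCommand.class.getName());

    // A made-up custom setting; it would have to be listed in :ArchiverSettings.
    private static final String EXAMPLE_TARGET = ":ExampleArchiverTarget";

    public ExampleSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) {
        super(aRequest, version);
    }

    @Override
    public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token) {
        // requestedSettings is populated by execute() or by runArchivingProcess(), which
        // also performs the earlier-versions-archived check before this method is reached.
        String target = requestedSettings.get(EXAMPLE_TARGET);
        if (target == null) {
            return new Failure("Example archiver not configured",
                    EXAMPLE_TARGET + " must be defined and listed in :ArchiverSettings");
        }
        logger.fine("Would transfer the bag for version " + dv.getVersionNumber() + "."
                + dv.getMinorVersionNumber() + " to " + target);
        // ... generate the bag (e.g. via startBagThread) and copy it to the target ...

        // Record success using the same archival status JSON structure the
        // prerequisite check uses to record failure.
        JsonObjectBuilder statusObject = Json.createObjectBuilder();
        statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS);
        statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, target);
        dv.setArchivalCopyLocation(statusObject.build().toString());
        return WorkflowStepResult.OK;
    }
}

Such a class would be enabled like the existing archivers: point :ArchiverClassName at it and list :ExampleArchiverTarget (plus any shared settings such as :BagGeneratorThreads or :ArchiveOnlyIfEarlierVersionsAreArchived) in :ArchiverSettings.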
From ca0af05c7f80fc28114ebde71b460917199d2a60 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 29 Jan 2026 18:15:55 -0500 Subject: [PATCH 08/14] simplify --- .../impl/AbstractSubmitToArchiveCommand.java | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java index 72f45ab5d2b..8949f346567 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractSubmitToArchiveCommand.java @@ -97,36 +97,31 @@ public WorkflowStepResult runArchivingProcess(DatasetVersion version, ApiToken t Dataset dataset = version.getDataset(); List versions = dataset.getVersions(); - // Check all earlier versions (those with version numbers less than current) - for (DatasetVersion earlierVersion : versions) { - // Skip the current version and any versions that come after it - if (earlierVersion.getId().equals(version.getId())) { - continue; - } - - // Compare version numbers to ensure we only check earlier versions - if (earlierVersion.getVersionNumber() != null && version.getVersionNumber() != null) { - if (earlierVersion.getVersionNumber() < version.getVersionNumber() - || (earlierVersion.getVersionNumber().equals(version.getVersionNumber()) - && earlierVersion.getMinorVersionNumber() < version.getMinorVersionNumber())) { + boolean foundCurrent = false; - // Check if this earlier version has been successfully archived - String archivalStatus = earlierVersion.getArchivalCopyLocationStatus(); - if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) + // versions are ordered, all versions after the current one have lower + // major/minor version numbers + for (DatasetVersion versionInLoop : versions) { + if (foundCurrent) { + // Once foundCurrent is true, we are looking at prior versions + // Check if this earlier version has been successfully archived + String archivalStatus = versionInLoop.getArchivalCopyLocationStatus(); + if (archivalStatus == null || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_SUCCESS) // || !archivalStatus.equals(DatasetVersion.ARCHIVAL_STATUS_OBSOLETE) - ) { - JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); - statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); - statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, - "Successful archiving of earlier versions is required."); - version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); - return new Failure( - "Earlier versions must be successfully archived first", - "Archival prerequisites not met" - ); - } + ) { + JsonObjectBuilder statusObjectBuilder = Json.createObjectBuilder(); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); + statusObjectBuilder.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + "Successful archiving of earlier versions is required."); + version.setArchivalCopyLocation(statusObjectBuilder.build().toString()); + return new Failure("Earlier versions must be successfully archived first", + "Archival prerequisites not met"); } } + if (versionInLoop.equals(version)) { + foundCurrent = true; + } + } } // Delegate to the archiver-specific implementation From 1808d2db3fb2ab55c08e25dd6e7f03dc5ed414ed Mon Sep 17 
00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 11:17:15 -0500 Subject: [PATCH 09/14] basic fetch --- .../iq/dataverse/settings/JvmSettings.java | 4 + .../iq/dataverse/util/bagit/BagGenerator.java | 93 ++++++++++++++++--- 2 files changed, 86 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 05390ba8a8c..b32b7a8d77d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -276,6 +276,10 @@ public enum JvmSettings { BAGIT_SOURCE_ORG_NAME(SCOPE_BAGIT_SOURCEORG, "name"), BAGIT_SOURCEORG_ADDRESS(SCOPE_BAGIT_SOURCEORG, "address"), BAGIT_SOURCEORG_EMAIL(SCOPE_BAGIT_SOURCEORG, "email"), + SCOPE_BAGIT_HOLEY(SCOPE_BAGIT, "holey"), + BAGIT_HOLEY_MAX_FILE_SIZE(SCOPE_BAGIT_HOLEY, "max-file-size"), + BAGIT_HOLEY_MAX_DATA_SIZE(SCOPE_BAGIT_HOLEY, "max-data-size"), + // STORAGE USE SETTINGS SCOPE_STORAGEUSE(PREFIX, "storageuse"), diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 3035694ae3d..6d096704a58 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -126,6 +126,13 @@ public class BagGenerator { private OREMap oremap; static PrintWriter pw = null; + + //Holey Bags + private long maxDataFileSize = Long.MAX_VALUE; + private long maxTotalDataSize = Long.MAX_VALUE; + private long currentBagDataSize = 0; + private StringBuilder fetchFileContent = new StringBuilder(); + private boolean usingFetchFile = false; /** * This BagGenerator creates a BagIt version 1.0 @@ -176,6 +183,13 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio logger.warning("Aint gonna work"); e.printStackTrace(); } + initializeHoleyBagLimits(); + } + + private void initializeHoleyBagLimits() { + this.maxDataFileSize = JvmSettings.BAGIT_HOLEY_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.maxTotalDataSize = JvmSettings.BAGIT_HOLEY_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + ", maxTotalDataSize: " + maxTotalDataSize); } public void setIgnoreHashes(boolean val) { @@ -312,6 +326,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception { logger.fine("Creating bag: " + bagName); + writeFetchFile(); + ZipArchiveOutputStream zipArchiveOutputStream = new ZipArchiveOutputStream(outputStream); /* @@ -479,7 +495,8 @@ public static String getValidName(String bagName) { return bagName.replaceAll("\\W", "-"); } - private void processContainer(JsonObject item, String currentPath) throws IOException { + private void processContainer(JsonObject item, String currentPath) + throws IOException, ExecutionException, InterruptedException { JsonArray children = getChildren(item); HashSet titles = new HashSet(); String title = null; @@ -529,7 +546,6 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } else { resourceUsed[index] = true; // add item - // ToDo String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); logger.fine("File url: " + dataUrl); String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); @@ -545,13 +561,22 @@ private void 
processContainer(JsonObject item, String currentPath) throws IOExce childPath=currentPath + directoryLabel.getAsString() + "/" + childTitle; } + // Get file size + Long fileSize = null; + if (child.has(JsonLDTerm.filesize.getLabel())) { + fileSize = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); + } + if(fileSize == null) { + logger.severe("File size missing for " + childPath); + throw new IOException("Unable to create bag due to missing file size"); + } String childHash = null; if (child.has(JsonLDTerm.checksum.getLabel())) { ChecksumType childHashType = ChecksumType.fromString( child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); if (hashtype == null) { - //If one wasn't set as a default, pick up what the first child with one uses + //If one wasn't set as a default, pick up what the first child with one uses hashtype = childHashType; } if (hashtype != null && !hashtype.equals(childHashType)) { @@ -574,7 +599,7 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } try { if ((childHash == null) | ignorehashes) { - // Generate missing hashInputStream inputStream = null; + // Generate missing hash InputStream inputStream = null; try { inputStream = getInputStreamSupplier(dataUrl).get(); @@ -608,17 +633,30 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } } - logger.fine("Requesting: " + childPath + " from " + dataUrl); - createFileFromURL(childPath, dataUrl); + + // Add file to bag or fetch file + if (shouldAddToFetchFile(fileSize)) { + // Add to fetch file instead of including in bag + logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl); + addToFetchFile(dataUrl, fileSize, childPath); + usingFetchFile = true; + } else { + // Add file to bag as before + logger.fine("Requesting: " + childPath + " from " + dataUrl); + createFileFromURL(childPath, dataUrl); + if (fileSize != null) { + currentBagDataSize += fileSize; + } + } + dataCount++; if (dataCount % 1000 == 0) { logger.info("Retrieval in progress: " + dataCount + " files retrieved"); } - if (child.has(JsonLDTerm.filesize.getLabel())) { - Long size = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); - totalDataSize += size; - if (size > maxFileSize) { - maxFileSize = size; + if (fileSize != null) { + totalDataSize += fileSize; + if (fileSize > maxFileSize) { + maxFileSize = fileSize; } } if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { @@ -638,6 +676,39 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce } } + // Helper method to determine if file should go to fetch file + private boolean shouldAddToFetchFile(long fileSize) { + + // Check individual file size limit + if (fileSize > maxDataFileSize) { + logger.fine("File size " + fileSize + " exceeds max data file size " + maxDataFileSize); + return true; + } + + // Check total bag size limit + if (currentBagDataSize + fileSize > maxTotalDataSize) { + logger.fine("Adding file would exceed max total data size. 
Current: " + currentBagDataSize + + ", File: " + fileSize + ", Max: " + maxTotalDataSize); + return true; + } + + return false; + } + + // Method to append to fetch file content + private void addToFetchFile(String url, long size, String filename) { + // Format: URL size filename + fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\n"); + } + + // Method to write fetch file to bag (call this before finalizing the bag) + private void writeFetchFile() throws IOException, ExecutionException, InterruptedException { + if (usingFetchFile && fetchFileContent.length() > 0) { + logger.info("Creating fetch.txt file for holey bag"); + createFileFromString("fetch.txt", fetchFileContent.toString()); + } + } + private int getUnusedIndexOf(String childId) { int index = resourceIndex.indexOf(childId); if (resourceUsed[index] != null) { From 366eccd486d9d4fd20a33effe8741f84e87cef4a Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 13:52:18 -0500 Subject: [PATCH 10/14] order by file size --- .../iq/dataverse/util/bagit/BagGenerator.java | 299 ++++++++++-------- 1 file changed, 162 insertions(+), 137 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 6d096704a58..a168f1ea5d9 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -20,10 +20,11 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; +import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.Map.Entry; @@ -35,7 +36,6 @@ import java.util.logging.Logger; import java.util.zip.ZipEntry; -import edu.harvard.iq.dataverse.util.BundleUtil; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.compress.archivers.zip.ParallelScatterZipCreator; import org.apache.commons.compress.archivers.zip.ScatterZipOutputStream; @@ -77,7 +77,6 @@ import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; import edu.harvard.iq.dataverse.util.json.JsonLDTerm; -import java.util.Optional; public class BagGenerator { @@ -254,7 +253,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception { resourceUsed = new Boolean[aggregates.size() + 1]; // Process current container (the aggregation itself) and its // children - processContainer(aggregation, currentPath); + // Recursively collect all files from the entire tree, start with an empty set of processedContainers + List allFiles = new ArrayList<>(); + collectAllFiles(aggregation, currentPath, allFiles); + + // Sort files by size (smallest first) + Collections.sort(allFiles); + + // Process all files in sorted order + processAllFiles(allFiles); } // Create manifest files // pid-mapping.txt - a DataOne recommendation to connect ids and @@ -495,27 +502,29 @@ public static String getValidName(String bagName) { return bagName.replaceAll("\\W", "-"); } - private void processContainer(JsonObject item, String currentPath) + // Collect all files recursively and process containers to create dirs in the zip + private void collectAllFiles(JsonObject item, String currentPath, List allFiles) throws IOException, 
ExecutionException, InterruptedException { JsonArray children = getChildren(item); - HashSet titles = new HashSet(); String title = null; if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { title = item.get("Title").getAsString(); } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); } - logger.fine("Adding " + title + "/ to path " + currentPath); + logger.fine("Collecting files from " + title + "/ at path " + currentPath); currentPath = currentPath + title + "/"; + + // Mark this container as processed + String containerId = item.get("@id").getAsString(); + + // Create directory and update tracking for this container int containerIndex = -1; try { createDir(currentPath); - // Add containers to pid map and mark as 'used', but no sha1 hash - // value - containerIndex = getUnusedIndexOf(item.get("@id").getAsString()); + containerIndex = getUnusedIndexOf(containerId); resourceUsed[containerIndex] = true; - pidMap.put(item.get("@id").getAsString(), currentPath); - + pidMap.put(containerId, currentPath); } catch (InterruptedException | IOException | ExecutionException e) { e.printStackTrace(); logger.severe(e.getMessage()); @@ -523,159 +532,156 @@ private void processContainer(JsonObject item, String currentPath) resourceUsed[containerIndex] = false; } throw new IOException("Unable to create bag"); - } - for (int i = 0; i < children.size(); i++) { - // Find the ith child in the overall array of aggregated - // resources + for (int i = 0; i < children.size(); i++) { String childId = children.get(i).getAsString(); - logger.fine("Processing: " + childId); + logger.fine("Examining: " + childId); int index = getUnusedIndexOf(childId); - if (resourceUsed[index] != null) { - System.out.println("Warning: reusing resource " + index); - } - // Aggregation is at index 0, so need to shift by 1 for aggregates - // entries JsonObject child = aggregates.get(index - 1).getAsJsonObject(); if (childIsContainer(child)) { - // create dir and process children - // processContainer will mark this item as used - processContainer(child, currentPath); + // Recursively collect files from this container + collectAllFiles(child, currentPath, allFiles); } else { - resourceUsed[index] = true; - // add item - String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); - logger.fine("File url: " + dataUrl); - String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); - if (titles.contains(childTitle)) { - logger.warning("**** Multiple items with the same title in: " + currentPath); - logger.warning("**** Will cause failure in hash and size validation in: " + bagID); - } else { - titles.add(childTitle); - } - String childPath = currentPath + childTitle; - JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if(directoryLabel!=null) { - childPath=currentPath + directoryLabel.getAsString() + "/" + childTitle; - } - // Get file size Long fileSize = null; if (child.has(JsonLDTerm.filesize.getLabel())) { fileSize = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); } - if(fileSize == null) { - logger.severe("File size missing for " + childPath); + if (fileSize == null) { + logger.severe("File size missing for child: " + childId); throw new IOException("Unable to create bag due to missing file size"); } - String childHash = null; - if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromString( - 
child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); - if (hashtype == null) { - //If one wasn't set as a default, pick up what the first child with one uses - hashtype = childHashType; - } - if (hashtype != null && !hashtype.equals(childHashType)) { - logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() - + " hashes for " + childTitle); - } else { - childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); - if (checksumMap.containsValue(childHash)) { - // Something else has this hash - logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: " - + childHash + " in: " + bagID); - } - logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); - checksumMap.put(childPath, childHash); - } + // Store minimal info for sorting - JsonObject is just a reference + allFiles.add(new FileEntry(fileSize, child, currentPath, index)); + } + } + } + + + // Process all files in sorted order + private void processAllFiles(List sortedFiles) + throws IOException, ExecutionException, InterruptedException { + + if ((hashtype == null) | ignorehashes) { + hashtype = DataFile.ChecksumType.SHA512; + } + + for (FileEntry entry : sortedFiles) { + // Extract all needed information from the JsonObject reference + JsonObject child = entry.jsonObject; + String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); + String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + + // Build full path using stored currentPath + String childPath = entry.currentPath + childTitle; + JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); + if (directoryLabel != null) { + childPath = entry.currentPath + directoryLabel.getAsString() + "/" + childTitle; + } + + // Get hash if exists + String childHash = null; + if (child.has(JsonLDTerm.checksum.getLabel())) { + ChecksumType childHashType = ChecksumType.fromString( + child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + if (hashtype == null) { + hashtype = childHashType; } - if ((hashtype == null) | ignorehashes) { - // Pick sha512 when ignoring hashes or none exist - hashtype = DataFile.ChecksumType.SHA512; + if (hashtype != null && !hashtype.equals(childHashType)) { + logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() + + " hashes for " + childTitle); + } else { + childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); } - try { - if ((childHash == null) | ignorehashes) { - // Generate missing hash - InputStream inputStream = null; - try { - inputStream = getInputStreamSupplier(dataUrl).get(); - - if (hashtype != null) { - if (hashtype.equals(DataFile.ChecksumType.SHA1)) { - childHash = DigestUtils.sha1Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA256)) { - childHash = DigestUtils.sha256Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { - childHash = DigestUtils.sha512Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { - childHash = DigestUtils.md5Hex(inputStream); - } + } + + resourceUsed[entry.resourceIndex] = true; + + try { + if ((childHash == null) | ignorehashes) { + // Generate missing hash + InputStream inputStream = null; + try { + inputStream = getInputStreamSupplier(dataUrl).get(); + + if (hashtype != null) { + if 
(hashtype.equals(DataFile.ChecksumType.SHA1)) { + childHash = DigestUtils.sha1Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.SHA256)) { + childHash = DigestUtils.sha256Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { + childHash = DigestUtils.sha512Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { + childHash = DigestUtils.md5Hex(inputStream); } - - } catch (IOException e) { - logger.severe("Failed to read " + childPath); - throw e; - } finally { - IOUtils.closeQuietly(inputStream); - } - if (childHash != null) { - JsonObject childHashObject = new JsonObject(); - childHashObject.addProperty("@type", hashtype.toString()); - childHashObject.addProperty("@value", childHash); - child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); - - checksumMap.put(childPath, childHash); - } else { - logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } + + } catch (IOException e) { + logger.severe("Failed to read " + childPath); + throw e; + } finally { + IOUtils.closeQuietly(inputStream); } - - // Add file to bag or fetch file - if (shouldAddToFetchFile(fileSize)) { - // Add to fetch file instead of including in bag - logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl); - addToFetchFile(dataUrl, fileSize, childPath); - usingFetchFile = true; + if (childHash != null) { + JsonObject childHashObject = new JsonObject(); + childHashObject.addProperty("@type", hashtype.toString()); + childHashObject.addProperty("@value", childHash); + child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); + + checksumMap.put(childPath, childHash); } else { - // Add file to bag as before - logger.fine("Requesting: " + childPath + " from " + dataUrl); - createFileFromURL(childPath, dataUrl); - if (fileSize != null) { - currentBagDataSize += fileSize; - } + logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } - - dataCount++; - if (dataCount % 1000 == 0) { - logger.info("Retrieval in progress: " + dataCount + " files retrieved"); - } - if (fileSize != null) { - totalDataSize += fileSize; - if (fileSize > maxFileSize) { - maxFileSize = fileSize; - } - } - if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { - mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); + } else { + // Hash already exists, add to checksumMap + if (checksumMap.containsValue(childHash)) { + logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + + " has hash: " + childHash + " in: " + bagID); } - - } catch (Exception e) { - resourceUsed[index] = false; - e.printStackTrace(); - throw new IOException("Unable to create bag"); + logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); + checksumMap.put(childPath, childHash); + } + + // Add file to bag or fetch file + if (shouldAddToFetchFile(entry.size)) { + logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + addToFetchFile(dataUrl, entry.size, childPath); + usingFetchFile = true; + } else { + logger.fine("Requesting: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + createFileFromURL(childPath, dataUrl); + currentBagDataSize += entry.size; + } + + dataCount++; + if (dataCount % 1000 == 0) { + logger.info("Retrieval in progress: " + dataCount + " files retrieved"); + } + + totalDataSize += entry.size; + if (entry.size > maxFileSize) { + maxFileSize = 
entry.size; + } + + if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { + mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); } - // Check for nulls! - pidMap.put(child.get("@id").getAsString(), childPath); - + } catch (Exception e) { + resourceUsed[entry.resourceIndex] = false; + e.printStackTrace(); + throw new IOException("Unable to create bag"); } + + pidMap.put(child.get("@id").getAsString(), childPath); } } - + // Helper method to determine if file should go to fetch file private boolean shouldAddToFetchFile(long fileSize) { @@ -1199,5 +1205,24 @@ public static void setNumConnections(int numConnections) { BagGenerator.numConnections = numConnections; logger.fine("All BagGenerators will use " + numConnections + " threads"); } - + + // Inner class to hold file information before processing + private static class FileEntry implements Comparable { + final long size; + final JsonObject jsonObject; // Direct reference, not a copy + final String currentPath; // Parent directory path + final int resourceIndex; // Still need this for resourceUsed tracking + + FileEntry(long size, JsonObject jsonObject, String currentPath, int resourceIndex) { + this.size = size; + this.jsonObject = jsonObject; + this.currentPath = currentPath; + this.resourceIndex = resourceIndex; + } + + @Override + public int compareTo(FileEntry other) { + return Long.compare(this.size, other.size); + } + } } \ No newline at end of file From eec333b42a204bf1cf953a556665a9638700ed95 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 15:10:28 -0500 Subject: [PATCH 11/14] only add subcollection folders (if they exist) --- .../iq/dataverse/util/bagit/BagGenerator.java | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index a168f1ea5d9..0c8c477918e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -255,7 +255,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { // children // Recursively collect all files from the entire tree, start with an empty set of processedContainers List allFiles = new ArrayList<>(); - collectAllFiles(aggregation, currentPath, allFiles); + collectAllFiles(aggregation, currentPath, allFiles, false); // Sort files by size (smallest first) Collections.sort(allFiles); @@ -503,18 +503,19 @@ public static String getValidName(String bagName) { } // Collect all files recursively and process containers to create dirs in the zip - private void collectAllFiles(JsonObject item, String currentPath, List allFiles) + private void collectAllFiles(JsonObject item, String currentPath, List allFiles, boolean addTitle) throws IOException, ExecutionException, InterruptedException { JsonArray children = getChildren(item); - String title = null; - if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { - title = item.get("Title").getAsString(); - } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { - title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + if (addTitle) { + String title = null; + if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { + title = item.get("Title").getAsString(); + } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { + title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + 
logger.fine("Collecting files from " + title + "/ at path " + currentPath); + currentPath = currentPath + title + "/"; } - logger.fine("Collecting files from " + title + "/ at path " + currentPath); - currentPath = currentPath + title + "/"; - // Mark this container as processed String containerId = item.get("@id").getAsString(); @@ -540,9 +541,10 @@ private void collectAllFiles(JsonObject item, String currentPath, List Date: Fri, 30 Jan 2026 15:22:28 -0500 Subject: [PATCH 12/14] replace deprecated constructs --- .../iq/dataverse/util/bagit/BagGenerator.java | 110 ++++++++---------- 1 file changed, 51 insertions(+), 59 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index 0c8c477918e..f122346e2fd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -43,6 +43,7 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveEntryRequest; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; +import org.apache.commons.compress.archivers.zip.ZipFile.Builder; import org.apache.commons.compress.parallel.InputStreamSupplier; import org.apache.commons.compress.utils.IOUtils; import org.apache.commons.text.WordUtils; @@ -381,7 +382,6 @@ public boolean generateBag(OutputStream outputStream) throws Exception { public boolean generateBag(String bagName, boolean temp) { usetemp = temp; - FileOutputStream bagFileOS = null; try { File origBagFile = getBagFile(bagName); File bagFile = origBagFile; @@ -390,82 +390,78 @@ public boolean generateBag(String bagName, boolean temp) { logger.fine("Writing to: " + bagFile.getAbsolutePath()); } // Create an output stream backed by the file - bagFileOS = new FileOutputStream(bagFile); - if (generateBag(bagFileOS)) { - //The generateBag call sets this.bagName to the correct value - validateBagFile(bagFile); - if (usetemp) { - logger.fine("Moving tmp zip"); - origBagFile.delete(); - bagFile.renameTo(origBagFile); + try (FileOutputStream bagFileOS = new FileOutputStream(bagFile)) { + if (generateBag(bagFileOS)) { + // The generateBag call sets this.bagName to the correct value + validateBagFile(bagFile); + if (usetemp) { + logger.fine("Moving tmp zip"); + origBagFile.delete(); + bagFile.renameTo(origBagFile); + } + return true; + } else { + return false; } - return true; - } else { - return false; } } catch (Exception e) { logger.log(Level.SEVERE,"Bag Exception: ", e); e.printStackTrace(); logger.warning("Failure: Processing failure during Bagit file creation"); return false; - } finally { - IOUtils.closeQuietly(bagFileOS); } } public void validateBag(String bagId) { logger.info("Validating Bag"); - ZipFile zf = null; - InputStream is = null; try { File bagFile = getBagFile(bagId); - zf = new ZipFile(bagFile); - ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); - if (entry != null) { - logger.info("SHA1 hashes used"); - hashtype = DataFile.ChecksumType.SHA1; - } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { + ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); if (entry != null) { - logger.info("SHA512 hashes used"); - hashtype = DataFile.ChecksumType.SHA512; + logger.info("SHA1 hashes used"); + hashtype = 
DataFile.ChecksumType.SHA1; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); if (entry != null) { - logger.info("SHA256 hashes used"); - hashtype = DataFile.ChecksumType.SHA256; + logger.info("SHA512 hashes used"); + hashtype = DataFile.ChecksumType.SHA512; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); if (entry != null) { - logger.info("MD5 hashes used"); - hashtype = DataFile.ChecksumType.MD5; + logger.info("SHA256 hashes used"); + hashtype = DataFile.ChecksumType.SHA256; + } else { + entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + if (entry != null) { + logger.info("MD5 hashes used"); + hashtype = DataFile.ChecksumType.MD5; + } } } } + if (entry == null) + throw new IOException("No manifest file found"); + try (InputStream is = zf.getInputStream(entry)) { + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String line = br.readLine(); + while (line != null) { + logger.fine("Hash entry: " + line); + int breakIndex = line.indexOf(' '); + String hash = line.substring(0, breakIndex); + String path = line.substring(breakIndex + 1); + logger.fine("Adding: " + path + " with hash: " + hash); + checksumMap.put(path, hash); + line = br.readLine(); + } + } } - if (entry == null) - throw new IOException("No manifest file found"); - is = zf.getInputStream(entry); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - String line = br.readLine(); - while (line != null) { - logger.fine("Hash entry: " + line); - int breakIndex = line.indexOf(' '); - String hash = line.substring(0, breakIndex); - String path = line.substring(breakIndex + 1); - logger.fine("Adding: " + path + " with hash: " + hash); - checksumMap.put(path, hash); - line = br.readLine(); - } - IOUtils.closeQuietly(is); logger.info("HashMap Map contains: " + checksumMap.size() + " entries"); checkFiles(checksumMap, bagFile); } catch (IOException io) { logger.log(Level.SEVERE,"Could not validate Hashes", io); } catch (Exception e) { logger.log(Level.SEVERE,"Could not validate Hashes", e); - } finally { - IOUtils.closeQuietly(zf); } return; } @@ -605,10 +601,8 @@ private void processAllFiles(List sortedFiles) try { if ((childHash == null) | ignorehashes) { // Generate missing hash - InputStream inputStream = null; - try { - inputStream = getInputStreamSupplier(dataUrl).get(); - + + try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()){ if (hashtype != null) { if (hashtype.equals(DataFile.ChecksumType.SHA1)) { childHash = DigestUtils.sha1Hex(inputStream); @@ -624,8 +618,6 @@ private void processAllFiles(List sortedFiles) } catch (IOException e) { logger.severe("Failed to read " + childPath); throw e; - } finally { - IOUtils.closeQuietly(inputStream); } if (childHash != null) { JsonObject childHashObject = new JsonObject(); @@ -782,11 +774,13 @@ private void createFileFromURL(final String relPath, final String uri) addEntry(archiveEntry, supp); } + @SuppressWarnings("deprecation") private void checkFiles(HashMap shaMap, File bagFile) { ExecutorService executor = Executors.newFixedThreadPool(numConnections); - ZipFile zf = null; - try { - zf = new ZipFile(bagFile); + + try (ZipFile zf = ZipFile.builder() + .setFile(bagFile) + .get() ){ BagValidationJob.setZipFile(zf); BagValidationJob.setBagGenerator(this); @@ -813,8 +807,6 @@ private void checkFiles(HashMap shaMap, File bagFile) { } 
catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); - } finally { - IOUtils.closeQuietly(zf); } logger.fine("Hash Validations Completed"); From b746d5db75c85c9c71ce9bd440d237df6a3456be Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 16:35:36 -0500 Subject: [PATCH 13/14] restore name collision check --- .../iq/dataverse/util/bagit/BagGenerator.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f122346e2fd..a488499b8fe 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -22,6 +22,7 @@ import java.util.Calendar; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; @@ -133,7 +134,7 @@ public class BagGenerator { private long currentBagDataSize = 0; private StringBuilder fetchFileContent = new StringBuilder(); private boolean usingFetchFile = false; - + /** * This BagGenerator creates a BagIt version 1.0 * (https://tools.ietf.org/html/draft-kunze-bagit-16) compliant bag that is also @@ -563,6 +564,9 @@ private void collectAllFiles(JsonObject item, String currentPath, List sortedFiles) throws IOException, ExecutionException, InterruptedException { + // Track titles to detect duplicates + Set titles = new HashSet<>(); + if ((hashtype == null) | ignorehashes) { hashtype = DataFile.ChecksumType.SHA512; } @@ -573,6 +577,14 @@ private void processAllFiles(List sortedFiles) String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + // Check for duplicate titles + if (titles.contains(childTitle)) { + logger.warning("**** Multiple items with the same title in: " + entry.currentPath); + logger.warning("**** Will cause failure in hash and size validation in: " + bagID); + } else { + titles.add(childTitle); + } + // Build full path using stored currentPath String childPath = entry.currentPath + childTitle; JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); From 88edc8aefe591bcc593dde8bdda27e8f89d26f6d Mon Sep 17 00:00:00 2001 From: qqmyers Date: Fri, 30 Jan 2026 16:35:53 -0500 Subject: [PATCH 14/14] add null check to quiet log/avoid exception --- .../harvard/iq/dataverse/util/bagit/OREMap.java | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index 4cbc2aa7b9a..dd651885d01 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -505,11 +505,16 @@ private static void addCvocValue(String val, JsonArrayBuilder vals, JsonObject c for (String prefix : context.keySet()) { localContext.putIfAbsent(prefix, context.getString(prefix)); } - JsonObjectBuilder job = Json.createObjectBuilder(datasetFieldService.getExternalVocabularyValue(val)); - job.add("@id", val); - JsonObject extVal = job.build(); - logger.fine("Adding: " + extVal); - vals.add(extVal); + JsonObject cachedValue = datasetFieldService.getExternalVocabularyValue(val); + if (cachedValue != null) { + JsonObjectBuilder job = 
Json.createObjectBuilder(cachedValue); + job.add("@id", val); + JsonObject extVal = job.build(); + logger.fine("Adding: " + extVal); + vals.add(extVal); + } else { + vals.add(val); + } } else { vals.add(val); }
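
Editor's note: the standalone sketch below illustrates the size-ordered, "holey bag" behavior introduced in the BagGenerator changes above, where files are collected, sorted smallest-first, and any file that would push the payload past the configured cap is recorded as a BagIt fetch.txt line instead of being stored in the zip. The FileEntry comparison, the maxTotalDataSize cap, and the "URL size filename" fetch.txt format follow the patch; the class itself, its main method, the threshold check, and the sample values are hypothetical simplifications and are not part of the Dataverse code.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Simplified, standalone sketch (not the Dataverse class): demonstrates the
// smallest-first ordering and the fetch.txt overflow rule used by the patched
// BagGenerator. Values and the threshold test are illustrative only.
public class HoleyBagSketch {

    // Minimal stand-in for the FileEntry inner class added in the patch.
    static class FileEntry implements Comparable<FileEntry> {
        final long size;
        final String url;
        final String path;

        FileEntry(long size, String url, String path) {
            this.size = size;
            this.url = url;
            this.path = path;
        }

        @Override
        public int compareTo(FileEntry other) {
            return Long.compare(this.size, other.size);
        }
    }

    public static void main(String[] args) {
        long maxTotalDataSize = 100L;   // hypothetical cap on payload bytes kept inside the bag
        long currentBagDataSize = 0L;

        List<FileEntry> allFiles = new ArrayList<>();
        allFiles.add(new FileEntry(80, "https://example.org/big.bin", "data/big.bin"));
        allFiles.add(new FileEntry(10, "https://example.org/small.txt", "data/small.txt"));
        allFiles.add(new FileEntry(40, "https://example.org/medium.csv", "data/medium.csv"));

        // Sort smallest first so as many files as possible fit under the cap.
        Collections.sort(allFiles);

        StringBuilder fetchFileContent = new StringBuilder();
        for (FileEntry entry : allFiles) {
            if (currentBagDataSize + entry.size > maxTotalDataSize) {
                // Too large to include: record a fetch.txt line ("URL size filename").
                fetchFileContent.append(entry.url).append(" ")
                        .append(entry.size).append(" ").append(entry.path).append("\n");
            } else {
                // Would be written into the zip by createFileFromURL() in the real class.
                currentBagDataSize += entry.size;
                System.out.println("Stored in bag: " + entry.path);
            }
        }
        System.out.print("fetch.txt contents:\n" + fetchFileContent);
    }
}

Sorting smallest-first appears intended to keep as many files as possible physically inside the bag before the cap is reached, deferring only the largest files to fetch.txt.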
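
Editor's note: the "replace deprecated constructs" commit switches BagGenerator from the deprecated ZipFile constructor and IOUtils.closeQuietly() to ZipFile.builder() and try-with-resources. The sketch below shows that pattern in isolation, assuming a commons-compress version that provides ZipFile.Builder (as the patch itself uses); the bag path and manifest entry name are example values only.

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;

// Standalone sketch of the resource-handling pattern adopted in the patch:
// ZipFile.builder() instead of the deprecated constructor, and
// try-with-resources instead of finally { IOUtils.closeQuietly(...) }.
public class ZipReadSketch {
    public static void main(String[] args) throws IOException {
        File bagFile = new File("example-bag.zip");            // hypothetical path
        String entryName = "example-bag/manifest-sha512.txt";  // hypothetical entry

        try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) {
            ZipArchiveEntry entry = zf.getEntry(entryName);
            if (entry == null) {
                throw new IOException("No manifest file found");
            }
            try (InputStream is = zf.getInputStream(entry);
                 BufferedReader br = new BufferedReader(
                         new InputStreamReader(is, StandardCharsets.UTF_8))) {
                String line;
                while ((line = br.readLine()) != null) {
                    System.out.println("Hash entry: " + line);
                }
            }
        } // the ZipFile and the manifest stream are closed here on all paths
    }
}

Try-with-resources guarantees closure even when an exception is thrown, which is what the removed finally blocks with closeQuietly() were approximating.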