diff --git a/doc/release-notes/12063-ORE-and-Bag-updates.md b/doc/release-notes/12063-ORE-and-Bag-updates.md
new file mode 100644
index 00000000000..bbc22b22182
--- /dev/null
+++ b/doc/release-notes/12063-ORE-and-Bag-updates.md
@@ -0,0 +1,15 @@
+This release contains multiple updates to the OAI-ORE metadata export and archival Bag output:
+
+OAI-ORE
+- now uses URIs for checksum algorithms
+- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API) has been fixed
+- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the output has changed
+
+Archival Bag
+- for dataset versions with no files, the (empty) manifest-<algorithm>.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5"
+- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed
+- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address)
+- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping)
+- a new key, "Dataverse-Bag-Version", has been added to bag-info.txt with a value of "1.0", allowing tracking of changes to Dataverse's archival bag generation
+- improvements to file retrieval w.r.t. retries on errors or throttling
+- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse
\ No newline at end of file
diff --git a/doc/release-notes/12144-un-holey-bags.md b/doc/release-notes/12144-un-holey-bags.md
new file mode 100644
index 00000000000..3c9c632eb6c
--- /dev/null
+++ b/doc/release-notes/12144-un-holey-bags.md
@@ -0,0 +1,21 @@
+This release contains multiple updates to the OAI-ORE metadata export and archival Bag output:
+
+OAI-ORE
+- now uses URIs for checksum algorithms
+- a bug causing failures with deaccessioned versions when the deaccession note ("Deaccession Reason" in the UI) was null (which has been allowed via the API) has been fixed
+- the "https://schema.org/additionalType" is updated to "Dataverse OREMap Format v1.0.2" to indicate that the output has changed
+
+Archival Bag
+- for dataset versions with no files, the (empty) manifest-<algorithm>.txt file created will now use the default algorithm defined by the "FileFixityChecksumAlgorithm" setting rather than always defaulting to "md5"
+- a bug causing the bag-info.txt to not have information on contacts when the dataset version has more than one contact has been fixed
+- values used in the bag-info.txt file that may be multi-line (with embedded CR or LF characters) are now properly indented/formatted per the BagIt specification (i.e. Internal-Sender-Identifier, External-Description, Source-Organization, Organization-Address)
+- the name of the dataset is no longer used as a subdirectory under the data directory (dataset names can be long enough to cause failures when unzipping)
+- a new key, "Dataverse-Bag-Version", has been added to bag-info.txt with a value of "1.0", allowing tracking of changes to Dataverse's archival bag generation
+- improvements to file retrieval w.r.t. retries on errors or throttling
+- retrieval of files for inclusion in the bag is no longer counted as a download by Dataverse
+- the size of data files and the total dataset size that will be included in an archival bag can now be limited. Admins can choose whether files above these limits are transferred along with the zipped bag (creating a complete archival copy) or are just referenced (using the concept of a "holey" bag, which lists the oversized files and the Dataverse URLs from which they can be retrieved). In the holey bag case, an active service on the archiving platform must retrieve the oversized files (using appropriate credentials as needed) to make a complete copy
+
+### New JVM Options (MicroProfile Config Settings)
+dataverse.bagit.zip.holey
+dataverse.bagit.zip.max-data-size
+dataverse.bagit.zip.max-file-size
\ No newline at end of file
diff --git a/doc/sphinx-guides/source/admin/big-data-administration.rst b/doc/sphinx-guides/source/admin/big-data-administration.rst
index c4a98a6987a..c1d2a02c4a2 100644
--- a/doc/sphinx-guides/source/admin/big-data-administration.rst
+++ b/doc/sphinx-guides/source/admin/big-data-administration.rst
@@ -302,6 +302,7 @@ There are a broad range of options (that are not turned on by default) for impro
- :ref:`:DisableSolrFacetsWithoutJsession` - disables facets for users who have disabled cookies (e.g. for bots)
- :ref:`:DisableUncheckedTypesFacet` - only disables the facet showing the number of collections, datasets, files matching the query (this facet is potentially less useful than others)
- :ref:`:StoreIngestedTabularFilesWithVarHeaders` - by default, Dataverse stores ingested files without headers and dynamically adds them back at download time. Once this setting is enabled, Dataverse will leave the headers in place (for newly ingested files), reducing the cost of downloads
+- :ref:`dataverse.bagit.zip.max-file-size`, :ref:`dataverse.bagit.zip.max-data-size`, and :ref:`dataverse.bagit.zip.holey` - options to control the size and temporary storage requirements when generating archival Bags - see :ref:`BagIt Export`

Scaling Infrastructure
diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst
index a9d5c7c0041..fff7a747063 100644
--- a/doc/sphinx-guides/source/installation/config.rst
+++ b/doc/sphinx-guides/source/installation/config.rst
@@ -2259,6 +2259,8 @@ These archival Bags include all of the files and metadata in a given dataset ver
The Dataverse Software offers an internal archive workflow which may be configured as a PostPublication workflow via an admin API call to manually submit previously published Datasets and prior versions to a configured archive such as Chronopolis. The workflow creates a `JSON-LD `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse Software web interface.

+The size of the zipped archival Bag can now be limited for all archivers, and files that don't fit within that limit can either be transferred separately (placed so that they are correctly positioned according to the BagIt specification when the zipped bag is unzipped in place) or just referenced for later download (using the BagIt concept of a 'holey' bag with a list of files in a ``fetch.txt`` file). These settings allow for managing large datasets by excluding files over a certain size or total data size, which can be useful for archivers with size limitations or to reduce transfer times. See the :ref:`dataverse.bagit.zip.max-file-size`, :ref:`dataverse.bagit.zip.max-data-size`, and :ref:`dataverse.bagit.zip.holey` JVM options for more details.
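+For illustration, each line of the ``fetch.txt`` in a holey bag follows the BagIt ``url length filepath`` layout; a hypothetical entry (the exact access URL, size, and payload path depend on the installation and the file) might look like::
+
+    https://dataverse.example.edu/api/access/datafile/12345 10737418240 data/survey/big-file.tar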
+
At present, archiving classes include the DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, GoogleCloudSubmitToArchive, and S3SubmitToArchiveCommand , which all extend the AbstractSubmitToArchiveCommand and use the configurable mechanisms discussed below. (A DRSSubmitToArchiveCommand, which works with Harvard's DRS also exists and, while specific to DRS, is a useful example of how Archivers can support single-version-only semantics and support archiving only from specified collections (with collection specific parameters)).
All current options support the :ref:`Archival Status API` calls and the same status is available in the dataset page version table (for contributors/those who could view the unpublished dataset, with more detail available to superusers).
@@ -3868,6 +3870,21 @@ This can instead be restricted to only superusers who can publish the dataset us
Example: ``dataverse.coar-notify.relationship-announcement.notify-superusers-only=true``
+.. _dataverse.bagit.zip.holey:
+
+``dataverse.bagit.zip.holey``
+  A boolean that, if true, will cause the BagIt archiver to create a "holey" bag. In a holey bag, files that are not included in the bag are listed in the ``fetch.txt`` file with a URL from which they can be downloaded. This is used in conjunction with ``dataverse.bagit.zip.max-file-size`` and/or ``dataverse.bagit.zip.max-data-size``. Default: false.
+
+.. _dataverse.bagit.zip.max-data-size:
+
+``dataverse.bagit.zip.max-data-size``
+  The maximum total (uncompressed) size of data files (in bytes) to include in a BagIt zip archive. If the total size of the dataset files exceeds this limit, files will be excluded from the zipped bag (starting from the largest) until the total size is under the limit. Excluded files will be handled as defined by ``dataverse.bagit.zip.holey``: just listed in ``fetch.txt`` if that setting is true, or transferred separately and placed next to the zipped bag if it is false. When not set, there is no limit.
+
+.. _dataverse.bagit.zip.max-file-size:
+
+``dataverse.bagit.zip.max-file-size``
+  The maximum (uncompressed) size of a single file (in bytes) to include in a BagIt zip archive. Any file larger than this will be excluded. Excluded files will be handled as defined by ``dataverse.bagit.zip.holey``: just listed in ``fetch.txt`` if that setting is true, or transferred separately and placed next to the zipped bag if it is false. When not set, there is no limit.
+
.. _feature-flags:

Feature Flags
diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFile.java b/src/main/java/edu/harvard/iq/dataverse/DataFile.java
index 45604a5472b..8a08cd15029 100644
--- a/src/main/java/edu/harvard/iq/dataverse/DataFile.java
+++ b/src/main/java/edu/harvard/iq/dataverse/DataFile.java
@@ -109,18 +109,22 @@ public class DataFile extends DvObject implements Comparable {
     * The list of types should be limited to the list above in the technote
     * because the string gets passed into MessageDigest.getInstance() and you
     * can't just pass in any old string.
+     *
+     * The URIs are used in the OAI-ORE export. They are taken from the associated XML Digital Signature standards.
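+     *
+     * A brief usage sketch of the URI round trip added below (all names shown are defined in this enum):
+     *   ChecksumType type = ChecksumType.fromUri("http://www.w3.org/2001/04/xmlenc#sha256"); // returns SHA256
+     *   String uri = type.toUri(); // "http://www.w3.org/2001/04/xmlenc#sha256"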
*/ public enum ChecksumType { - MD5("MD5"), - SHA1("SHA-1"), - SHA256("SHA-256"), - SHA512("SHA-512"); + MD5("MD5", "http://www.w3.org/2001/04/xmldsig-more#md5"), + SHA1("SHA-1", "http://www.w3.org/2000/09/xmldsig#sha1"), + SHA256("SHA-256", "http://www.w3.org/2001/04/xmlenc#sha256"), + SHA512("SHA-512", "http://www.w3.org/2001/04/xmlenc#sha512"); private final String text; + private final String uri; - private ChecksumType(final String text) { + private ChecksumType(final String text, final String uri) { this.text = text; + this.uri = uri; } public static ChecksumType fromString(String text) { @@ -131,13 +135,30 @@ public static ChecksumType fromString(String text) { } } } - throw new IllegalArgumentException("ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); + throw new IllegalArgumentException( + "ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); + } + + public static ChecksumType fromUri(String uri) { + if (uri != null) { + for (ChecksumType checksumType : ChecksumType.values()) { + if (uri.equals(checksumType.uri)) { + return checksumType; + } + } + } + throw new IllegalArgumentException( + "ChecksumType must be one of these values: " + Arrays.asList(ChecksumType.values()) + "."); } @Override public String toString() { return text; } + + public String toUri() { + return uri; + } } //@Expose diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index fe4a25091d7..b65f39fa484 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -7,16 +7,24 @@ import edu.harvard.iq.dataverse.authorization.users.ApiToken; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; + import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudContext; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudHost; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.DuraCloudPort; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; +import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -96,6 +104,8 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); + Path tempBagFile = null; + try { /* * If there is a failure in creating a space, it is likely that a prior version @@ -161,20 +171,38 @@ public void run() { // Add BagIt ZIP file // Although DuraCloud uses SHA-256 internally, it's API uses MD5 to verify the // transfer + Path bagFile = null; + - messageDigest = 
MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null); - bagThread.join(); - if (success) { - logger.fine("Content: " + fileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); + tempBagFile = Files.createTempFile("dataverse-bag-", ".zip"); + logger.fine("Creating bag in temporary file: " + tempBagFile.toString()); + + BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setAuthenticationKey(token.getTokenString()); + // Generate bag to temporary file using the provided ore JsonObject + try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) { + if (!bagger.generateBag(fos)) { + throw new IOException("Bag generation failed"); } - if (!success || !checksum.equals(localchecksum)) { + } + + // Store BagIt file + long bagSize = Files.size(tempBagFile); + logger.fine("Bag created successfully, size: " + bagSize + " bytes"); + + // Now upload the bag file + messageDigest = MessageDigest.getInstance("MD5"); + try (InputStream is = Files.newInputStream(bagFile); + DigestInputStream bagDigestInputStream = new DigestInputStream(is, messageDigest)) { + checksum = store.addContent(spaceName, fileName, bagDigestInputStream, bagFile.toFile().length(), "application/zip", null, null); + localchecksum = Hex.encodeHexString(bagDigestInputStream.getMessageDigest().digest()); + + if (checksum != null && checksum.equals(localchecksum)) { + logger.fine("Content: " + fileName + " added with checksum: " + checksum); + success = true; + } else { logger.severe("Failure on " + fileName); - logger.severe(success ? 
checksum + " not equal to " + localchecksum : "failed to transfer to DuraCloud"); + logger.severe(checksum + " not equal to " + localchecksum); try { store.deleteContent(spaceName, fileName); store.deleteContent(spaceName, baseFileName + "_datacite.xml"); @@ -185,9 +213,6 @@ public void run() { "DuraCloud Submission Failure: incomplete archive transfer"); } } - - logger.fine("DuraCloud Submission step: Content Transferred"); - // Document the location of dataset archival copy location (actually the URL // where you can // view it as an admin) @@ -223,8 +248,20 @@ public void run() { return new Failure("Unable to create DuraCloud space with name: " + baseFileName, mesg); } catch (NoSuchAlgorithmException e) { logger.severe("MD5 MessageDigest not available!"); + } catch (Exception e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); + return new Failure("Error in transferring file to DuraCloud", + "DuraCloud Submission Failure: internal error"); } finally { + if (tempBagFile != null) { + try { + Files.deleteIfExists(tempBagFile); + } catch (IOException e) { + logger.warning("Failed to delete temporary bag file: " + tempBagFile + " : " + e.getMessage()); + } + } dv.setArchivalCopyLocation(statusObject.build().toString()); } } else { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java index 7dfb9f07e19..17e7b641cf6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -14,20 +14,29 @@ import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.settings.JvmSettings; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; + import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudBucket; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.GoogleCloudProject; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; import org.apache.commons.codec.binary.Hex; +import org.apache.commons.compress.parallel.InputStreamSupplier; import jakarta.json.Json; import jakarta.json.JsonObjectBuilder; import java.io.File; import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.InputStream; import java.io.PipedInputStream; import java.io.PipedOutputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; import java.security.DigestInputStream; import java.security.MessageDigest; import java.util.Map; @@ -45,26 +54,28 @@ public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersi } @Override - public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, + Map requestedSettings) { logger.fine("In GoogleCloudSubmitToArchiveCommand..."); String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); logger.fine("Project: " + 
projectName + " Bucket: " + bucketName); if (bucketName != null && projectName != null) { Storage storage; - //Set a failure status that will be updated if we succeed + // Set a failure status that will be updated if we succeed JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - + String cloudKeyFile = JvmSettings.FILES_DIRECTORY.lookup() + File.separator + "googlecloudkey.json"; - + + // Create temporary file for bag + Path tempBagFile = null; + try (FileInputStream cloudKeyStream = new FileInputStream(cloudKeyFile)) { storage = StorageOptions.newBuilder() - .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream)) - .setProjectId(projectName) - .build() - .getService(); + .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream)).setProjectId(projectName) + .build().getService(); Bucket bucket = storage.get(bucketName); Dataset dataset = dv.getDataset(); @@ -72,6 +83,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') .replace('.', '-').toLowerCase(); + String bagFileName = spaceName + "/" + spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; String dataciteXml = getDataCiteXml(dv); MessageDigest messageDigest = MessageDigest.getInstance("MD5"); @@ -102,7 +114,8 @@ public void run() { Thread.sleep(10); i++; } - Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber() + ".xml", + digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); dcThread.join(); String checksum = dcXml.getMd5ToHexString(); @@ -110,7 +123,8 @@ public void run() { String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); if (!success || !checksum.equals(localchecksum)) { logger.severe("Failure on " + spaceName); - logger.severe(success ? checksum + " not equal to " + localchecksum : "datacite.xml transfer did not succeed"); + logger.severe(success ? 
checksum + " not equal to " + localchecksum + : "datacite.xml transfer did not succeed"); try { dcXml.delete(Blob.BlobSourceOption.generationMatch()); } catch (StorageException se) { @@ -119,55 +133,112 @@ public void run() { return new Failure("Error in transferring DataCite.xml file to GoogleCloud", "GoogleCloud Submission Failure: incomplete metadata transfer"); } + } + + tempBagFile = Files.createTempFile("dataverse-bag-", ".zip"); + logger.fine("Creating bag in temporary file: " + tempBagFile.toString()); + + BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setAuthenticationKey(token.getTokenString()); + // Generate bag to temporary file using the provided ore JsonObject + try (FileOutputStream fos = new FileOutputStream(tempBagFile.toFile())) { + if (!bagger.generateBag(fos)) { + throw new IOException("Bag generation failed"); + } + } - // Store BagIt file - success = false; - String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + // Store BagIt file + long bagSize = Files.size(tempBagFile); + logger.fine("Bag created successfully, size: " + bagSize + " bytes"); + + if (bagSize == 0) { + throw new IOException("Generated bag file is empty"); + } + + // Upload bag file and calculate checksum during upload + messageDigest = MessageDigest.getInstance("MD5"); + String localChecksum; + + try (FileInputStream fis = new FileInputStream(tempBagFile.toFile()); + DigestInputStream dis = new DigestInputStream(fis, messageDigest)) { + + logger.fine("Uploading bag to GoogleCloud: " + bagFileName); + + Blob bag = bucket.create(bagFileName, dis, "application/zip", + Bucket.BlobWriteOption.doesNotExist()); + + if (bag.getSize() == 0) { + throw new IOException("Uploaded bag has zero size"); + } - // Add BagIt ZIP file - // Google uses MD5 as one way to verify the - // transfer + // Get checksum after upload completes + localChecksum = Hex.encodeHexString(dis.getMessageDigest().digest()); + String remoteChecksum = bag.getMd5ToHexString(); + + logger.fine("Bag: " + bagFileName + " uploaded"); + logger.fine("Local checksum: " + localChecksum); + logger.fine("Remote checksum: " + remoteChecksum); + + if (!localChecksum.equals(remoteChecksum)) { + logger.severe("Bag checksum mismatch!"); + logger.severe("Local: " + localChecksum + " != Remote: " + remoteChecksum); + try { + bag.delete(Blob.BlobSourceOption.generationMatch()); + } catch (StorageException se) { + logger.warning(se.getMessage()); + } + return new Failure("Error in transferring Zip file to GoogleCloud", + "GoogleCloud Submission Failure: bag checksum mismatch"); + } + } + + logger.fine("GoogleCloud Submission step: Content Transferred Successfully"); + + // Now upload any files that were too large for the bag + for (FileEntry entry : bagger.getOversizedFiles()) { + String childPath = entry.getChildPath(entry.getChildTitle()); + String fileKey = spaceName + "/" + childPath; + logger.fine("Uploading oversized file to GoogleCloud: " + fileKey); messageDigest = MessageDigest.getInstance("MD5"); - try (PipedInputStream in = new PipedInputStream(100000); - DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest)) { - Thread bagThread = startBagThread(dv, in, digestInputStream2, dataciteXml, token); - Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", - Bucket.BlobWriteOption.doesNotExist()); - if (bag.getSize() == 0) { - throw new IOException("Empty Bag"); + InputStreamSupplier supplier = 
bagger.getInputStreamSupplier(entry.getDataUrl()); + try (InputStream is = supplier.get(); + DigestInputStream dis = new DigestInputStream(is, messageDigest)) { + Blob oversizedFileBlob = bucket.create(fileKey, dis, Bucket.BlobWriteOption.doesNotExist()); + if (oversizedFileBlob.getSize() == 0) { + throw new IOException("Uploaded oversized file has zero size: " + fileKey); } - bagThread.join(); - - checksum = bag.getMd5ToHexString(); - logger.fine("Bag: " + fileName + " added with checksum: " + checksum); - localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); - if (!success || !checksum.equals(localchecksum)) { - logger.severe(success ? checksum + " not equal to " + localchecksum - : "bag transfer did not succeed"); + localChecksum = Hex.encodeHexString(dis.getMessageDigest().digest()); + String remoteChecksum = oversizedFileBlob.getMd5ToHexString(); + logger.fine("Oversized file: " + fileKey + " uploaded"); + logger.fine("Local checksum: " + localChecksum); + logger.fine("Remote checksum: " + remoteChecksum); + if (!localChecksum.equals(remoteChecksum)) { + logger.severe("Oversized file checksum mismatch!"); + logger.severe("Local: " + localChecksum + " != Remote: " + remoteChecksum); try { - bag.delete(Blob.BlobSourceOption.generationMatch()); + oversizedFileBlob.delete(Blob.BlobSourceOption.generationMatch()); } catch (StorageException se) { logger.warning(se.getMessage()); } - return new Failure("Error in transferring Zip file to GoogleCloud", - "GoogleCloud Submission Failure: incomplete archive transfer"); + return new Failure("Error in transferring oversized file to GoogleCloud", + "GoogleCloud Submission Failure: oversized file transfer incomplete"); } + } catch (IOException e) { + logger.warning("Failed to upload oversized file: " + childPath + " : " + e.getMessage()); + return new Failure("Error uploading oversized file to Google Cloud: " + childPath); } + } - logger.fine("GoogleCloud Submission step: Content Transferred"); - - // Document the location of dataset archival copy location (actually the URL - // where you can view it as an admin) - // Changed to point at bucket where the zip and datacite.xml are visible + // Document the location of dataset archival copy location (actually the URL + // to the bucket). 
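+                    // As a rough sketch (the exact key names come from the DatasetVersion constants used below),
+                    // the archival status recorded for this version ends up as JSON along the lines of:
+                    //   { "status": "success", "message": "https://storage.cloud.google.com/<bucket>/<space>" }
+                    // where <bucket> and <space> are placeholders for the configured bucket and the dataset-derived space name.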
+ statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, + String.format("https://storage.cloud.google.com/%s/%s", bucketName, spaceName)); - StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/"); - sb.append(bucketName + "/" + spaceName); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); - statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, sb.toString()); - - } } else { - logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister"); + logger.warning( + "GoogleCloud Archiver Submision Workflow aborted: Dataset locked for publication/pidRegister"); + return new Failure("Dataset locked"); } } catch (Exception e) { @@ -177,11 +248,19 @@ public void run() { e.getLocalizedMessage() + ": check log for details"); } finally { + if (tempBagFile != null) { + try { + Files.deleteIfExists(tempBagFile); + } catch (IOException e) { + logger.warning("Failed to delete temporary bag file: " + tempBagFile + " : " + e.getMessage()); + } + } dv.setArchivalCopyLocation(statusObject.build().toString()); } return WorkflowStepResult.OK; } else { - return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); + return new Failure( + "GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java index 462879f2ec9..76d7ae87f38 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/LocalSubmitToArchiveCommand.java @@ -10,6 +10,7 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagItLocalPath; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; import edu.harvard.iq.dataverse.util.bagit.OREMap; import edu.harvard.iq.dataverse.workflow.step.Failure; import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; @@ -23,6 +24,7 @@ import java.io.File; import java.io.FileOutputStream; +import java.io.InputStream; import org.apache.commons.io.FileUtils; @@ -63,12 +65,22 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t new File(localPath + "/" + spaceName + "-datacite.v" + dv.getFriendlyVersionNumber() + ".xml"), dataciteXml, StandardCharsets.UTF_8); BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); - bagger.setNumConnections(getNumberOfBagGeneratorThreads()); bagger.setAuthenticationKey(token.getTokenString()); zipName = localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + ".zip"; //ToDo: generateBag(File f, true) seems to do the same thing (with a .tmp extension) - since we don't have to use a stream here, could probably just reuse the existing code? 
bagger.generateBag(new FileOutputStream(zipName + ".partial")); + // Now download any files that were too large for the bag + for (FileEntry entry : bagger.getOversizedFiles()) { + String childPath = entry.getChildPath(entry.getChildTitle()); + File destFile = new File(localPath, localPath + "/" + spaceName + "v" + dv.getFriendlyVersionNumber() + "/" + childPath); + logger.fine("Downloading oversized file to " + destFile.getAbsolutePath()); + destFile.getParentFile().mkdirs(); + try (InputStream is = bagger.getInputStreamSupplier(entry.getDataUrl()).get()) { + FileUtils.copyInputStreamToFile(is, destFile); + } + } + File srcFile = new File(zipName + ".partial"); File destFile = new File(zipName); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java index 65531d775c8..072fd0edb48 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/S3SubmitToArchiveCommand.java @@ -9,6 +9,7 @@ import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.S3ArchiverConfig; import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator.FileEntry; import edu.harvard.iq.dataverse.util.bagit.OREMap; import edu.harvard.iq.dataverse.util.json.JsonUtil; import edu.harvard.iq.dataverse.workflow.step.Failure; @@ -17,9 +18,15 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.util.List; import java.util.Map; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.logging.Level; import java.util.logging.Logger; import jakarta.annotation.Resource; @@ -28,6 +35,7 @@ import jakarta.json.JsonObject; import jakarta.json.JsonObjectBuilder; +import org.apache.commons.compress.parallel.InputStreamSupplier; import org.eclipse.microprofile.config.Config; import org.eclipse.microprofile.config.ConfigProvider; @@ -55,8 +63,11 @@ import software.amazon.awssdk.utils.StringUtils; import software.amazon.awssdk.transfer.s3.S3TransferManager; import software.amazon.awssdk.transfer.s3.model.CompletedFileUpload; +import software.amazon.awssdk.transfer.s3.model.CompletedUpload; import software.amazon.awssdk.transfer.s3.model.FileUpload; +import software.amazon.awssdk.transfer.s3.model.Upload; import software.amazon.awssdk.transfer.s3.model.UploadFileRequest; +import software.amazon.awssdk.transfer.s3.model.UploadRequest; @RequiredPermissions(Permission.PublishDataset) public class S3SubmitToArchiveCommand extends AbstractSubmitToArchiveCommand { @@ -98,7 +109,8 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t JsonObjectBuilder statusObject = Json.createObjectBuilder(); statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE); statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred"); - + ExecutorService executor = Executors.newCachedThreadPool(); + try { Dataset dataset = dv.getDataset(); @@ -150,7 +162,39 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t if 
(uploadResult.response().sdkHttpResponse().isSuccessful()) { logger.fine("S3 Submission step: Content Transferred"); + List bigFiles = bagger.getOversizedFiles(); + + for (FileEntry entry : bigFiles) { + String childPath = entry.getChildPath(entry.getChildTitle()); + String fileKey = spaceName + "/" + childPath; + InputStreamSupplier supplier = bagger.getInputStreamSupplier(entry.getDataUrl()); + try (InputStream is = supplier.get()) { + + PutObjectRequest filePutRequest = PutObjectRequest.builder().bucket(bucketName) + .key(fileKey).build(); + + UploadRequest uploadRequest = UploadRequest.builder() + .putObjectRequest(filePutRequest) + .requestBody(AsyncRequestBody.fromInputStream(is, entry.getSize(), executor)) + .build(); + + Upload upload = tm.upload(uploadRequest); + CompletedUpload completedUpload = upload.completionFuture().join(); + + if (completedUpload.response().sdkHttpResponse().isSuccessful()) { + logger.fine("Successfully uploaded oversized file: " + fileKey); + } else { + logger.warning("Failed to upload oversized file: " + fileKey); + return new Failure("Error uploading oversized file to S3: " + fileKey); + } + } catch (IOException e) { + logger.log(Level.WARNING, + "Failed to get input stream for oversized file: " + fileKey, e); + return new Failure("Error getting input stream for oversized file: " + fileKey); + } + } statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_SUCCESS); + statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, String.format("https://%s.s3.amazonaws.com/%s", bucketName, bagKey)); } else { @@ -175,6 +219,7 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t e.getLocalizedMessage() + ": check log for details"); } finally { + executor.shutdown(); if (tm != null) { tm.close(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 05390ba8a8c..086ed7929aa 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -276,6 +276,11 @@ public enum JvmSettings { BAGIT_SOURCE_ORG_NAME(SCOPE_BAGIT_SOURCEORG, "name"), BAGIT_SOURCEORG_ADDRESS(SCOPE_BAGIT_SOURCEORG, "address"), BAGIT_SOURCEORG_EMAIL(SCOPE_BAGIT_SOURCEORG, "email"), + SCOPE_BAGIT_ZIP(SCOPE_BAGIT, "zip"), + BAGIT_ZIP_MAX_FILE_SIZE(SCOPE_BAGIT_ZIP, "max-file-size"), + BAGIT_ZIP_MAX_DATA_SIZE(SCOPE_BAGIT_ZIP, "max-data-size"), + BAGIT_ZIP_HOLEY(SCOPE_BAGIT_ZIP, "holey"), + // STORAGE USE SETTINGS SCOPE_STORAGEUSE(PREFIX, "storageuse"), diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java index f24ebdb8655..55235f85491 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/BagGenerator.java @@ -4,12 +4,15 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.FileOutputStream; +import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.InterruptedIOException; import java.io.OutputStream; import java.io.PrintWriter; import java.net.MalformedURLException; +import java.net.SocketTimeoutException; import java.net.URI; import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; @@ -20,10 +23,12 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; 
import java.util.Calendar; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; +import java.util.List; import java.util.Set; import java.util.TreeSet; import java.util.Map.Entry; @@ -33,9 +38,10 @@ import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.zip.ZipEntry; -import edu.harvard.iq.dataverse.util.BundleUtil; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.compress.archivers.zip.ParallelScatterZipCreator; import org.apache.commons.compress.archivers.zip.ScatterZipOutputStream; @@ -44,25 +50,24 @@ import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.parallel.InputStreamSupplier; -import org.apache.commons.compress.utils.IOUtils; -import org.apache.commons.text.WordUtils; -import org.apache.http.client.ClientProtocolException; -import org.apache.http.client.config.CookieSpecs; -import org.apache.http.client.config.RequestConfig; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.config.Registry; -import org.apache.http.config.RegistryBuilder; -import org.apache.http.conn.socket.ConnectionSocketFactory; -import org.apache.http.conn.socket.PlainConnectionSocketFactory; -import org.apache.http.conn.ssl.NoopHostnameVerifier; -import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.conn.ssl.TrustSelfSignedStrategy; -import org.apache.http.ssl.SSLContextBuilder; -import org.apache.http.util.EntityUtils; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.ClientProtocolException; +import org.apache.hc.client5.http.classic.methods.HttpGet; +import org.apache.hc.client5.http.config.RequestConfig; +import org.apache.hc.client5.http.impl.classic.CloseableHttpClient; +import org.apache.hc.client5.http.impl.classic.CloseableHttpResponse; +import org.apache.hc.client5.http.impl.classic.HttpClients; +import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; +import org.apache.hc.client5.http.protocol.HttpClientContext; +import org.apache.hc.client5.http.socket.ConnectionSocketFactory; +import org.apache.hc.client5.http.socket.PlainConnectionSocketFactory; +import org.apache.hc.client5.http.ssl.NoopHostnameVerifier; +import org.apache.hc.client5.http.ssl.SSLConnectionSocketFactory; +import org.apache.hc.client5.http.ssl.TrustSelfSignedStrategy; +import org.apache.hc.core5.http.HttpEntity; +import org.apache.hc.core5.http.config.Registry; +import org.apache.hc.core5.http.config.RegistryBuilder; +import org.apache.hc.core5.ssl.SSLContextBuilder; +import org.apache.hc.core5.util.Timeout; import org.json.JSONArray; import com.google.gson.JsonArray; import com.google.gson.JsonElement; @@ -76,9 +81,20 @@ import edu.harvard.iq.dataverse.pidproviders.PidUtil; import edu.harvard.iq.dataverse.settings.JvmSettings; import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.BagGeneratorThreads; -import edu.harvard.iq.dataverse.util.json.JsonLDTerm; -import java.util.Optional; +import edu.harvard.iq.dataverse.util.SystemConfig; +import 
edu.harvard.iq.dataverse.util.json.JsonLDTerm; +import jakarta.enterprise.inject.spi.CDI; + +/** + * Creates an archival zipped Bag for long-term storage. It is intended to + * include all the information needed to reconstruct the dataset version in a + * new Dataverse instance. + * + * Note that the Dataverse-Bag-Version written in the generateInfoFile() method + * should be updated any time the content/structure of the bag is changed. + * + */ public class BagGenerator { private static final Logger logger = Logger.getLogger(BagGenerator.class.getCanonicalName()); @@ -92,10 +108,11 @@ public class BagGenerator { private HashMap pidMap = new LinkedHashMap(); private HashMap checksumMap = new LinkedHashMap(); - private int timeout = 60; - private RequestConfig config = RequestConfig.custom().setConnectTimeout(timeout * 1000) - .setConnectionRequestTimeout(timeout * 1000).setSocketTimeout(timeout * 1000) - .setCookieSpec(CookieSpecs.STANDARD).build(); + private int timeout = 300; + private RequestConfig config = RequestConfig.custom() + .setConnectionRequestTimeout(Timeout.ofSeconds(timeout)) + .setResponseTimeout(Timeout.ofSeconds(timeout)) + .build(); protected CloseableHttpClient client; private PoolingHttpClientConnectionManager cm = null; @@ -120,13 +137,41 @@ public class BagGenerator { private boolean usetemp = false; - private int numConnections = 8; + private static int numConnections = 2; public static final String BAG_GENERATOR_THREADS = BagGeneratorThreads.toString(); private OREMap oremap; static PrintWriter pw = null; + + // Size limits and holey Bags + private long maxDataFileSize = Long.MAX_VALUE; + private long maxTotalDataSize = Long.MAX_VALUE; + private long currentBagDataSize = 0; + private StringBuilder fetchFileContent = new StringBuilder(); + private boolean usingFetchFile = false; + private boolean createHoleyBag = false; + private List oversizedFiles = new ArrayList<>(); + + // Bag-info.txt field labels + private static final String CONTACT_NAME = "Contact-Name: "; + private static final String CONTACT_EMAIL = "Contact-Email: "; + private static final String SOURCE_ORGANIZATION = "Source-Organization: "; + private static final String ORGANIZATION_ADDRESS = "Organization-Address: "; + private static final String ORGANIZATION_EMAIL = "Organization-Email: "; + private static final String EXTERNAL_DESCRIPTION = "External-Description: "; + private static final String BAGGING_DATE = "Bagging-Date: "; + private static final String EXTERNAL_IDENTIFIER = "External-Identifier: "; + private static final String BAG_SIZE = "Bag-Size: "; + private static final String PAYLOAD_OXUM = "Payload-Oxum: "; + private static final String INTERNAL_SENDER_IDENTIFIER = "Internal-Sender-Identifier: "; + private static final String DATAVERSE_BAG_VERSION = "Dataverse-Bag-Version: "; + + // Implement exponential backoff with jitter + static final long baseWaitTimeMs = 1000; // Start with 1 second + static final long maxWaitTimeMs = 30000; // Cap at 30 seconds + /** * This BagGenerator creates a BagIt version 1.0 * (https://tools.ietf.org/html/draft-kunze-bagit-16) compliant bag that is also @@ -139,19 +184,24 @@ public class BagGenerator { * and zipping are done in parallel, using a connection pool. The required space * on disk is ~ n+1/n of the final bag size, e.g. 125% of the bag size for a * 4-way parallel zip operation. 
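 * As a rough worked example of that n+1/n estimate: with the 2 parallel connections configured
 * in this class, temporary space of about (2+1)/2 = 150% of the final bag size may be needed,
 * while 8 connections would need roughly (8+1)/8, i.e. about 113%.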
- * @throws Exception - * @throws JsonSyntaxException + * + * @throws Exception + * @throws JsonSyntaxException */ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxException, Exception { this.oremap = oreMap; this.oremapObject = oreMap.getOREMap(); - //(JsonObject) new JsonParser().parse(oreMap.getOREMap().toString()); this.dataciteXml = dataciteXml; try { - // Using Dataverse, all the URLs to be retrieved should be on the current server, so allowing self-signed certs and not verifying hostnames are useful in testing and - // shouldn't be a significant security issue. This should not be allowed for arbitrary OREMap sources. + /* + * Using Dataverse, all the URLs to be retrieved should be on the current + * server, so allowing self-signed certs and not verifying hostnames are useful + * in testing and shouldn't be a significant security issue. This should not be + * allowed for arbitrary OREMap sources. + * + */ SSLContextBuilder builder = new SSLContextBuilder(); try { builder.loadTrustMaterial(null, new TrustSelfSignedStrategy()); @@ -159,33 +209,45 @@ public BagGenerator(OREMap oreMap, String dataciteXml) throws JsonSyntaxExceptio e.printStackTrace(); } - SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory(builder.build(), NoopHostnameVerifier.INSTANCE); + SSLConnectionSocketFactory sslConnectionFactory = new SSLConnectionSocketFactory( + builder.build(), + NoopHostnameVerifier.INSTANCE + ); Registry registry = RegistryBuilder.create() - .register("http", PlainConnectionSocketFactory.getSocketFactory()) + .register("http", PlainConnectionSocketFactory.getSocketFactory()) .register("https", sslConnectionFactory).build(); cm = new PoolingHttpClientConnectionManager(registry); cm.setDefaultMaxPerRoute(numConnections); cm.setMaxTotal(numConnections > 20 ? 
numConnections : 20); - client = HttpClients.custom().setConnectionManager(cm).setDefaultRequestConfig(config).build(); + client = HttpClients.custom() + .setConnectionManager(cm) + .setDefaultRequestConfig(config) + .build(); scatterZipCreator = new ParallelScatterZipCreator(Executors.newFixedThreadPool(numConnections)); } catch (NoSuchAlgorithmException | KeyManagementException e) { - logger.warning("Aint gonna work"); + logger.warning("Failed to initialize HTTP client"); e.printStackTrace(); } + initializeHoleyBagLimits(); + } + + private void initializeHoleyBagLimits() { + this.maxDataFileSize = JvmSettings.BAGIT_ZIP_MAX_FILE_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.maxTotalDataSize = JvmSettings.BAGIT_ZIP_MAX_DATA_SIZE.lookupOptional(Long.class).orElse(Long.MAX_VALUE); + this.createHoleyBag = JvmSettings.BAGIT_ZIP_HOLEY.lookupOptional(Boolean.class).orElse(false); + logger.fine("BagGenerator size limits - maxDataFileSize: " + maxDataFileSize + + ", maxTotalDataSize: " + maxTotalDataSize + + ", createHoleyBag: " + createHoleyBag); } public void setIgnoreHashes(boolean val) { ignorehashes = val; } - - public void setDefaultCheckSumType(ChecksumType type) { - hashtype=type; - } - + public static void println(String s) { System.out.println(s); System.out.flush(); @@ -203,18 +265,18 @@ public static void println(String s) { * @return success true/false */ public boolean generateBag(OutputStream outputStream) throws Exception { - File tmp = File.createTempFile("qdr-scatter-dirs", "tmp"); dirs = ScatterZipOutputStream.fileBased(tmp); - // The oremapObject is javax.json.JsonObject and we need com.google.gson.JsonObject for the aggregation object - aggregation = (JsonObject) new JsonParser().parse(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); + // The oremapObject is javax.json.JsonObject and we need + // com.google.gson.JsonObject for the aggregation object + aggregation = (JsonObject) JsonParser + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString()); String pidUrlString = aggregation.get("@id").getAsString(); - String pidString=PidUtil.parseAsGlobalID(pidUrlString).asString(); - bagID = pidString + "v." - + aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString(); - + String pidString = PidUtil.parseAsGlobalID(pidUrlString).asString(); + bagID = pidString + "v." 
+ aggregation.get(JsonLDTerm.schemaOrg("version").getLabel()).getAsString(); + logger.info("Generating Bag: " + bagID); try { // Create valid filename from identifier and extend path with @@ -240,7 +302,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception { resourceUsed = new Boolean[aggregates.size() + 1]; // Process current container (the aggregation itself) and its // children - processContainer(aggregation, currentPath); + // Recursively collect all files from the entire tree, start with an empty set of processedContainers + List allFiles = new ArrayList<>(); + collectAllFiles(aggregation, currentPath, allFiles, false); + + // Sort files by size (smallest first) + Collections.sort(allFiles); + + // Process all files in sorted order + processAllFiles(allFiles); } // Create manifest files // pid-mapping.txt - a DataOne recommendation to connect ids and @@ -271,6 +341,15 @@ public boolean generateBag(OutputStream outputStream) throws Exception { String path = sha1Entry.getKey(); sha1StringBuffer.append(sha1Entry.getValue() + " " + path); } + if(hashtype == null) { // No files - still want to send an empty manifest to nominally comply with BagIT specification requirement. + try { + // Use the current type if we can retrieve it + hashtype = CDI.current().select(SystemConfig.class).get().getFileFixityChecksumAlgorithm(); + } catch (Exception e) { + // Default to MD5 if we can't + hashtype = DataFile.ChecksumType.MD5; + } + } if (!(hashtype == null)) { String manifestName = "manifest-"; if (hashtype.equals(DataFile.ChecksumType.SHA1)) { @@ -286,7 +365,7 @@ public boolean generateBag(OutputStream outputStream) throws Exception { } createFileFromString(manifestName, sha1StringBuffer.toString()); } else { - logger.warning("No Hash values (no files?) 
sending empty manifest to nominally comply with BagIT specification requirement"); + logger.warning("No Hash value defined sending empty manifest-md5 to nominally comply with BagIT specification requirement"); createFileFromString("manifest-md5.txt", ""); } // bagit.txt - Required by spec @@ -312,6 +391,8 @@ public boolean generateBag(OutputStream outputStream) throws Exception { logger.fine("Creating bag: " + bagName); + writeFetchFile(); + ZipArchiveOutputStream zipArchiveOutputStream = new ZipArchiveOutputStream(outputStream); /* @@ -358,7 +439,6 @@ public boolean generateBag(OutputStream outputStream) throws Exception { public boolean generateBag(String bagName, boolean temp) { usetemp = temp; - FileOutputStream bagFileOS = null; try { File origBagFile = getBagFile(bagName); File bagFile = origBagFile; @@ -367,82 +447,78 @@ public boolean generateBag(String bagName, boolean temp) { logger.fine("Writing to: " + bagFile.getAbsolutePath()); } // Create an output stream backed by the file - bagFileOS = new FileOutputStream(bagFile); - if (generateBag(bagFileOS)) { - //The generateBag call sets this.bagName to the correct value - validateBagFile(bagFile); - if (usetemp) { - logger.fine("Moving tmp zip"); - origBagFile.delete(); - bagFile.renameTo(origBagFile); + try (FileOutputStream bagFileOS = new FileOutputStream(bagFile)) { + if (generateBag(bagFileOS)) { + // The generateBag call sets this.bagName to the correct value + validateBagFile(bagFile); + if (usetemp) { + logger.fine("Moving tmp zip"); + origBagFile.delete(); + bagFile.renameTo(origBagFile); + } + return true; + } else { + return false; } - return true; - } else { - return false; } } catch (Exception e) { - logger.log(Level.SEVERE,"Bag Exception: ", e); + logger.log(Level.SEVERE, "Bag Exception: ", e); e.printStackTrace(); logger.warning("Failure: Processing failure during Bagit file creation"); return false; - } finally { - IOUtils.closeQuietly(bagFileOS); } } public void validateBag(String bagId) { logger.info("Validating Bag"); - ZipFile zf = null; - InputStream is = null; try { File bagFile = getBagFile(bagId); - zf = new ZipFile(bagFile); - ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); - if (entry != null) { - logger.info("SHA1 hashes used"); - hashtype = DataFile.ChecksumType.SHA1; - } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { + ZipArchiveEntry entry = zf.getEntry(getValidName(bagId) + "/manifest-sha1.txt"); if (entry != null) { - logger.info("SHA512 hashes used"); - hashtype = DataFile.ChecksumType.SHA512; + logger.info("SHA1 hashes used"); + hashtype = DataFile.ChecksumType.SHA1; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha512.txt"); if (entry != null) { - logger.info("SHA256 hashes used"); - hashtype = DataFile.ChecksumType.SHA256; + logger.info("SHA512 hashes used"); + hashtype = DataFile.ChecksumType.SHA512; } else { - entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + entry = zf.getEntry(getValidName(bagId) + "/manifest-sha256.txt"); if (entry != null) { - logger.info("MD5 hashes used"); - hashtype = DataFile.ChecksumType.MD5; + logger.info("SHA256 hashes used"); + hashtype = DataFile.ChecksumType.SHA256; + } else { + entry = zf.getEntry(getValidName(bagId) + "/manifest-md5.txt"); + if (entry != null) { + logger.info("MD5 hashes used"); + hashtype = DataFile.ChecksumType.MD5; + } 
} } } + if (entry == null) + throw new IOException("No manifest file found"); + try (InputStream is = zf.getInputStream(entry)) { + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String line = br.readLine(); + while (line != null) { + logger.fine("Hash entry: " + line); + int breakIndex = line.indexOf(' '); + String hash = line.substring(0, breakIndex); + String path = line.substring(breakIndex + 1); + logger.fine("Adding: " + path + " with hash: " + hash); + checksumMap.put(path, hash); + line = br.readLine(); + } + } } - if (entry == null) - throw new IOException("No manifest file found"); - is = zf.getInputStream(entry); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - String line = br.readLine(); - while (line != null) { - logger.fine("Hash entry: " + line); - int breakIndex = line.indexOf(' '); - String hash = line.substring(0, breakIndex); - String path = line.substring(breakIndex + 1); - logger.fine("Adding: " + path + " with hash: " + hash); - checksumMap.put(path, hash); - line = br.readLine(); - } - IOUtils.closeQuietly(is); logger.info("HashMap Map contains: " + checksumMap.size() + " entries"); checkFiles(checksumMap, bagFile); } catch (IOException io) { - logger.log(Level.SEVERE,"Could not validate Hashes", io); + logger.log(Level.SEVERE, "Could not validate Hashes", io); } catch (Exception e) { - logger.log(Level.SEVERE,"Could not validate Hashes", e); - } finally { - IOUtils.closeQuietly(zf); + logger.log(Level.SEVERE, "Could not validate Hashes", e); } return; } @@ -465,7 +541,7 @@ public File getBagFile(String bagID) throws Exception { private void validateBagFile(File bagFile) throws IOException { // Run a confirmation test - should verify all files and hashes - + // Check files calculates the hashes and file sizes and reports on // whether hashes are correct checkFiles(checksumMap, bagFile); @@ -479,26 +555,31 @@ public static String getValidName(String bagName) { return bagName.replaceAll("\\W", "-"); } - private void processContainer(JsonObject item, String currentPath) throws IOException { + // Collect all files recursively and process containers to create dirs in the zip + private void collectAllFiles(JsonObject item, String currentPath, List allFiles, boolean addTitle) + throws IOException { JsonArray children = getChildren(item); - HashSet titles = new HashSet(); - String title = null; - if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { - title = item.get("Title").getAsString(); - } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { - title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + + if (addTitle) { //For any sub-collections (non-Dataverse) + String title = null; + if (item.has(JsonLDTerm.dcTerms("Title").getLabel())) { + title = item.get("Title").getAsString(); + } else if (item.has(JsonLDTerm.schemaOrg("name").getLabel())) { + title = item.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + logger.fine("Collecting files from " + title + "/ at path " + currentPath); + currentPath = currentPath + title + "/"; } - logger.fine("Adding " + title + "/ to path " + currentPath); - currentPath = currentPath + title + "/"; + // Mark this container as processed + String containerId = item.get("@id").getAsString(); + + // Create directory and update tracking for this container int containerIndex = -1; try { createDir(currentPath); - // Add containers to pid map and mark as 'used', but no sha1 hash - // value - containerIndex = getUnusedIndexOf(item.get("@id").getAsString()); 
+ containerIndex = getUnusedIndexOf(containerId); resourceUsed[containerIndex] = true; - pidMap.put(item.get("@id").getAsString(), currentPath); - + pidMap.put(containerId, currentPath); } catch (InterruptedException | IOException | ExecutionException e) { e.printStackTrace(); logger.severe(e.getMessage()); @@ -506,8 +587,8 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce resourceUsed[containerIndex] = false; } throw new IOException("Unable to create bag"); - } + for (int i = 0; i < children.size(); i++) { // Find the ith child in the overall array of aggregated @@ -522,119 +603,188 @@ private void processContainer(JsonObject item, String currentPath) throws IOExce // Aggregation is at index 0, so need to shift by 1 for aggregates // entries JsonObject child = aggregates.get(index - 1).getAsJsonObject(); + // Dataverse does not currently use containers - this is for other variants/future use if (childIsContainer(child)) { - // create dir and process children - // processContainer will mark this item as used - processContainer(child, currentPath); + // Recursively collect files from this container + collectAllFiles(child, currentPath, allFiles, true); } else { - resourceUsed[index] = true; - // add item - // ToDo - String dataUrl = child.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString(); - logger.fine("File url: " + dataUrl); - String childTitle = child.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); - if (titles.contains(childTitle)) { - logger.warning("**** Multiple items with the same title in: " + currentPath); - logger.warning("**** Will cause failure in hash and size validation in: " + bagID); - } else { - titles.add(childTitle); + + // Get file size + Long fileSize = null; + if (child.has(JsonLDTerm.filesize.getLabel())) { + fileSize = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); } - String childPath = currentPath + childTitle; - JsonElement directoryLabel = child.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); - if(directoryLabel!=null) { - childPath=currentPath + directoryLabel.getAsString() + "/" + childTitle; + if (fileSize == null) { + logger.severe("File size missing for child: " + childId); + throw new IOException("Unable to create bag due to missing file size"); } - - String childHash = null; - if (child.has(JsonLDTerm.checksum.getLabel())) { - ChecksumType childHashType = ChecksumType.fromString( - child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); - if (hashtype == null) { - //If one wasn't set as a default, pick up what the first child with one uses - hashtype = childHashType; - } - if (hashtype != null && !hashtype.equals(childHashType)) { - logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() - + " hashes for " + childTitle); - } else { - childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); - if (checksumMap.containsValue(childHash)) { - // Something else has this hash - logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + " has SHA1 Hash: " - + childHash + " in: " + bagID); - } - logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); - checksumMap.put(childPath, childHash); - } + // Store minimal info for sorting - JsonObject is just a reference + allFiles.add(new FileEntry(fileSize, child, currentPath, index)); + } + } + } + + + // Process all files in sorted order + private void processAllFiles(List sortedFiles) + throws IOException, 
ExecutionException, InterruptedException { + + // Track titles to detect duplicates + Set titles = new HashSet<>(); + + if ((hashtype == null) | ignorehashes) { + hashtype = DataFile.ChecksumType.SHA512; + } + + for (FileEntry entry : sortedFiles) { + // Extract all needed information from the JsonObject reference + JsonObject child = entry.jsonObject; + + String childTitle = entry.getChildTitle(); + + // Check for duplicate titles + if (titles.contains(childTitle)) { + logger.warning("**** Multiple items with the same title in: " + entry.currentPath); + logger.warning("**** Will cause failure in hash and size validation in: " + bagID); + } else { + titles.add(childTitle); + } + + String childPath= entry.getChildPath(childTitle); + + // Get hash if exists + String childHash = null; + if (child.has(JsonLDTerm.checksum.getLabel())) { + ChecksumType childHashType = ChecksumType + .fromUri(child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@type").getAsString()); + if (hashtype == null) { + hashtype = childHashType; } - if ((hashtype == null) | ignorehashes) { - // Pick sha512 when ignoring hashes or none exist - hashtype = DataFile.ChecksumType.SHA512; + if (hashtype != null && !hashtype.equals(childHashType)) { + logger.warning("Multiple hash values in use - will calculate " + hashtype.toString() + + " hashes for " + childTitle); + } else { + childHash = child.getAsJsonObject(JsonLDTerm.checksum.getLabel()).get("@value").getAsString(); } - try { - if ((childHash == null) | ignorehashes) { - // Generate missing hashInputStream inputStream = null; - InputStream inputStream = null; - try { - inputStream = getInputStreamSupplier(dataUrl).get(); - - if (hashtype != null) { - if (hashtype.equals(DataFile.ChecksumType.SHA1)) { - childHash = DigestUtils.sha1Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA256)) { - childHash = DigestUtils.sha256Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { - childHash = DigestUtils.sha512Hex(inputStream); - } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { - childHash = DigestUtils.md5Hex(inputStream); - } + } + + resourceUsed[entry.resourceIndex] = true; + String dataUrl = entry.getDataUrl(); + + try { + if ((childHash == null) | ignorehashes) { + // Generate missing hash + + try (InputStream inputStream = getInputStreamSupplier(dataUrl).get()){ + if (hashtype != null) { + if (hashtype.equals(DataFile.ChecksumType.SHA1)) { + childHash = DigestUtils.sha1Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.SHA256)) { + childHash = DigestUtils.sha256Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.SHA512)) { + childHash = DigestUtils.sha512Hex(inputStream); + } else if (hashtype.equals(DataFile.ChecksumType.MD5)) { + childHash = DigestUtils.md5Hex(inputStream); } - - } catch (IOException e) { - logger.severe("Failed to read " + childPath); - throw e; - } finally { - IOUtils.closeQuietly(inputStream); } - if (childHash != null) { - JsonObject childHashObject = new JsonObject(); - childHashObject.addProperty("@type", hashtype.toString()); - childHashObject.addProperty("@value", childHash); - child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); - checksumMap.put(childPath, childHash); - } else { - logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); - } + } catch (IOException e) { + logger.severe("Failed to read " + childPath); + throw e; } - logger.fine("Requesting: " + childPath + " from " + dataUrl); - 
createFileFromURL(childPath, dataUrl); - dataCount++; - if (dataCount % 1000 == 0) { - logger.info("Retrieval in progress: " + dataCount + " files retrieved"); + if (childHash != null) { + JsonObject childHashObject = new JsonObject(); + childHashObject.addProperty("@type", hashtype.toString()); + childHashObject.addProperty("@value", childHash); + child.add(JsonLDTerm.checksum.getLabel(), (JsonElement) childHashObject); + + checksumMap.put(childPath, childHash); + } else { + logger.warning("Unable to calculate a " + hashtype + " for " + dataUrl); } - if (child.has(JsonLDTerm.filesize.getLabel())) { - Long size = child.get(JsonLDTerm.filesize.getLabel()).getAsLong(); - totalDataSize += size; - if (size > maxFileSize) { - maxFileSize = size; - } + } else { + // Hash already exists, add to checksumMap + if (checksumMap.containsValue(childHash)) { + logger.warning("Duplicate/Collision: " + child.get("@id").getAsString() + + " has hash: " + childHash + " in: " + bagID); } - if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { - mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); + logger.fine("Adding " + childPath + " with hash " + childHash + " to checksumMap"); + checksumMap.put(childPath, childHash); + } + // Add file to bag or fetch file + if (!addToZip(entry.size)) { + if(createHoleyBag) { + logger.fine("Adding to fetch file: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + addToFetchFile(dataUrl, entry.size, childPath); + usingFetchFile = true; + } else { + // Add to list for archiver to retrieve + oversizedFiles.add(entry); + logger.fine("Adding " + childPath + " to oversized files list for archiver"); } - - } catch (Exception e) { - resourceUsed[index] = false; - e.printStackTrace(); - throw new IOException("Unable to create bag"); + } else { + logger.fine("Requesting: " + childPath + " from " + dataUrl + + " (size: " + entry.size + " bytes)"); + createFileFromURL(childPath, dataUrl); + currentBagDataSize += entry.size; + } + + dataCount++; + if (dataCount % 1000 == 0) { + logger.info("Retrieval in progress: " + dataCount + " files retrieved"); + } + + totalDataSize += entry.size; + if (entry.size > maxFileSize) { + maxFileSize = entry.size; + } + + if (child.has(JsonLDTerm.schemaOrg("fileFormat").getLabel())) { + mimetypes.add(child.get(JsonLDTerm.schemaOrg("fileFormat").getLabel()).getAsString()); } - // Check for nulls! - pidMap.put(child.get("@id").getAsString(), childPath); - + } catch (Exception e) { + resourceUsed[entry.resourceIndex] = false; + e.printStackTrace(); + throw new IOException("Unable to create bag"); } + + pidMap.put(child.get("@id").getAsString(), childPath); + } + } + + // Helper method to determine if file should go to fetch file + private boolean addToZip(long fileSize) { + + // Check individual file size limit + if (fileSize > maxDataFileSize) { + logger.fine("File size " + fileSize + " exceeds max data file size " + maxDataFileSize); + return false; + } + + // Check total bag size limit + if (currentBagDataSize + fileSize > maxTotalDataSize) { + logger.fine("Adding file would exceed max total data size. 
Current: " + currentBagDataSize + + ", File: " + fileSize + ", Max: " + maxTotalDataSize); + return false; + } + + return true; + } + + // Method to append to fetch file content + private void addToFetchFile(String url, long size, String filename) { + // Format: URL size filename + fetchFileContent.append(url).append(" ").append(Long.toString(size)).append(" ").append(filename).append("\r\n"); + } + + // Method to write fetch file to bag (call this before finalizing the bag) + private void writeFetchFile() throws IOException, ExecutionException, InterruptedException { + if (usingFetchFile && fetchFileContent.length() > 0) { + logger.info("Creating fetch.txt file for holey bag"); + createFileFromString("fetch.txt", fetchFileContent.toString()); } } @@ -705,9 +855,7 @@ private void createFileFromURL(final String relPath, final String uri) private void checkFiles(HashMap shaMap, File bagFile) { ExecutorService executor = Executors.newFixedThreadPool(numConnections); - ZipFile zf = null; - try { - zf = new ZipFile(bagFile); + try (ZipFile zf = ZipFile.builder().setFile(bagFile).get()) { BagValidationJob.setZipFile(zf); BagValidationJob.setBagGenerator(this); @@ -730,12 +878,9 @@ private void checkFiles(HashMap shaMap, File bagFile) { } } catch (InterruptedException e) { logger.log(Level.SEVERE, "Hash Calculations interrupted", e); - } + } } catch (IOException e1) { - // TODO Auto-generated catch block e1.printStackTrace(); - } finally { - IOUtils.closeQuietly(zf); } logger.fine("Hash Validations Completed"); @@ -764,53 +909,51 @@ private String generateInfoFile() { logger.fine("Generating info file"); StringBuffer info = new StringBuffer(); - JsonArray contactsArray = new JsonArray(); - /* Contact, and it's subfields, are terms from citation.tsv whose mapping to a formal vocabulary and label in the oremap may change - * so we need to find the labels used. - */ + /* + * Contact, and it's subfields, are terms from citation.tsv whose mapping to a + * formal vocabulary and label in the oremap may change so we need to find the + * labels used. 
+ */ JsonLDTerm contactTerm = oremap.getContactTerm(); if ((contactTerm != null) && aggregation.has(contactTerm.getLabel())) { JsonElement contacts = aggregation.get(contactTerm.getLabel()); JsonLDTerm contactNameTerm = oremap.getContactNameTerm(); JsonLDTerm contactEmailTerm = oremap.getContactEmailTerm(); - + if (contacts.isJsonArray()) { + JsonArray contactsArray = contacts.getAsJsonArray(); for (int i = 0; i < contactsArray.size(); i++) { - info.append("Contact-Name: "); + JsonElement person = contactsArray.get(i); if (person.isJsonPrimitive()) { - info.append(person.getAsString()); + info.append(multilineWrap(CONTACT_NAME + person.getAsString())); info.append(CRLF); } else { - if(contactNameTerm != null) { - info.append(((JsonObject) person).get(contactNameTerm.getLabel()).getAsString()); - info.append(CRLF); + if (contactNameTerm != null) { + info.append(multilineWrap(CONTACT_NAME + ((JsonObject) person).get(contactNameTerm.getLabel()).getAsString())); + info.append(CRLF); } - if ((contactEmailTerm!=null) &&((JsonObject) person).has(contactEmailTerm.getLabel())) { - info.append("Contact-Email: "); - info.append(((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString()); + if ((contactEmailTerm != null) && ((JsonObject) person).has(contactEmailTerm.getLabel())) { + info.append(multilineWrap(CONTACT_EMAIL + ((JsonObject) person).get(contactEmailTerm.getLabel()).getAsString())); info.append(CRLF); } } } } else { - info.append("Contact-Name: "); - if (contacts.isJsonPrimitive()) { - info.append((String) contacts.getAsString()); + info.append(multilineWrap(CONTACT_NAME + (String) contacts.getAsString())); info.append(CRLF); } else { JsonObject person = contacts.getAsJsonObject(); - if(contactNameTerm != null) { - info.append(person.get(contactNameTerm.getLabel()).getAsString()); - info.append(CRLF); + if (contactNameTerm != null) { + info.append(multilineWrap(CONTACT_NAME + person.get(contactNameTerm.getLabel()).getAsString())); + info.append(CRLF); } - if ((contactEmailTerm!=null) && (person.has(contactEmailTerm.getLabel()))) { - info.append("Contact-Email: "); - info.append(person.get(contactEmailTerm.getLabel()).getAsString()); + if ((contactEmailTerm != null) && (person.has(contactEmailTerm.getLabel()))) { + info.append(multilineWrap(CONTACT_EMAIL + person.get(contactEmailTerm.getLabel()).getAsString())); info.append(CRLF); } } @@ -820,88 +963,222 @@ private String generateInfoFile() { logger.warning("No contact info available for BagIt Info file"); } - String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class).orElse("Dataverse Installation ()"); + String orgName = JvmSettings.BAGIT_SOURCE_ORG_NAME.lookupOptional(String.class) + .orElse("Dataverse Installation ()"); String orgAddress = JvmSettings.BAGIT_SOURCEORG_ADDRESS.lookupOptional(String.class).orElse(""); String orgEmail = JvmSettings.BAGIT_SOURCEORG_EMAIL.lookupOptional(String.class).orElse(""); - info.append("Source-Organization: " + orgName); + info.append(multilineWrap(SOURCE_ORGANIZATION + orgName)); // ToDo - make configurable info.append(CRLF); - info.append("Organization-Address: " + WordUtils.wrap(orgAddress, 78, CRLF + " ", true)); + info.append(multilineWrap(ORGANIZATION_ADDRESS + orgAddress)); info.append(CRLF); // Not a BagIt standard name - info.append("Organization-Email: " + orgEmail); + info.append(multilineWrap(ORGANIZATION_EMAIL + orgEmail)); info.append(CRLF); - info.append("External-Description: "); - - /* Description, and it's subfields, are terms from citation.tsv whose 
mapping to a formal vocabulary and label in the oremap may change - * so we need to find the labels used. + /* + * Description, and it's subfields, are terms from citation.tsv whose mapping to + * a formal vocabulary and label in the oremap may change so we need to find the + * labels used. */ JsonLDTerm descriptionTerm = oremap.getDescriptionTerm(); JsonLDTerm descriptionTextTerm = oremap.getDescriptionTextTerm(); if (descriptionTerm == null) { logger.warning("No description available for BagIt Info file"); } else { - info.append( - // FixMe - handle description having subfields better - WordUtils.wrap(getSingleValue(aggregation.get(descriptionTerm.getLabel()), - descriptionTextTerm.getLabel()), 78, CRLF + " ", true)); + info.append(multilineWrap(EXTERNAL_DESCRIPTION + + getSingleValue(aggregation.get(descriptionTerm.getLabel()), descriptionTextTerm.getLabel()))); info.append(CRLF); } - info.append("Bagging-Date: "); + info.append(BAGGING_DATE); info.append((new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime()))); info.append(CRLF); - info.append("External-Identifier: "); - info.append(aggregation.get("@id").getAsString()); + info.append(multilineWrap(EXTERNAL_IDENTIFIER + aggregation.get("@id").getAsString())); info.append(CRLF); - info.append("Bag-Size: "); + info.append(BAG_SIZE); info.append(byteCountToDisplaySize(totalDataSize)); info.append(CRLF); - info.append("Payload-Oxum: "); + info.append(PAYLOAD_OXUM); info.append(Long.toString(totalDataSize)); info.append("."); info.append(Long.toString(dataCount)); info.append(CRLF); - info.append("Internal-Sender-Identifier: "); String catalog = orgName + " Catalog"; if (aggregation.has(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel())) { catalog = aggregation.get(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel()).getAsString(); } - info.append(catalog + ":" + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString()); + info.append(multilineWrap(INTERNAL_SENDER_IDENTIFIER + catalog + ":" + + aggregation.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString())); info.append(CRLF); + // Add a version number for our bag type - should be updated with any change to + // the bag content/structure + info.append(DATAVERSE_BAG_VERSION + "1.0"); + info.append(CRLF); return info.toString(); } + static private String multilineWrap(String value) { + // Normalize line breaks and ensure all lines after the first are indented + String[] lines = value.split("\\r?\\n"); + StringBuilder wrappedValue = new StringBuilder(); + for (int i = 0; i < lines.length; i++) { + // Skip empty lines - RFC8493 (section 7.3) doesn't allow truly empty lines, + // While trailing whitespace or whitespace-only lines appear to be allowed, it's + // not clear that handling them adds value (visually identical entries in + // Dataverse could result in entries w/ or w/o extra lines in the bag-info.txt + // file + String line = lines[i].trim(); + if (line.length() > 0) { + // Recommended line length, including the label or indents is 79 + String wrapped = lineWrap(line, 79, CRLF + " ", true); + wrappedValue.append(wrapped); + if (i < lines.length - 1) { + wrappedValue.append(CRLF).append(" "); + } + } + } + return wrappedValue.toString(); + } + + /** Adapted from Apache WordUtils.wrap() - make subsequent lines shorter by the length of any spaces in newLineStr*/ + public static String lineWrap(final String str, int wrapLength, String newLineStr, final boolean wrapLongWords) { + if (str == null) { + return null; + } + if (newLineStr 
== null) { + newLineStr = System.lineSeparator(); + } + if (wrapLength < 1) { + wrapLength = 1; + } + + // Calculate the indent length (characters after CRLF in newLineStr) + int indentLength = 0; + int crlfIndex = newLineStr.lastIndexOf("\n"); + if (crlfIndex != -1) { + indentLength = newLineStr.length() - crlfIndex -1; + } + + String wrapOn = " "; + final Pattern patternToWrapOn = Pattern.compile(wrapOn); + final int inputLineLength = str.length(); + int offset = 0; + final StringBuilder wrappedLine = new StringBuilder(inputLineLength + 32); + int matcherSize = -1; + boolean isFirstLine = true; + + while (offset < inputLineLength) { + // Adjust wrap length based on whether this is the first line or subsequent + // lines + int currentWrapLength = isFirstLine ? wrapLength : (wrapLength - indentLength); + + int spaceToWrapAt = -1; + Matcher matcher = patternToWrapOn.matcher(str.substring(offset, + Math.min((int) Math.min(Integer.MAX_VALUE, offset + currentWrapLength + 1L), inputLineLength))); + if (matcher.find()) { + if (matcher.start() == 0) { + matcherSize = matcher.end(); + if (matcherSize != 0) { + offset += matcher.end(); + continue; + } + offset += 1; + } + spaceToWrapAt = matcher.start() + offset; + } + + // only last line without leading spaces is left + if (inputLineLength - offset <= currentWrapLength) { + break; + } + + while (matcher.find()) { + spaceToWrapAt = matcher.start() + offset; + } + + if (spaceToWrapAt >= offset) { + // normal case + wrappedLine.append(str, offset, spaceToWrapAt); + wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + isFirstLine = false; + + } else // really long word or URL + if (wrapLongWords) { + if (matcherSize == 0) { + offset--; + } + // wrap really long word one line at a time + wrappedLine.append(str, offset, currentWrapLength + offset); + wrappedLine.append(newLineStr); + offset += currentWrapLength; + matcherSize = -1; + isFirstLine = false; + } else { + // do not wrap really long word, just extend beyond limit + matcher = patternToWrapOn.matcher(str.substring(offset + currentWrapLength)); + if (matcher.find()) { + matcherSize = matcher.end() - matcher.start(); + spaceToWrapAt = matcher.start() + offset + currentWrapLength; + } + + if (spaceToWrapAt >= 0) { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, spaceToWrapAt); + wrappedLine.append(newLineStr); + offset = spaceToWrapAt + 1; + isFirstLine = false; + } else { + if (matcherSize == 0 && offset != 0) { + offset--; + } + wrappedLine.append(str, offset, str.length()); + offset = inputLineLength; + matcherSize = -1; + } + } + } + + if (matcherSize == 0 && offset < inputLineLength) { + offset--; + } + + // Whatever is left in line is short enough to just pass through + wrappedLine.append(str, offset, str.length()); + + return wrappedLine.toString(); + } + /** - * Kludge - compound values (e.g. for descriptions) are sent as an array of + * Compound values (e.g. for descriptions) are sent as an array of * objects containing key/values whereas a single value is sent as one object. * For cases where multiple values are sent, create a concatenated string so * that information is not lost. 
* - * @param jsonElement - * - the root json object - * @param key - * - the key to find a value(s) for + * @param jsonElement - the root json object + * @param key - the key to find a value(s) for * @return - a single string */ String getSingleValue(JsonElement jsonElement, String key) { String val = ""; - if(jsonElement.isJsonObject()) { - JsonObject jsonObject=jsonElement.getAsJsonObject(); + if (jsonElement.isJsonObject()) { + JsonObject jsonObject = jsonElement.getAsJsonObject(); val = jsonObject.get(key).getAsString(); } else if (jsonElement.isJsonArray()) { - + Iterator iter = jsonElement.getAsJsonArray().iterator(); ArrayList stringArray = new ArrayList(); while (iter.hasNext()) { @@ -949,6 +1226,7 @@ private static JsonArray getChildren(JsonObject parent) { // Logic to decide if this is a container - // first check for children, then check for source-specific type indicators + // Dataverse does not currently use containers - this is for other variants/future use private static boolean childIsContainer(JsonObject item) { if (getChildren(item).size() != 0) { return true; @@ -994,10 +1272,8 @@ private HttpGet createNewGetRequest(URI url, String returnType) { urlString = urlString + ((urlString.indexOf('?') != -1) ? "&key=" : "?key=") + apiKey; request = new HttpGet(new URI(urlString)); } catch (MalformedURLException e) { - // TODO Auto-generated catch block e.printStackTrace(); } catch (URISyntaxException e) { - // TODO Auto-generated catch block e.printStackTrace(); } } else { @@ -1009,75 +1285,114 @@ private HttpGet createNewGetRequest(URI url, String returnType) { return request; } - InputStreamSupplier getInputStreamSupplier(final String uriString) { + /** Get a stream supplier for the given URI. + * + * Caller must close the stream when done. + */ + public InputStreamSupplier getInputStreamSupplier(final String uriString) { return new InputStreamSupplier() { public InputStream get() { try { URI uri = new URI(uriString); - int tries = 0; while (tries < 5) { - logger.fine("Get # " + tries + " for " + uriString); + logger.finest("Get # " + tries + " for " + uriString); HttpGet getFile = createNewGetRequest(uri, null); - logger.finest("Retrieving " + tries + ": " + uriString); - CloseableHttpResponse response = null; + try { - response = client.execute(getFile); - // Note - if we ever need to pass an HttpClientContext, we need a new one per - // thread. 
- int statusCode = response.getStatusLine().getStatusCode(); + // Execute the request directly and keep the response open + final CloseableHttpResponse response = (CloseableHttpResponse) client.executeOpen(null, getFile, HttpClientContext.create()); + int statusCode = response.getCode(); + if (statusCode == 200) { logger.finest("Retrieved: " + uri); - return response.getEntity().getContent(); - } - logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString - + " : " + statusCode); - if (statusCode < 500) { - logger.fine("Will not retry for 40x errors"); - tries += 5; + // Return a wrapped stream that will close the response when the stream is closed + final HttpEntity entity = response.getEntity(); + if (entity != null) { + // Create a wrapper stream that closes the response when the stream is closed + return new FilterInputStream(entity.getContent()) { + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + response.close(); + } + } + }; + } else { + response.close(); + logger.warning("No content in response for: " + uriString); + return null; + } } else { + // Close the response for non-200 responses + response.close(); + + logger.warning("Attempt: " + tries + " - Unexpected Status when retrieving " + uriString + + " : " + statusCode); tries++; - } - // Error handling - if (response != null) { try { - EntityUtils.consumeQuietly(response.getEntity()); - response.close(); - } catch (IOException io) { - logger.warning( - "Exception closing response after status: " + statusCode + " on " + uri); + // Calculate exponential backoff: 2^tries * baseWaitTimeMs (1 sec) + long waitTime = (long) (Math.pow(2, tries) * baseWaitTimeMs); + + // Add jitter: random value between 0-30% of the wait time + long jitter = (long) (waitTime * 0.3 * Math.random()); + waitTime = waitTime + jitter; + + // Cap the wait time at maxWaitTimeMs (30 seconds) + waitTime = Math.min(waitTime, maxWaitTimeMs); + + logger.fine("Sleeping for " + waitTime + "ms before retry attempt " + tries); + Thread.sleep(waitTime); + } catch (InterruptedException ie) { + logger.log(Level.SEVERE, "InterruptedException during retry delay for file: " + uriString, ie); + Thread.currentThread().interrupt(); // Restore interrupt status + tries += 5; // Skip remaining attempts } } } catch (ClientProtocolException e) { tries += 5; - // TODO Auto-generated catch block - e.printStackTrace(); + logger.log(Level.SEVERE, "ClientProtocolException when retrieving file: " + uriString + " (attempt " + tries + ")", e); + } catch (SocketTimeoutException e) { + // Specific handling for timeout exceptions + tries++; + logger.log(Level.SEVERE, "SocketTimeoutException when retrieving file: " + uriString + " (attempt " + tries + " of 5) - Request exceeded timeout", e); + if (tries == 5) { + logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries due to timeouts: " + uriString, e); + } + } catch (InterruptedIOException e) { + // Catches interruptions during I/O operations + tries += 5; + logger.log(Level.SEVERE, "InterruptedIOException when retrieving file: " + uriString + " - Operation was interrupted", e); + Thread.currentThread().interrupt(); // Restore interrupt status } catch (IOException e) { - // Retry if this is a potentially temporary error such - // as a timeout + // Retry if this is a potentially temporary error such as a timeout tries++; - logger.log(Level.WARNING, "Attempt# " + tries + " : Unable to retrieve file: " + uriString, - e); + 
logger.log(Level.WARNING, "IOException when retrieving file: " + uriString + " (attempt " + tries + " of 5)", e); if (tries == 5) { - logger.severe("Final attempt failed for " + uriString); + logger.log(Level.SEVERE, "FINAL FAILURE: File could not be retrieved after all retries: " + uriString, e); } - e.printStackTrace(); } } - } catch (URISyntaxException e) { - // TODO Auto-generated catch block - e.printStackTrace(); + logger.log(Level.SEVERE, "URISyntaxException for file: " + uriString + " - Invalid URI format", e); } - logger.severe("Could not read: " + uriString); + logger.severe("FAILED TO RETRIEVE FILE after all retries: " + uriString); return null; } }; } + + + public List getOversizedFiles() { + return oversizedFiles; + } + /** * Adapted from org/apache/commons/io/FileUtils.java change to SI - add 2 digits * of precision @@ -1101,8 +1416,7 @@ public InputStream get() { * Returns a human-readable version of the file size, where the input represents * a specific number of bytes. * - * @param size - * the number of bytes + * @param size the number of bytes * @return a human-readable display value (includes units) */ public static String byteCountToDisplaySize(long size) { @@ -1124,9 +1438,56 @@ public void setAuthenticationKey(String tokenString) { apiKey = tokenString; } - public void setNumConnections(int numConnections) { - this.numConnections = numConnections; - logger.fine("BagGenerator will use " + numConnections + " threads"); + public static void setNumConnections(int numConnections) { + BagGenerator.numConnections = numConnections; + logger.fine("All BagGenerators will use " + numConnections + " threads"); } + + // Inner class to hold file information before processing + public static class FileEntry implements Comparable { + final long size; + final JsonObject jsonObject; // Direct reference, not a copy + final String currentPath; // Parent directory path + final int resourceIndex; // Still need this for resourceUsed tracking + + FileEntry(long size, JsonObject jsonObject, String currentPath, int resourceIndex) { + this.size = size; + this.jsonObject = jsonObject; + this.currentPath = currentPath; + this.resourceIndex = resourceIndex; + } + + public String getDataUrl() { + return suppressDownloadCounts(jsonObject.get(JsonLDTerm.schemaOrg("sameAs").getLabel()).getAsString()); + } + + public String getChildTitle() { + return jsonObject.get(JsonLDTerm.schemaOrg("name").getLabel()).getAsString(); + } + + public String getChildPath(String title) { + // Build full path using stored currentPath + String childPath = currentPath + title; + JsonElement directoryLabel = jsonObject.get(JsonLDTerm.DVCore("directoryLabel").getLabel()); + if (directoryLabel != null) { + childPath = currentPath + directoryLabel.getAsString() + "/" + title; + } + return childPath; + } + private String suppressDownloadCounts(String uriString) { + // Adding gbrecs to suppress counting this access as a download (archiving is + // not a download indicating scientific use) + return uriString + (uriString.contains("?") ? 
"&" : "?") + "gbrecs=true"; + } + + @Override + public int compareTo(FileEntry other) { + return Long.compare(this.size, other.size); + } + + public long getSize() { + return size; + } + } } \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java index 4cbc2aa7b9a..0d99a5bddd1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/bagit/OREMap.java @@ -49,7 +49,7 @@ public class OREMap { public static final String NAME = "OREMap"; //NOTE: Update this value whenever the output of this class is changed - private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.1"; + private static final String DATAVERSE_ORE_FORMAT_VERSION = "Dataverse OREMap Format v1.0.2"; //v1.0.1 - added versionNote private static final String DATAVERSE_SOFTWARE_NAME = "Dataverse"; private static final String DATAVERSE_SOFTWARE_URL = "https://github.com/iqss/dataverse"; @@ -130,7 +130,8 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) { if(vs.equals(VersionState.DEACCESSIONED)) { JsonObjectBuilder deaccBuilder = Json.createObjectBuilder(); deaccBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), vs.name()); - deaccBuilder.add(JsonLDTerm.DVCore("reason").getLabel(), version.getDeaccessionNote()); + // Reason is supposed to not be null, but historically this has not been enforced (in the API) + addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("reason"), version.getDeaccessionNote()); addIfNotNull(deaccBuilder, JsonLDTerm.DVCore("forwardUrl"), version.getDeaccessionLink()); aggBuilder.add(JsonLDTerm.schemaOrg("creativeWorkStatus").getLabel(), deaccBuilder); @@ -280,7 +281,7 @@ public JsonObjectBuilder getOREMapBuilder(boolean aggregationOnly) { JsonObject checksum = null; // Add checksum. 
RDA recommends SHA-512 if (df.getChecksumType() != null && df.getChecksumValue() != null) { - checksum = Json.createObjectBuilder().add("@type", df.getChecksumType().toString()) + checksum = Json.createObjectBuilder().add("@type", df.getChecksumType().toUri()) .add("@value", df.getChecksumValue()).build(); aggRes.add(JsonLDTerm.checksum.getLabel(), checksum); } @@ -505,11 +506,16 @@ private static void addCvocValue(String val, JsonArrayBuilder vals, JsonObject c for (String prefix : context.keySet()) { localContext.putIfAbsent(prefix, context.getString(prefix)); } - JsonObjectBuilder job = Json.createObjectBuilder(datasetFieldService.getExternalVocabularyValue(val)); - job.add("@id", val); - JsonObject extVal = job.build(); - logger.fine("Adding: " + extVal); - vals.add(extVal); + JsonObject cachedValue = datasetFieldService.getExternalVocabularyValue(val); + if (cachedValue != null) { + JsonObjectBuilder job = Json.createObjectBuilder(cachedValue); + job.add("@id", val); + JsonObject extVal = job.build(); + logger.fine("Adding: " + extVal); + vals.add(extVal); + } else { + vals.add(val); + } } else { vals.add(val); } diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java new file mode 100644 index 00000000000..dbbf3241318 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorInfoFileTest.java @@ -0,0 +1,295 @@ + +package edu.harvard.iq.dataverse.util.bagit; + +import edu.harvard.iq.dataverse.util.json.JsonLDTerm; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +import com.google.gson.JsonParser; + +import jakarta.json.Json; +import jakarta.json.JsonArrayBuilder; +import jakarta.json.JsonObject; +import jakarta.json.JsonObjectBuilder; + +import java.lang.reflect.Field; +import java.lang.reflect.Method; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +public class BagGeneratorInfoFileTest { + + private BagGenerator bagGenerator; + private JsonObjectBuilder testAggregationBuilder; + + @Mock + private OREMap mockOreMap; + + @BeforeEach + public void setUp() throws Exception { + MockitoAnnotations.openMocks(this); + + // Create base test aggregation builder with required fields + testAggregationBuilder = Json.createObjectBuilder(); + testAggregationBuilder.add("@id", "doi:10.5072/FK2/TEST123"); + testAggregationBuilder.add(JsonLDTerm.schemaOrg("name").getLabel(), "Test Dataset"); + testAggregationBuilder.add(JsonLDTerm.schemaOrg("includedInDataCatalog").getLabel(), "Test Catalog"); + } + + /** + * Helper method to finalize the aggregation and create the BagGenerator + */ + private void initializeBagGenerator() throws Exception { + JsonObject testAggregation = testAggregationBuilder.build(); + + JsonObjectBuilder oremapJsonBuilder = Json.createObjectBuilder(); + oremapJsonBuilder.add(JsonLDTerm.ore("describes").getLabel(), testAggregation); + JsonObject oremapObject = oremapJsonBuilder.build(); + // Mock the OREMap.getOREMap() method to return the built JSON + when(mockOreMap.getOREMap()).thenReturn(oremapObject); + + // Initialize BagGenerator with test data + bagGenerator = new BagGenerator(mockOreMap, ""); + setPrivateField(bagGenerator, "aggregation", (com.google.gson.JsonObject) JsonParser + .parseString(oremapObject.getJsonObject(JsonLDTerm.ore("describes").getLabel()).toString())); + 
setPrivateField(bagGenerator, "totalDataSize", 1024000L); + setPrivateField(bagGenerator, "dataCount", 10L); + } + + @Test + public void testGenerateInfoFileWithSingleContact() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm contactEmailTerm = JsonLDTerm.schemaOrg("email"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(contactEmailTerm); + + JsonObjectBuilder contactBuilder = Json.createObjectBuilder(); + contactBuilder.add(contactNameTerm.getLabel(), "John Doe"); + contactBuilder.add(contactEmailTerm.getLabel(), "john.doe@example.com"); + testAggregationBuilder.add(contactTerm.getLabel(), contactBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: John Doe")); + assertTrue(infoFile.contains("Contact-Email: john.doe@example.com")); + } + + @Test + public void testGenerateInfoFileWithMultipleContacts() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm contactEmailTerm = JsonLDTerm.schemaOrg("email"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(contactEmailTerm); + + JsonArrayBuilder contactsBuilder = Json.createArrayBuilder(); + + JsonObjectBuilder contact1 = Json.createObjectBuilder(); + contact1.add(contactNameTerm.getLabel(), "John Doe"); + contact1.add(contactEmailTerm.getLabel(), "john.doe@example.com"); + + JsonObjectBuilder contact2 = Json.createObjectBuilder(); + contact2.add(contactNameTerm.getLabel(), "Jane Smith"); + contact2.add(contactEmailTerm.getLabel(), "jane.smith@example.com"); + + JsonObjectBuilder contact3 = Json.createObjectBuilder(); + contact3.add(contactNameTerm.getLabel(), "Bob Johnson"); + contact3.add(contactEmailTerm.getLabel(), "bob.johnson@example.com"); + + contactsBuilder.add(contact1); + contactsBuilder.add(contact2); + contactsBuilder.add(contact3); + + testAggregationBuilder.add(contactTerm.getLabel(), contactsBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: John Doe")); + assertTrue(infoFile.contains("Contact-Email: john.doe@example.com")); + assertTrue(infoFile.contains("Contact-Name: Jane Smith")); + assertTrue(infoFile.contains("Contact-Email: jane.smith@example.com")); + assertTrue(infoFile.contains("Contact-Name: Bob Johnson")); + assertTrue(infoFile.contains("Contact-Email: bob.johnson@example.com")); + } + + @Test + public void testGenerateInfoFileWithSingleDescription() throws Exception { + // Arrange + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonObjectBuilder descriptionBuilder = Json.createObjectBuilder(); + descriptionBuilder.add(descriptionTextTerm.getLabel(), "This is a test dataset description."); + 
testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("External-Description: This is a test dataset description.")); + } + + @Test + public void testGenerateInfoFileWithMultipleDescriptions() throws Exception { + // Arrange + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonArrayBuilder descriptionsBuilder = Json.createArrayBuilder(); + + JsonObjectBuilder desc1 = Json.createObjectBuilder(); + desc1.add(descriptionTextTerm.getLabel(), "First description of the dataset."); + + JsonObjectBuilder desc2 = Json.createObjectBuilder(); + desc2.add(descriptionTextTerm.getLabel(), "Second description with additional details."); + + JsonObjectBuilder desc3 = Json.createObjectBuilder(); + desc3.add(descriptionTextTerm.getLabel(), "Third description for completeness."); + + descriptionsBuilder.add(desc1); + descriptionsBuilder.add(desc2); + descriptionsBuilder.add(desc3); + + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionsBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + // Assert + assertNotNull(infoFile); + // Multiple descriptions should be concatenated with commas as per getSingleValue method + assertTrue(infoFile.contains("External-Description: First description of the dataset.,Second description with\r\n additional details.,Third description for completeness.")); + } + + @Test + public void testGenerateInfoFileWithRequiredFields() throws Exception { + // Arrange - minimal setup with required fields already in setUp() + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + JsonLDTerm contactNameTerm = JsonLDTerm.schemaOrg("name"); + JsonLDTerm descriptionTerm = JsonLDTerm.schemaOrg("description"); + JsonLDTerm descriptionTextTerm = JsonLDTerm.schemaOrg("value"); + + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(contactNameTerm); + when(mockOreMap.getContactEmailTerm()).thenReturn(null); + when(mockOreMap.getDescriptionTerm()).thenReturn(descriptionTerm); + when(mockOreMap.getDescriptionTextTerm()).thenReturn(descriptionTextTerm); + + JsonObjectBuilder contactBuilder = Json.createObjectBuilder(); + contactBuilder.add(contactNameTerm.getLabel(), "Test Contact"); + testAggregationBuilder.add(contactTerm.getLabel(), contactBuilder); + + JsonObjectBuilder descriptionBuilder = Json.createObjectBuilder(); + descriptionBuilder.add(descriptionTextTerm.getLabel(), "Test description"); + testAggregationBuilder.add(descriptionTerm.getLabel(), descriptionBuilder); + + initializeBagGenerator(); + + // Act + String infoFile = invokeGenerateInfoFile(); + + // Assert + assertNotNull(infoFile); + assertTrue(infoFile.contains("Contact-Name: Test Contact")); + assertTrue(infoFile.contains("External-Description: Test description")); + assertTrue(infoFile.contains("Source-Organization:")); + assertTrue(infoFile.contains("Organization-Address:")); + assertTrue(infoFile.contains("Organization-Email:")); + assertTrue(infoFile.contains("Bagging-Date:")); + assertTrue(infoFile.contains("External-Identifier: doi:10.5072/FK2/TEST123")); + 
assertTrue(infoFile.contains("Bag-Size:")); + assertTrue(infoFile.contains("Payload-Oxum: 1024000.10")); + assertTrue(infoFile.contains("Internal-Sender-Identifier: Test Catalog:Test Dataset")); + } + + @Test + public void testGenerateInfoFileWithDifferentBagSizes() throws Exception { + // Arrange + JsonLDTerm contactTerm = JsonLDTerm.schemaOrg("creator"); + when(mockOreMap.getContactTerm()).thenReturn(contactTerm); + when(mockOreMap.getContactNameTerm()).thenReturn(null); + when(mockOreMap.getContactEmailTerm()).thenReturn(null); + when(mockOreMap.getDescriptionTerm()).thenReturn(null); + + initializeBagGenerator(); + + // Test with bytes + setPrivateField(bagGenerator, "totalDataSize", 512L); + setPrivateField(bagGenerator, "dataCount", 5L); + String infoFile1 = invokeGenerateInfoFile(); + assertTrue(infoFile1.contains("Bag-Size: 512 bytes")); + assertTrue(infoFile1.contains("Payload-Oxum: 512.5")); + + // Test with KB + setPrivateField(bagGenerator, "totalDataSize", 2048L); + setPrivateField(bagGenerator, "dataCount", 3L); + String infoFile2 = invokeGenerateInfoFile(); + assertTrue(infoFile2.contains("Bag-Size: 2.05 KB")); + assertTrue(infoFile2.contains("Payload-Oxum: 2048.3")); + + // Test with MB + setPrivateField(bagGenerator, "totalDataSize", 5242880L); + setPrivateField(bagGenerator, "dataCount", 100L); + String infoFile3 = invokeGenerateInfoFile(); + assertTrue(infoFile3.contains("Bag-Size: 5.24 MB")); + assertTrue(infoFile3.contains("Payload-Oxum: 5242880.100")); + + // Test with GB + setPrivateField(bagGenerator, "totalDataSize", 2147483648L); + setPrivateField(bagGenerator, "dataCount", 1000L); + + String infoFile4 = invokeGenerateInfoFile(); + assertTrue(infoFile4.contains("Bag-Size: 2.15 GB")); + assertTrue(infoFile4.contains("Payload-Oxum: 2147483648.1000")); + } + + // Helper methods + + /** + * Invokes the private generateInfoFile method using reflection + */ + private String invokeGenerateInfoFile() throws Exception { + Method method = BagGenerator.class.getDeclaredMethod("generateInfoFile"); + method.setAccessible(true); + return (String) method.invoke(bagGenerator); + } + + /** + * Sets a private field value using reflection + */ + private void setPrivateField(Object target, String fieldName, Object value) throws Exception { + Field field = BagGenerator.class.getDeclaredField(fieldName); + field.setAccessible(true); + field.set(target, value); + } +} \ No newline at end of file diff --git a/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java new file mode 100644 index 00000000000..19d478f4b0d --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/util/bagit/BagGeneratorMultilineWrapTest.java @@ -0,0 +1,160 @@ + +package edu.harvard.iq.dataverse.util.bagit; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * Tests adapted for DD-2093: verify the behavior of BagGenerator.multilineWrap. 
+ */ +public class BagGeneratorMultilineWrapTest { + + private static Method multilineWrap; + + @BeforeAll + static void setUp() throws NoSuchMethodException { + // Access the private static method via reflection + multilineWrap = BagGenerator.class.getDeclaredMethod("multilineWrap", String.class); + multilineWrap.setAccessible(true); + } + + private String callMultilineWrap(String input) { + try { + return (String) multilineWrap.invoke(null, input); + } catch (IllegalAccessException | InvocationTargetException e) { + throw new RuntimeException(e); + } + } + + @Test + void shortLine_noWrap() { + String input = "Hello world"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo("Hello world"); + } + + @Test + void exactBoundary_78chars_noWrap() { + String input = "a".repeat(78); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(input); + } + + @Test + void longSingleWord_wrapsAt78WithIndent() { + String input = "a".repeat(100); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_input_indentsSecondAndSubsequentOriginalLines() { + String input = "Line1\nLine2\nLine3"; + String expected = "Line1\r\n Line2\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void multiline_withLF_normalizedAndIndented() { + String input = "a".repeat(200); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(78) + "\r\n " + "a".repeat(43); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void emptyLines_trimmedAndSkipped() { + String input = "Line1\n\nLine3"; + String expected = "Line1\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void whitespaceOnlyLines_ignored() { + String input = "Line1\n \n\t\t\nLine3"; + String expected = "Line1\r\n Line3"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void longSecondLine_preservesIndentOnWraps() { + String line1 = "Header"; + String line2 = "b".repeat(90); + String input = line1 + "\n" + line2; + String expected = "Header\r\n " + "b".repeat(79) + "\r\n " + "b".repeat(11); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_reducesFirstLineMaxLength() { + // With a label of length 20, first line should wrap at 78-20=58 chars + String label = "l".repeat(20); + String input = label + "a".repeat(150); + // First line: 58 chars, subsequent lines: 78 + String expected = label + "a".repeat(59) + "\r\n " + "a".repeat(78) + "\r\n " + "a".repeat(13); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_zero_behavesAsDefault() { + String input = "a".repeat(100); + String expected = "a".repeat(79) + "\r\n " + "a".repeat(21); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void labelLength_withMultipleLines_onlyAffectsFirstLine() { + String label = "l".repeat(15); + String input = label + "a".repeat(100) + "\nSecond line content"; + // First line wraps at 79-15=64, then continues at 78 per line + // Second line starts fresh and wraps normally + String expected = label + "a".repeat(64) + "\r\n " + "a".repeat(36) + "\r\n Second line content"; + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void 
wrapsAtWordBoundary_notMidWord() { + // Create a string with a word boundary at position 75 + // "a" repeated 75 times, then a space, then more characters + String input = "a".repeat(75) + " " + "b".repeat(20); + // Should wrap at the space (position 75), not at position 79 + String expected = "a".repeat(75) + "\r\n " + "b".repeat(20); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_multipleSpaces() { + // Test with word boundary closer to the limit + String input = "a".repeat(70) + " word " + "b".repeat(20); + // Should wrap after "word" (at position 76) + String expected = "a".repeat(70) + " word\r\n " + "b".repeat(20); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } + + @Test + void wrapsAtWordBoundary_withLabelLength() { + String label = "l".repeat(20); + // With label length=20, first line wraps at 78-20=58 + // Create string with word boundary at position 55 + String input = label + "a".repeat(55) + " " + "b".repeat(30); + // Should wrap at the space (position 55) + String expected = label + "a".repeat(55) + "\r\n " + "b".repeat(30); + String out = callMultilineWrap(input); + assertThat(out).isEqualTo(expected); + } +} \ No newline at end of file