From 3b7657ab2d30baf50ea6bd0435613861701cb20b Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 29 Jan 2026 13:17:45 -0500 Subject: [PATCH 1/9] copy post-0.1.6 croissant code from external repo This commit, specifically: https://github.com/gdcc/exporter-croissant/commit/a0c3b8071e4523694e5384a36085298b64d791a8 --- .../dataverse/export/CroissantExporter.java | 484 +++++++++++++++ .../export/CroissantExporterTest.java | 583 ++++++++++++++++++ .../cars/expected/cars-croissant.json | 302 +++++++++ .../croissant/cars/in/dataCiteXml.xml | 51 ++ .../croissant/cars/in/datasetFileDetails.json | 355 +++++++++++ .../croissant/cars/in/datasetJson.json | 228 +++++++ .../croissant/cars/in/datasetORE.json | 133 ++++ .../cars/in/datasetSchemaDotOrg.json | 78 +++ .../croissant/cars/out/croissant.json | 302 +++++++++ .../draft/expected/draft-croissant.json | 94 +++ .../croissant/draft/in/dataCiteXml.xml | 46 ++ .../draft/in/datasetFileDetails.json | 23 + .../croissant/draft/in/datasetJson.json | 156 +++++ .../croissant/draft/in/datasetORE.json | 87 +++ .../draft/in/datasetSchemaDotOrg.json | 60 ++ .../croissant/draft/out/croissant.json | 94 +++ .../junk/expected/junk-croissant.json | 83 +++ .../croissant/junk/in/dataCiteXml.xml | 33 + .../croissant/junk/in/datasetFileDetails.json | 1 + .../croissant/junk/in/datasetJson.json | 124 ++++ .../croissant/junk/in/datasetORE.json | 62 ++ .../junk/in/datasetSchemaDotOrg.json | 52 ++ .../croissant/junk/out/croissant.json | 83 +++ .../croissant/max/expected/max-croissant.json | 196 ++++++ .../croissant/max/in/dataCiteXml.xml | 77 +++ .../croissant/max/in/datasetFileDetails.json | 117 ++++ .../croissant/max/in/datasetJson.json | 376 +++++++++++ .../croissant/max/in/datasetORE.json | 163 +++++ .../croissant/max/in/datasetSchemaDotOrg.json | 119 ++++ .../croissant/max/out/croissant.json | 196 ++++++ .../minimal/expected/minimal-croissant.json | 79 +++ .../croissant/minimal/in/dataCiteXml.xml | 17 + .../minimal/in/datasetFileDetails.json | 1 
+ .../croissant/minimal/in/datasetJson.json | 100 +++ .../croissant/minimal/in/datasetORE.json | 62 ++ .../minimal/in/datasetSchemaDotOrg.json | 44 ++ .../croissant/minimal/out/croissant.json | 79 +++ .../expected/restricted-croissant.json | 115 ++++ .../croissant/restricted/in/dataCiteXml.xml | 51 ++ .../restricted/in/datasetFileDetails.json | 355 +++++++++++ .../croissant/restricted/in/datasetJson.json | 228 +++++++ .../croissant/restricted/in/datasetORE.json | 133 ++++ .../restricted/in/datasetSchemaDotOrg.json | 78 +++ .../croissant/restricted/out/croissant.json | 115 ++++ 44 files changed, 6215 insertions(+) create mode 100644 src/main/java/edu/harvard/iq/dataverse/export/CroissantExporter.java create mode 100644 src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java create mode 100644 src/test/resources/croissant/cars/expected/cars-croissant.json create mode 100644 src/test/resources/croissant/cars/in/dataCiteXml.xml create mode 100644 src/test/resources/croissant/cars/in/datasetFileDetails.json create mode 100644 src/test/resources/croissant/cars/in/datasetJson.json create mode 100644 src/test/resources/croissant/cars/in/datasetORE.json create mode 100644 src/test/resources/croissant/cars/in/datasetSchemaDotOrg.json create mode 100644 src/test/resources/croissant/cars/out/croissant.json create mode 100644 src/test/resources/croissant/draft/expected/draft-croissant.json create mode 100644 src/test/resources/croissant/draft/in/dataCiteXml.xml create mode 100644 src/test/resources/croissant/draft/in/datasetFileDetails.json create mode 100644 src/test/resources/croissant/draft/in/datasetJson.json create mode 100644 src/test/resources/croissant/draft/in/datasetORE.json create mode 100644 src/test/resources/croissant/draft/in/datasetSchemaDotOrg.json create mode 100644 src/test/resources/croissant/draft/out/croissant.json create mode 100644 src/test/resources/croissant/junk/expected/junk-croissant.json create mode 100644 
src/test/resources/croissant/junk/in/dataCiteXml.xml create mode 100644 src/test/resources/croissant/junk/in/datasetFileDetails.json create mode 100644 src/test/resources/croissant/junk/in/datasetJson.json create mode 100644 src/test/resources/croissant/junk/in/datasetORE.json create mode 100644 src/test/resources/croissant/junk/in/datasetSchemaDotOrg.json create mode 100644 src/test/resources/croissant/junk/out/croissant.json create mode 100644 src/test/resources/croissant/max/expected/max-croissant.json create mode 100644 src/test/resources/croissant/max/in/dataCiteXml.xml create mode 100644 src/test/resources/croissant/max/in/datasetFileDetails.json create mode 100644 src/test/resources/croissant/max/in/datasetJson.json create mode 100644 src/test/resources/croissant/max/in/datasetORE.json create mode 100644 src/test/resources/croissant/max/in/datasetSchemaDotOrg.json create mode 100644 src/test/resources/croissant/max/out/croissant.json create mode 100644 src/test/resources/croissant/minimal/expected/minimal-croissant.json create mode 100644 src/test/resources/croissant/minimal/in/dataCiteXml.xml create mode 100644 src/test/resources/croissant/minimal/in/datasetFileDetails.json create mode 100644 src/test/resources/croissant/minimal/in/datasetJson.json create mode 100644 src/test/resources/croissant/minimal/in/datasetORE.json create mode 100644 src/test/resources/croissant/minimal/in/datasetSchemaDotOrg.json create mode 100644 src/test/resources/croissant/minimal/out/croissant.json create mode 100644 src/test/resources/croissant/restricted/expected/restricted-croissant.json create mode 100644 src/test/resources/croissant/restricted/in/dataCiteXml.xml create mode 100644 src/test/resources/croissant/restricted/in/datasetFileDetails.json create mode 100644 src/test/resources/croissant/restricted/in/datasetJson.json create mode 100644 src/test/resources/croissant/restricted/in/datasetORE.json create mode 100644 
src/test/resources/croissant/restricted/in/datasetSchemaDotOrg.json create mode 100644 src/test/resources/croissant/restricted/out/croissant.json diff --git a/src/main/java/edu/harvard/iq/dataverse/export/CroissantExporter.java b/src/main/java/edu/harvard/iq/dataverse/export/CroissantExporter.java new file mode 100644 index 00000000000..667e399e30f --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/export/CroissantExporter.java @@ -0,0 +1,484 @@ +package edu.harvard.iq.dataverse.export; + +import com.google.auto.service.AutoService; +import io.gdcc.spi.export.ExportDataProvider; +import io.gdcc.spi.export.ExportException; +import io.gdcc.spi.export.Exporter; +import jakarta.json.Json; +import jakarta.json.JsonArray; +import jakarta.json.JsonArrayBuilder; +import jakarta.json.JsonNumber; +import jakarta.json.JsonObject; +import jakarta.json.JsonObjectBuilder; +import jakarta.json.JsonReader; +import jakarta.json.JsonValue; +import jakarta.ws.rs.core.MediaType; +import java.io.OutputStream; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import org.apache.commons.text.StringEscapeUtils; + +/** https://github.com/mlcommons/croissant */ +@AutoService(Exporter.class) +public class CroissantExporter implements Exporter { + + /* + * The name of the format it creates. If this format is already provided by a + * built-in exporter, this Exporter will override the built-in one. (Note that + * exports are cached, so existing metadata export files are not updated + * immediately.) + */ + @Override + public String getFormatName() { + return "croissant"; + } + + /** + * The display name shown in the UI + * + * @param locale + */ + @Override + public String getDisplayName(Locale locale) { + // The display name is not localized: the same string is returned for every + // locale. 
A production exporter would instead use the locale to generate an + // appropriate translation. + return "Croissant"; + } + + /** Whether the exported format should be available as an option for Harvesting */ + @Override + public Boolean isHarvestable() { + return false; + } + + /** Whether the exported format should be available for download in the UI and API */ + @Override + public Boolean isAvailableToUsers() { + return true; + } + + /** + * Defines the mime type of the exported format - used when metadata is downloaded, i.e. to + * trigger an appropriate viewer in the user's browser. + */ + @Override + public String getMediaType() { + return MediaType.APPLICATION_JSON; + } + + /** + * This method is called by Dataverse when metadata for a given dataset in this format is + * requested. + */ + @Override + public void exportDataset(ExportDataProvider dataProvider, OutputStream outputStream) + throws ExportException { + try { + // Start building the output format. + JsonObjectBuilder job = Json.createObjectBuilder(); + String contextString = + """ + { + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + 
"repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + } + } + """; + try (JsonReader jsonReader = Json.createReader(new StringReader(contextString))) { + JsonObject contextObject = jsonReader.readObject(); + job.add("@context", contextObject.getJsonObject("@context")); + } + + job.add("@type", "sc:Dataset"); + job.add("conformsTo", "http://mlcommons.org/croissant/1.0"); + + JsonObject datasetJson = dataProvider.getDatasetJson(); + + JsonObject datasetORE = dataProvider.getDatasetORE(); + JsonObject describes = datasetORE.getJsonObject("ore:describes"); + job.add("name", StringEscapeUtils.escapeHtml4(describes.getString("title"))); + job.add("url", describes.getJsonString("@id")); + JsonObject datasetSchemaDotOrg = dataProvider.getDatasetSchemaDotOrg(); + // We don't escape DatasetSchemaDotOrg fields like creator, description, etc. because + // they are already escaped. + job.add("creator", datasetSchemaDotOrg.getJsonArray("creator")); + job.add("description", datasetSchemaDotOrg.getJsonString("description")); + job.add("keywords", datasetSchemaDotOrg.getJsonArray("keywords")); + job.add("license", datasetSchemaDotOrg.getString("license")); + String datePublished = datasetSchemaDotOrg.getString("datePublished", null); + if (datePublished != null) { + job.add("datePublished", datasetSchemaDotOrg.getString("datePublished")); + } + job.add("dateModified", datasetSchemaDotOrg.getString("dateModified")); + job.add( + "includedInDataCatalog", + datasetSchemaDotOrg.getJsonObject("includedInDataCatalog")); + job.add("publisher", datasetSchemaDotOrg.getJsonObject("publisher")); + + /** + * For "version", we are knowingly sending "1.0" rather than "1.0.0", even though + * MAJOR.MINOR.PATCH is recommended by the Croissant spec. 
We are aware that the + * Croissant validator throws a warning for anything other than MAJOR.MINOR.PATCH. See + * the README for a detailed explanation and the following issues: + * https://github.com/mlcommons/croissant/issues/609 + * https://github.com/mlcommons/croissant/issues/643 + */ + job.add("version", describes.getString("schema:version")); + /** + * We have been told that it's fine and appropriate to put the citation to the dataset + * itself into "citeAs". However, the spec says "citeAs" is "A citation for a + * publication that describes the dataset" so we have asked for clarification here: + * https://github.com/mlcommons/croissant/issues/638 + */ + job.add("citeAs", getBibtex(datasetORE, datasetJson, datasetSchemaDotOrg)); + + JsonArray funder = datasetSchemaDotOrg.getJsonArray("funder"); + if (funder != null) { + job.add("funder", funder); + } + + JsonArray spatialCoverage = datasetSchemaDotOrg.getJsonArray("spatialCoverage"); + if (spatialCoverage != null) { + job.add("spatialCoverage", spatialCoverage); + } + + JsonArray oreFiles = describes.getJsonArray("ore:aggregates"); + + // Create a map so that later we can use the storageIdentifier to lookup + // the position of the file in the array of files in the datasetORE format. + // We don't use checksum because it's possible for a dataset to have the + // same checksum for multiple files. 
+ Map storageIdentifierToPositionInOre = new HashMap<>(); + for (int i = 0; i < oreFiles.size(); i++) { + JsonObject aggregate = oreFiles.getJsonObject(i); + String storageIdentifier = aggregate.getString("dvcore:storageIdentifier", null); + if (storageIdentifier != null) { + storageIdentifierToPositionInOre.put(storageIdentifier, i); + } + } + + JsonArrayBuilder distribution = Json.createArrayBuilder(); + JsonArrayBuilder recordSet = Json.createArrayBuilder(); + JsonArray datasetFileDetails = dataProvider.getDatasetFileDetails(); + for (JsonValue jsonValue : datasetFileDetails) { + + JsonObjectBuilder recordSetContent = Json.createObjectBuilder(); + recordSetContent.add("@type", "cr:RecordSet"); + JsonObject fileDetails = jsonValue.asJsonObject(); + /** + * When there is an originalFileName, it means that the file has gone through ingest + * and that multiple file formats are available: original, tab-separated, and + * RData. Currently we are only showing the original file but we could create + * additional cr:FileObject entries for tab-separated and RData as suggested in + * https://github.com/mlcommons/croissant/issues/641 . Should we? Is there interest + * in this? And would we duplicate all the cr:RecordSet entries (columns) with each + * additional format? Probably not as it would be the same. + */ + String filename = + StringEscapeUtils.escapeHtml4( + fileDetails.getString("originalFileName", null)); + if (filename == null) { + filename = StringEscapeUtils.escapeHtml4(fileDetails.getString("filename")); + } + String fileFormat = null; + // Use the original file format, if available, since that's where the + // contentUrl will point. 
+ String originalFileFormat = fileDetails.getString("originalFileFormat", null); + if (originalFileFormat != null) { + if ("text/tsv".equals(originalFileFormat)) { + // "text/tsv" is an internal format used by Dataverse while + // "text/tab-separated-values" is the official IANA format + // that we present to the outside world + // See https://github.com/IQSS/dataverse/issues/11505 and + // https://www.iana.org/assignments/media-types/media-types.xhtml + fileFormat = "text/tab-separated-values"; + } else { + fileFormat = originalFileFormat; + } + } + if (fileFormat == null) { + fileFormat = fileDetails.getString("contentType"); + } + JsonNumber fileSize = fileDetails.getJsonNumber("originalFileSize"); + if (fileSize == null) { + fileSize = fileDetails.getJsonNumber("filesize"); + } + + /** + * We make contentSize a String ( https://schema.org/Text ) rather than a number + * (JsonNumber) to pass the Croissant validator and comply with the spec. We don't + * include a unit because the spec says "Defaults to bytes if a unit is not + * specified." + */ + String fileSizeInBytes = fileSize.toString(); + JsonObject checksum = fileDetails.getJsonObject("checksum"); + // Out of the box the checksum type will be md5 + String checksumType = checksum.getString("type").toLowerCase(); + String checksumValue = checksum.getString("value"); + String storageIdentifier = fileDetails.getString("storageIdentifier"); + int positionInOre = storageIdentifierToPositionInOre.get(storageIdentifier); + String contentUrl = + oreFiles.getJsonObject(positionInOre).getString("schema:sameAs"); + String description = + StringEscapeUtils.escapeHtml4(fileDetails.getString("description", "")); + /** + * See https://github.com/mlcommons/croissant/issues/639 for discussion with the + * Croissant spec leads on what to put in + * + * @id (path/to/file.txt). + *

It's suboptimal that the directoryLabel isn't already included in + * dataProvider.getDatasetFileDetails(). If it gets added as part of the + * following issue, we can get it from there: + * https://github.com/IQSS/dataverse/issues/10523 + */ + String fileId = filename; + // We don't escape directory label because many characters aren't allowed anyway + String directoryLabel = + oreFiles.getJsonObject(positionInOre) + .getString("dvcore:directoryLabel", null); + if (directoryLabel != null) { + fileId = directoryLabel + "/" + filename; + } + + distribution.add( + Json.createObjectBuilder() + .add("@type", "cr:FileObject") + .add("@id", fileId) + .add("name", filename) + .add("encodingFormat", fileFormat) + .add(checksumType, checksumValue) + .add("contentSize", fileSizeInBytes) + .add("description", description) + .add("contentUrl", contentUrl)); + boolean fileRestricted = fileDetails.getBoolean("restricted"); + if (fileRestricted) { + // Don't add the recordSet items for restricted files. + // Go on to the next file. + continue; + } + int fileIndex = 0; + JsonArray dataTables = fileDetails.getJsonArray("dataTables"); + if (dataTables == null) { + dataTables = JsonArray.EMPTY_JSON_ARRAY; + } + for (JsonValue dataTableValue : dataTables) { + JsonObject dataTableObject = dataTableValue.asJsonObject(); + // Unused + int varQuantity = dataTableObject.getInt("varQuantity"); + // Unused + int caseQuantity = dataTableObject.getInt("caseQuantity"); + JsonArray dataVariables = dataTableObject.getJsonArray("dataVariables"); + JsonArrayBuilder fieldSetArray = Json.createArrayBuilder(); + for (JsonValue dataVariableValue : dataVariables) { + JsonObjectBuilder fieldSetObject = Json.createObjectBuilder(); + fieldSetObject.add("@type", "cr:RecordSet"); + JsonObject dataVariableObject = dataVariableValue.asJsonObject(); + // TODO: should this be an integer? 
+ Integer variableId = dataVariableObject.getInt("id"); + String variableName = + StringEscapeUtils.escapeHtml4(dataVariableObject.getString("name")); + String variableDescription = + StringEscapeUtils.escapeHtml4( + dataVariableObject.getString("label", "")); + String variableFormatType = + dataVariableObject.getString("variableFormatType"); + String variableIntervalType = + dataVariableObject.getString("variableIntervalType"); + String dataType = null; + /** + * There are only two variableFormatType types on the Dataverse side: + * CHARACTER and NUMERIC. (See VariableType in DataVariable.java.) + */ + switch (variableFormatType) { + case "CHARACTER": + dataType = "sc:Text"; + break; + case "NUMERIC": + dataType = getNumericType(variableIntervalType); + break; + default: + break; + } + fieldSetArray.add( + Json.createObjectBuilder() + .add("@type", "cr:Field") + .add("name", variableName) + .add("description", variableDescription) + .add("dataType", dataType) + .add( + "source", + Json.createObjectBuilder() + .add("@id", variableId.toString()) + .add( + "fileObject", + Json.createObjectBuilder() + .add("@id", fileId)) + .add( + "extract", + Json.createObjectBuilder() + .add( + "column", + variableName)))); + } + recordSetContent.add("field", fieldSetArray); + recordSet.add(recordSetContent); + fileIndex++; + } + } + + JsonArray citation = datasetSchemaDotOrg.getJsonArray("citation"); + if (citation != null) { + job.add("citation", citation); + } + JsonArray temporalCoverage = datasetSchemaDotOrg.getJsonArray("temporalCoverage"); + if (temporalCoverage != null) { + job.add("temporalCoverage", temporalCoverage); + } + JsonArray distributionArray = distribution.build(); + if (!distributionArray.isEmpty()) { + job.add("distribution", distributionArray); + } + JsonArray recordSetArray = recordSet.build(); + if (!recordSetArray.isEmpty()) { + job.add("recordSet", recordSetArray); + } + + // TODO: Do we need DataCite XML? 
+ String dataCiteXml = dataProvider.getDataCiteXml(); + + // Write the output format to the output stream. + outputStream.write(job.build().toString().getBytes("UTF8")); + // Flush the output stream - The output stream is automatically closed by + // Dataverse and should not be closed in the Exporter. + outputStream.flush(); + } catch (Exception ex) { + System.out.println("Exception caught in Croissant exporter. Printing stacktrace..."); + ex.printStackTrace(); + // If anything goes wrong, an Exporter should throw an ExportException. + throw new ExportException("Unknown exception caught during export: " + ex); + } + } + + /* + Here's how a BibTeX export looks in Dataverse: + @data{DVN/TJCLKP_2017, + author = {Durbin, Philip}, + publisher = {Harvard Dataverse}, + title = {{Open Source at Harvard}}, + UNF = {UNF:6:e9+1ZqpZtjCuBzTDSrsHgA==}, + year = {2017}, + version = {DRAFT VERSION}, + doi = {10.7910/DVN/TJCLKP}, + url = {https://doi.org/10.7910/DVN/TJCLKP} + } + */ + /** + * The code is inspired by DataCitation.java upstream. However, Croissant does not want + * newlines, so we omit them. Some notes about this example: + * + *

- DVN/TJCLKP_2017 seems strange as an identifier. This is probably a bug upstream. + * + *

- "DRAFT VERSION" is an artifact from a bug that was probably fixed in + * https://github.com/IQSS/dataverse/pull/9705 + */ + private String getBibtex( + JsonObject datasetORE, JsonObject datasetJson, JsonObject datasetSchemaDotOrg) { + String identifier = datasetJson.getString("identifier"); + + JsonObject oreDescribes = datasetORE.getJsonObject("ore:describes"); + String publicationYear = null; + String publicationDate = oreDescribes.getString("schema:datePublished", null); + if (publicationDate != null) { + publicationYear = publicationDate.substring(0, 4); + } + + JsonArray creatorArray = datasetSchemaDotOrg.getJsonArray("creator"); + List creators = new ArrayList<>(); + for (JsonValue creator : creatorArray) { + creators.add(creator.asJsonObject().getString("name")); + } + String creatorsFormatted = String.join(" and ", creators); + + String publisher = datasetSchemaDotOrg.getJsonObject("publisher").getString("name"); + String title = datasetSchemaDotOrg.getString("name"); + + String pidAsUrl = oreDescribes.getString("@id"); + + StringBuilder sb = new StringBuilder(); + if (publicationYear != null) { + sb.append("@data{").append(identifier).append("_").append(publicationYear).append(","); + } else { + sb.append("@data{").append(identifier).append(","); + } + sb.append("author = {").append(creatorsFormatted).append("},"); + sb.append("publisher = {").append(publisher).append("},"); + sb.append("title = {").append(title).append("},"); + if (publicationYear != null) { + sb.append("year = {").append(publicationYear).append("},"); + } + sb.append("url = {").append(pidAsUrl).append("}"); + sb.append("}"); + return sb.toString(); + } + + private String getNumericType(String variableIntervalType) { + /** + * According to DataVariable.java in Dataverse, the four possibilities are: discrete, contin + * (continuous), nominal, and dichotomous. 
+ */ + return switch (variableIntervalType) { + case "discrete" -> "sc:Integer"; + case "contin" -> "sc:Float"; + default -> "sc:Text"; + }; + } +} diff --git a/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java b/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java new file mode 100644 index 00000000000..59e7b2cc329 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java @@ -0,0 +1,583 @@ +package edu.harvard.iq.dataverse.export; + +import static org.junit.jupiter.api.Assertions.*; + +import io.gdcc.spi.export.ExportDataProvider; +import jakarta.json.Json; +import jakarta.json.JsonArray; +import jakarta.json.JsonObject; +import jakarta.json.JsonReader; +import jakarta.json.JsonWriter; +import jakarta.json.JsonWriterFactory; +import jakarta.json.stream.JsonGenerator; +import java.io.ByteArrayOutputStream; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.OutputStream; +import java.io.StringReader; +import java.io.StringWriter; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.Map; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.skyscreamer.jsonassert.JSONAssert; + +public class CroissantExporterTest { + + static CroissantExporter exporter; + static OutputStream outputStreamMinimal; + static ExportDataProvider dataProviderMinimal; + static OutputStream outputStreamMax; + static ExportDataProvider dataProviderMax; + static OutputStream outputStreamCars; + static ExportDataProvider dataProviderCars; + static OutputStream outputStreamRestricted; + static ExportDataProvider dataProviderRestricted; + static OutputStream outputStreamJunk; + static ExportDataProvider dataProviderJunk; + static OutputStream outputStreamDraft; + static ExportDataProvider dataProviderDraft; + 
+ @BeforeAll + public static void setUp() { + exporter = new CroissantExporter(); + + outputStreamMinimal = new ByteArrayOutputStream(); + dataProviderMinimal = + new ExportDataProvider() { + @Override + public JsonObject getDatasetJson() { + String pathToJsonFile = "src/test/resources/croissant/minimal/in/datasetJson.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetORE() { + String pathToJsonFile = "src/test/resources/croissant/minimal/in/datasetORE.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonArray getDatasetFileDetails() { + String pathToJsonFile = + "src/test/resources/croissant/minimal/in/datasetFileDetails.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readArray(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetSchemaDotOrg() { + String pathToJsonFile = + "src/test/resources/croissant/minimal/in/datasetSchemaDotOrg.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public String getDataCiteXml() { + try { + return Files.readString( + Paths.get("src/test/resources/croissant/minimal/in/dataCiteXml.xml"), + StandardCharsets.UTF_8); + } catch (IOException ex) { + return null; + } + } + }; + + outputStreamMax = new ByteArrayOutputStream(); + dataProviderMax = + new ExportDataProvider() { + @Override + public JsonObject getDatasetJson() { + String pathToJsonFile = "src/test/resources/croissant/max/in/datasetJson.json"; + try (JsonReader 
jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetORE() { + String pathToJsonFile = "src/test/resources/croissant/max/in/datasetORE.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonArray getDatasetFileDetails() { + String pathToJsonFile = "src/test/resources/croissant/max/in/datasetFileDetails.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readArray(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetSchemaDotOrg() { + String pathToJsonFile = + "src/test/resources/croissant/max/in/datasetSchemaDotOrg.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public String getDataCiteXml() { + try { + return Files.readString( + Paths.get("src/test/resources/croissant/max/in/dataCiteXml.xml"), + StandardCharsets.UTF_8); + } catch (IOException ex) { + return null; + } + } + }; + + outputStreamCars = new ByteArrayOutputStream(); + dataProviderCars = + new ExportDataProvider() { + @Override + public JsonObject getDatasetJson() { + String pathToJsonFile = "src/test/resources/croissant/cars/in/datasetJson.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetORE() { + String pathToJsonFile = "src/test/resources/croissant/cars/in/datasetORE.json"; + try (JsonReader jsonReader = + Json.createReader(new 
FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonArray getDatasetFileDetails() { + String pathToJsonFile = + "src/test/resources/croissant/cars/in/datasetFileDetails.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readArray(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetSchemaDotOrg() { + String pathToJsonFile = + "src/test/resources/croissant/cars/in/datasetSchemaDotOrg.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public String getDataCiteXml() { + try { + return Files.readString( + Paths.get("src/test/resources/croissant/cars/in/dataCiteXml.xml"), + StandardCharsets.UTF_8); + } catch (IOException ex) { + return null; + } + } + }; + + outputStreamRestricted = new ByteArrayOutputStream(); + dataProviderRestricted = + new ExportDataProvider() { + @Override + public JsonObject getDatasetJson() { + String pathToJsonFile = "src/test/resources/croissant/restricted/in/datasetJson.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetORE() { + String pathToJsonFile = "src/test/resources/croissant/restricted/in/datasetORE.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonArray getDatasetFileDetails() { + String pathToJsonFile = + "src/test/resources/croissant/restricted/in/datasetFileDetails.json"; + try (JsonReader jsonReader = + 
Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readArray(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetSchemaDotOrg() { + String pathToJsonFile = + "src/test/resources/croissant/restricted/in/datasetSchemaDotOrg.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public String getDataCiteXml() { + try { + return Files.readString( + Paths.get("src/test/resources/croissant/restricted/in/dataCiteXml.xml"), + StandardCharsets.UTF_8); + } catch (IOException ex) { + return null; + } + } + }; + + outputStreamJunk = new ByteArrayOutputStream(); + dataProviderJunk = + new ExportDataProvider() { + @Override + public JsonObject getDatasetJson() { + String pathToJsonFile = "src/test/resources/croissant/junk/in/datasetJson.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetORE() { + String pathToJsonFile = "src/test/resources/croissant/junk/in/datasetORE.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonArray getDatasetFileDetails() { + String pathToJsonFile = + "src/test/resources/croissant/junk/in/datasetFileDetails.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readArray(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetSchemaDotOrg() { + String pathToJsonFile = + "src/test/resources/croissant/junk/in/datasetSchemaDotOrg.json"; + try (JsonReader jsonReader = + 
Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public String getDataCiteXml() { + try { + return Files.readString( + Paths.get("src/test/resources/croissant/junk/in/dataCiteXml.xml"), + StandardCharsets.UTF_8); + } catch (IOException ex) { + return null; + } + } + }; + + outputStreamDraft = new ByteArrayOutputStream(); + dataProviderDraft = + new ExportDataProvider() { + @Override + public JsonObject getDatasetJson() { + String pathToJsonFile = "src/test/resources/croissant/draft/in/datasetJson.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetORE() { + String pathToJsonFile = "src/test/resources/croissant/draft/in/datasetORE.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonArray getDatasetFileDetails() { + String pathToJsonFile = + "src/test/resources/croissant/draft/in/datasetFileDetails.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readArray(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public JsonObject getDatasetSchemaDotOrg() { + String pathToJsonFile = + "src/test/resources/croissant/draft/in/datasetSchemaDotOrg.json"; + try (JsonReader jsonReader = + Json.createReader(new FileReader(pathToJsonFile))) { + return jsonReader.readObject(); + } catch (FileNotFoundException ex) { + return null; + } + } + + @Override + public String getDataCiteXml() { + try { + return Files.readString( + Paths.get("src/test/resources/croissant/draft/in/dataCiteXml.xml"), + StandardCharsets.UTF_8); + } catch (IOException 
ex) { + return null; + } + } + }; + } + + @Test + public void testGetFormatName() { + CroissantExporter instance = new CroissantExporter(); + String expResult = ""; + String result = instance.getFormatName(); + assertEquals("croissant", result); + } + + @Test + public void testGetDisplayName() { + assertEquals("Croissant", exporter.getDisplayName(null)); + } + + @Test + public void testIsHarvestable() { + assertEquals(false, exporter.isHarvestable()); + } + + @Test + public void testIsAvailableToUsers() { + assertEquals(true, exporter.isAvailableToUsers()); + } + + @Test + public void testGetMediaType() { + assertEquals("application/json", exporter.getMediaType()); + } + + @Test + public void testExportDatasetMinimal() throws Exception { + exporter.exportDataset(dataProviderMinimal, outputStreamMinimal); + String actual = outputStreamMinimal.toString(); + writeCroissantFile(actual, "minimal"); + String expected = + Files.readString( + Paths.get("src/test/resources/croissant/minimal/expected/minimal-croissant.json"), + StandardCharsets.UTF_8); + JSONAssert.assertEquals(expected, actual, true); + assertEquals(prettyPrint(expected), prettyPrint(outputStreamMinimal.toString())); + } + + @Test + public void testExportDatasetMax() throws Exception { + exporter.exportDataset(dataProviderMax, outputStreamMax); + String actual = outputStreamMax.toString(); + writeCroissantFile(actual, "max"); + /* + First, install pyDataverse from Dans-labs, the "croissant" branch: + pip3 install --upgrade --no-cache-dir git+https://github.com/Dans-labs/pyDataverse@croissant#egg=pyDataverse + You can use this script to export Croissant from a dataset: + --- + from pyDataverse.Croissant import Croissant + #from pyDataverse.Croissant import Croissant + import json + #host = "https://dataverse.nl" + #PID = "doi:10.34894/KMRAYH" + host = "https://beta.dataverse.org" + PID = "doi:10.5072/FK2/VQTYHD" + croissant = Croissant(host, PID) + print(json.dumps(croissant.get_record(), indent=4, 
default=str)) + --- + Finally, uncomment the lines below to check for differences. + */ + // String pyDataverse = Files.readString(Paths.get("/tmp/pyDataverse.json"), + // StandardCharsets.UTF_8); + // JSONAssert.assertEquals(actual, pyDataverse, true); + String expected = + Files.readString( + Paths.get("src/test/resources/croissant/max/expected/max-croissant.json"), + StandardCharsets.UTF_8); + JSONAssert.assertEquals(expected, actual, true); + assertEquals(prettyPrint(expected), prettyPrint(outputStreamMax.toString())); + } + + /* + The data in stata13-auto.dta looks something like this: + make price mpg rep78 headroom trunk weight length turn displacement gear_ratio foreign + "AMC Concord" 4099 22 3 2.5 11 2930 186 40 121 3.58 0 + "AMC Pacer" 4749 17 3 3.0 11 3350 173 40 258 2.53 0 + "AMC Spirit" 3799 22 3.0 12 2640 168 35 121 3.08 0 + */ + @Test + public void testExportDatasetCars() throws Exception { + exporter.exportDataset(dataProviderCars, outputStreamCars); + String actual = outputStreamCars.toString(); + writeCroissantFile(actual, "cars"); + String expected = + Files.readString( + Paths.get("src/test/resources/croissant/cars/expected/cars-croissant.json"), + StandardCharsets.UTF_8); + JSONAssert.assertEquals(expected, actual, true); + assertEquals(prettyPrint(expected), prettyPrint(outputStreamCars.toString())); + } + + /** Same as the cars data but the stata13-auto.dta file is restricted. 
*/ + @Test + public void testExportDatasetRestricted() throws Exception { + exporter.exportDataset(dataProviderRestricted, outputStreamRestricted); + String actual = outputStreamRestricted.toString(); + writeCroissantFile(actual, "restricted"); + String expected = + Files.readString( + Paths.get( + "src/test/resources/croissant/restricted/expected/restricted-croissant.json"), + StandardCharsets.UTF_8); + JSONAssert.assertEquals(expected, actual, true); + assertEquals(prettyPrint(expected), prettyPrint(outputStreamRestricted.toString())); + } + + @Test + public void testExportDatasetJunk() throws Exception { + exporter.exportDataset(dataProviderJunk, outputStreamJunk); + String actual = outputStreamJunk.toString(); + writeCroissantFile(actual, "junk"); + String expected = + Files.readString( + Paths.get("src/test/resources/croissant/junk/expected/junk-croissant.json"), + StandardCharsets.UTF_8); + JSONAssert.assertEquals(expected, actual, true); + assertEquals(prettyPrint(expected), prettyPrint(outputStreamJunk.toString())); + } + + @Test + public void testExportDatasetDraft() throws Exception { + exporter.exportDataset(dataProviderDraft, outputStreamDraft); + String actual = outputStreamDraft.toString(); + writeCroissantFile(actual, "draft"); + String expected = + Files.readString( + Paths.get("src/test/resources/croissant/draft/expected/draft-croissant.json"), + StandardCharsets.UTF_8); + JSONAssert.assertEquals(expected, actual, true); + assertEquals(prettyPrint(expected), prettyPrint(outputStreamDraft.toString())); + } + + private void writeCroissantFile(String actual, String name) throws IOException { + Path dir = Files.createDirectories(Paths.get("src/test/resources/croissant/" + name + "/out")); + Path out = Paths.get(dir + "/croissant.json"); + Files.writeString(out, prettyPrint(actual), StandardCharsets.UTF_8); + } + + public static String prettyPrint(String jsonObject) { + try { + return prettyPrint(getJsonObject(jsonObject)); + } catch (Exception ex) { + 
return jsonObject; + } + } + + public static String prettyPrint(JsonObject jsonObject) { + Map config = new HashMap<>(); + config.put(JsonGenerator.PRETTY_PRINTING, true); + JsonWriterFactory jsonWriterFactory = Json.createWriterFactory(config); + StringWriter stringWriter = new StringWriter(); + try (JsonWriter jsonWriter = jsonWriterFactory.createWriter(stringWriter)) { + jsonWriter.writeObject(jsonObject); + } + return stringWriter.toString(); + } + + public static JsonObject getJsonObject(String serializedJson) { + try (StringReader rdr = new StringReader(serializedJson)) { + try (JsonReader jsonReader = Json.createReader(rdr)) { + return jsonReader.readObject(); + } + } + } +} diff --git a/src/test/resources/croissant/cars/expected/cars-croissant.json b/src/test/resources/croissant/cars/expected/cars-croissant.json new file mode 100644 index 00000000000..a9c0d48b217 --- /dev/null +++ b/src/test/resources/croissant/cars/expected/cars-croissant.json @@ -0,0 +1,302 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": 
"cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Cars", + "url": "https://doi.org/10.5072/FK2/CY7BWA", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/CY7BWA_2025,author = {Durbin, Philip},publisher = {Root},title = {Cars},year = {2025},url = {https://doi.org/10.5072/FK2/CY7BWA}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "code/compute.py", + "name": "compute.py", + "encodingFormat": "text/x-python", + "md5": "d84985e94dde671f318076bd7a137f15", + "contentSize": "15", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "cr:FileObject", + "@id": "data/stata13-auto.dta", + "name": "stata13-auto.dta", + "encodingFormat": "application/x-stata-13", + "md5": "7b1201ce6b469796837a835377338c5a", + "contentSize": "6443", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/9?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "contentSize": "28", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + 
} + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "field": [ + { + "@type": "cr:Field", + "name": "make", + "description": "Make and Model", + "dataType": "sc:Text", + "source": { + "@id": "2", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "make" + } + } + }, + { + "@type": "cr:Field", + "name": "price", + "description": "Price", + "dataType": "sc:Integer", + "source": { + "@id": "5", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "price" + } + } + }, + { + "@type": "cr:Field", + "name": "mpg", + "description": "Mileage (mpg)", + "dataType": "sc:Integer", + "source": { + "@id": "3", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "mpg" + } + } + }, + { + "@type": "cr:Field", + "name": "rep78", + "description": "Repair Record 1978", + "dataType": "sc:Integer", + "source": { + "@id": "12", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "rep78" + } + } + }, + { + "@type": "cr:Field", + "name": "headroom", + "description": "Headroom (in.)", + "dataType": "sc:Float", + "source": { + "@id": "1", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "headroom" + } + } + }, + { + "@type": "cr:Field", + "name": "trunk", + "description": "Trunk space (cu. 
ft.)", + "dataType": "sc:Integer", + "source": { + "@id": "7", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "trunk" + } + } + }, + { + "@type": "cr:Field", + "name": "weight", + "description": "Weight (lbs.)", + "dataType": "sc:Integer", + "source": { + "@id": "4", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "weight" + } + } + }, + { + "@type": "cr:Field", + "name": "length", + "description": "Length (in.)", + "dataType": "sc:Integer", + "source": { + "@id": "8", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "length" + } + } + }, + { + "@type": "cr:Field", + "name": "turn", + "description": "Turn Circle (ft.) ", + "dataType": "sc:Integer", + "source": { + "@id": "9", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "turn" + } + } + }, + { + "@type": "cr:Field", + "name": "displacement", + "description": "Displacement (cu. in.)", + "dataType": "sc:Integer", + "source": { + "@id": "10", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "displacement" + } + } + }, + { + "@type": "cr:Field", + "name": "gear_ratio", + "description": "Gear Ratio", + "dataType": "sc:Float", + "source": { + "@id": "6", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "gear_ratio" + } + } + }, + { + "@type": "cr:Field", + "name": "foreign", + "description": "Car type", + "dataType": "sc:Integer", + "source": { + "@id": "11", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "foreign" + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/cars/in/dataCiteXml.xml b/src/test/resources/croissant/cars/in/dataCiteXml.xml new file mode 100644 index 00000000000..7c6c89385fd --- /dev/null +++ b/src/test/resources/croissant/cars/in/dataCiteXml.xml @@ -0,0 +1,51 @@ + + + 10.5072/FK2/CY7BWA + + + Durbin, Philip + 
Philip + Durbin + Harvard + + + + Cars + + Root + 2025 + + Other + + + + Durbin, Philip + Philip + Durbin + Harvard + + + + 2024-03-13 + 2025-05-16 + + + + 15 + 28 + 4026 + + + text/x-python + text/markdown + text/tab-separated-values + + 1.0 + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + This dataset is about cars. + + diff --git a/src/test/resources/croissant/cars/in/datasetFileDetails.json b/src/test/resources/croissant/cars/in/datasetFileDetails.json new file mode 100644 index 00000000000..2ce12a4abe9 --- /dev/null +++ b/src/test/resources/croissant/cars/in/datasetFileDetails.json @@ -0,0 +1,355 @@ +[ + { + "id": 7, + "persistentId": "", + "filename": "compute.py", + "contentType": "text/x-python", + "friendlyType": "Python Source Code", + "filesize": 15, + "description": "", + "storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "rootDataFileId": -1, + "md5": "d84985e94dde671f318076bd7a137f15", + "checksum": { + "type": "MD5", + "value": "d84985e94dde671f318076bd7a137f15" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 1, + "varGroups": [] + }, + { + "id": 9, + "persistentId": "", + "filename": "stata13-auto.tab", + "contentType": "text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 4026, + "description": "", + "storageIdentifier": "local://196d9f15719-2270bfca2b48", + "originalFileFormat": "application/x-stata-13", + "originalFormatLabel": "Stata 13 Binary", + "originalFileSize": 6443, + "originalFileName": "stata13-auto.dta", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "rootDataFileId": -1, + "md5": "7b1201ce6b469796837a835377338c5a", + "checksum": { + "type": "MD5", + "value": "7b1201ce6b469796837a835377338c5a" + }, + "tabularData": true, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 3, + 
"dataTables": [ + { + "varQuantity": 12, + "caseQuantity": 74, + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "dataVariables": [ + { + "id": 2, + "name": "make", + "label": "Make and Model", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "CHARACTER", + "isOrderedCategorical": false, + "fileOrder": 0, + "UNF": "UNF:6:Oo4vwiL8ffhSECOcjsKk2g==", + "variableMetadata": [] + }, + { + "id": 5, + "name": "price", + "label": "Price", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 1, + "UNF": "UNF:6:rvfkkdA36AaCSqCQciybfA==", + "variableMetadata": [], + "summaryStatistics": { + "min": "3291.0", + "medn": "5006.5", + "mean": "6165.256756756757", + "max": "15906.0", + "vald": "74.0", + "mode": ".", + "stdev": "2949.4958847689186", + "invd": "0.0" + } + }, + { + "id": 3, + "name": "mpg", + "label": "Mileage (mpg)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 2, + "UNF": "UNF:6:vVr3w8CgeZq1KpDfJQudOg==", + "variableMetadata": [], + "summaryStatistics": { + "max": "41.0", + "vald": "74.0", + "medn": "20.0", + "min": "12.0", + "stdev": "5.785503209735141", + "mean": "21.2972972972973", + "invd": "0.0", + "mode": "." 
+ } + }, + { + "id": 12, + "name": "rep78", + "label": "Repair Record 1978", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 3, + "UNF": "UNF:6:gbFI98swTWNhAjCRyi2cdA==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "0.989932270109041", + "mode": ".", + "min": "1.0", + "max": "5.0", + "medn": "3.0", + "mean": "3.4057971014492754", + "vald": "69.0", + "invd": "5.0" + } + }, + { + "id": 1, + "name": "headroom", + "label": "Headroom (in.)", + "weighted": false, + "variableIntervalType": "contin", + "variableFormatType": "NUMERIC", + "format": "float", + "isOrderedCategorical": false, + "fileOrder": 4, + "UNF": "UNF:6:g4Pl3T0Oz2e/OKJ64WiTnA==", + "variableMetadata": [], + "summaryStatistics": { + "mean": "2.993243243243243", + "mode": ".", + "vald": "74.0", + "invd": "0.0", + "stdev": "0.845994766828771", + "min": "1.5", + "medn": "3.0", + "max": "5.0" + } + }, + { + "id": 7, + "name": "trunk", + "label": "Trunk space (cu. 
ft.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 5, + "UNF": "UNF:6:iab0POsE3By7dQfgX/TY4g==", + "variableMetadata": [], + "summaryStatistics": { + "vald": "74.0", + "mode": ".", + "mean": "13.756756756756756", + "max": "23.0", + "min": "5.0", + "medn": "14.0", + "invd": "0.0", + "stdev": "4.277404189173201" + } + }, + { + "id": 4, + "name": "weight", + "label": "Weight (lbs.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 6, + "UNF": "UNF:6:cdoTdfUNeYWHHFEBCDxg+w==", + "variableMetadata": [], + "summaryStatistics": { + "invd": "0.0", + "min": "1760.0", + "vald": "74.0", + "max": "4840.0", + "stdev": "777.1935671373664", + "mean": "3019.459459459459", + "mode": ".", + "medn": "3190.0" + } + }, + { + "id": 8, + "name": "length", + "label": "Length (in.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 7, + "UNF": "UNF:6:8z1rjwhqBN4meYIiKI4P1A==", + "variableMetadata": [], + "summaryStatistics": { + "mode": ".", + "vald": "74.0", + "invd": "0.0", + "stdev": "22.266339902021585", + "max": "233.0", + "medn": "192.5", + "mean": "187.93243243243245", + "min": "142.0" + } + }, + { + "id": 9, + "name": "turn", + "label": "Turn Circle (ft.) ", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 8, + "UNF": "UNF:6:QxhjrrNtVz4qA8RulQ2MuQ==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "4.399353727233908", + "vald": "74.0", + "max": "51.0", + "min": "31.0", + "medn": "40.0", + "mean": "39.648648648648646", + "invd": "0.0", + "mode": "." + } + }, + { + "id": 10, + "name": "displacement", + "label": "Displacement (cu. 
in.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 9, + "UNF": "UNF:6:ftk+RAQpTCT1/y6G/rLWfA==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "91.83721896440396", + "invd": "0.0", + "min": "79.0", + "medn": "196.0", + "mode": ".", + "vald": "74.0", + "mean": "197.2972972972973", + "max": "425.0" + } + }, + { + "id": 6, + "name": "gear_ratio", + "label": "Gear Ratio", + "weighted": false, + "variableIntervalType": "contin", + "variableFormatType": "NUMERIC", + "format": "float", + "isOrderedCategorical": false, + "fileOrder": 10, + "UNF": "UNF:6:qjnY/qbx26FTepoPqRZ6lw==", + "variableMetadata": [], + "summaryStatistics": { + "medn": "2.9550000429153442", + "stdev": "0.45628709670763035", + "mean": "3.0148648667979883", + "min": "2.190000057220459", + "max": "3.890000104904175", + "mode": ".", + "vald": "74.0", + "invd": "0.0" + } + }, + { + "id": 11, + "name": "foreign", + "label": "Car type", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 11, + "UNF": "UNF:6:nbjzgh3wfAFqKpaoFnHalA==", + "variableMetadata": [], + "summaryStatistics": { + "max": "1.0", + "invd": "0.0", + "mode": ".", + "medn": "0.0", + "stdev": "0.46018845840901884", + "min": "0.0", + "mean": "0.2972972972972975", + "vald": "74.0" + }, + "variableCategories": [ + { + "label": "Domestic", + "value": "0", + "isMissing": false, + "frequency": 52.0 + }, + { + "label": "Foreign", + "value": "1", + "isMissing": false, + "frequency": 22.0 + } + ] + } + ] + } + ], + "varGroups": [] + }, + { + "id": 8, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 28, + "description": "", + "storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + "rootDataFileId": -1, + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "checksum": { + 
"type": "MD5", + "value": "a2e484d07ee5590cc32182dc2c6ccc83" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 2, + "varGroups": [] + } +] diff --git a/src/test/resources/croissant/cars/in/datasetJson.json b/src/test/resources/croissant/cars/in/datasetJson.json new file mode 100644 index 00000000000..96aa26c9228 --- /dev/null +++ b/src/test/resources/croissant/cars/in/datasetJson.json @@ -0,0 +1,228 @@ +{ + "id": 6, + "identifier": "FK2/CY7BWA", + "persistentUrl": "https://doi.org/10.5072/FK2/CY7BWA", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2025-05-16", + "storageIdentifier": "local://10.5072/FK2/CY7BWA", + "datasetType": "dataset", + "datasetVersion": { + "id": 3, + "datasetId": 6, + "datasetPersistentId": "doi:10.5072/FK2/CY7BWA", + "storageIdentifier": "local://10.5072/FK2/CY7BWA", + "versionNumber": 1, + "internalVersionNumber": 10, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "lastUpdateTime": "2025-05-16T16:33:18Z", + "releaseTime": "2025-05-16T16:33:18Z", + "createTime": "2025-05-16T16:33:13Z", + "publicationDate": "2025-05-16", + "citationDate": "2025-05-16", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Cars" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + 
"value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": "datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "datasetContactAffiliation": { + "typeName": "datasetContactAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "This dataset is about cars." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2024-03-13" + } + ] + } + }, + "files": [ + { + "description": "", + "label": "compute.py", + "restricted": false, + "directoryLabel": "code", + "version": 2, + "datasetVersionId": 3, + "dataFile": { + "id": 7, + "persistentId": "", + "filename": "compute.py", + "contentType": "text/x-python", + "friendlyType": "Python Source Code", + "filesize": 15, + "description": "", + "storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "rootDataFileId": -1, + "md5": "d84985e94dde671f318076bd7a137f15", + "checksum": { + "type": "MD5", + "value": "d84985e94dde671f318076bd7a137f15" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + }, + { + "description": "", + "label": "README.md", + "restricted": false, + "directoryLabel": "doc", + "version": 2, + "datasetVersionId": 3, + "dataFile": { + "id": 8, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 28, + "description": "", + "storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + "rootDataFileId": -1, + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "checksum": { + "type": "MD5", + "value": "a2e484d07ee5590cc32182dc2c6ccc83" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + }, + { + "description": "", + "label": "stata13-auto.tab", + "restricted": false, + "directoryLabel": "data", + "version": 4, + "datasetVersionId": 3, + "dataFile": { + "id": 9, + "persistentId": "", + "filename": "stata13-auto.tab", + "contentType": 
"text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 4026, + "description": "", + "storageIdentifier": "local://196d9f15719-2270bfca2b48", + "originalFileFormat": "application/x-stata-13", + "originalFormatLabel": "Stata 13 Binary", + "originalFileSize": 6443, + "originalFileName": "stata13-auto.dta", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "rootDataFileId": -1, + "md5": "7b1201ce6b469796837a835377338c5a", + "checksum": { + "type": "MD5", + "value": "7b1201ce6b469796837a835377338c5a" + }, + "tabularData": true, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + } + ], + "citation": "Durbin, Philip, 2025, \"Cars\", https://doi.org/10.5072/FK2/CY7BWA, Root, V1, UNF:6:RPd9EWHSZwqUvRZuKTJMqg== [fileUNF]" + } +} diff --git a/src/test/resources/croissant/cars/in/datasetORE.json b/src/test/resources/croissant/cars/in/datasetORE.json new file mode 100644 index 00000000000..0b244ada0c1 --- /dev/null +++ b/src/test/resources/croissant/cars/in/datasetORE.json @@ -0,0 +1,133 @@ +{ + "dcterms:modified": "2025-05-19", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.6", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/CY7BWA", + "ore:describes": { + "citation:datasetContact": { + "citation:datasetContactName": "Durbin, Philip", + "citation:datasetContactAffiliation": "Harvard", + "citation:datasetContactEmail": "dataverse@mailinator.com" + }, + "author": { + "citation:authorName": "Durbin, Philip", + "citation:authorAffiliation": "Harvard" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "This dataset is about cars." 
+ }, + "dateOfDeposit": "2024-03-13", + "title": "Cars", + "citation:depositor": "Durbin, Philip", + "subject": "Other", + "@id": "https://doi.org/10.5072/FK2/CY7BWA", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "Cars", + "schema:dateModified": "Fri May 16 16:33:18 UTC 2025", + "schema:datePublished": "2025-05-16", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Cars", + "@id": "http://localhost:8080/dataverse/cars", + "schema:description": "Data about cars.", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." + } + }, + "ore:aggregates": [ + { + "schema:description": "", + "schema:name": "compute.py", + "dvcore:restricted": false, + "dvcore:directoryLabel": "code", + "schema:version": 2, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=7", + "schema:sameAs": "http://localhost:8080/api/access/datafile/7", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/x-python", + "dvcore:filesize": 15, + "dvcore:storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "d84985e94dde671f318076bd7a137f15" + } + }, + { + "schema:description": "", + "schema:name": "README.md", + "dvcore:restricted": false, + "dvcore:directoryLabel": "doc", + "schema:version": 2, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=8", + "schema:sameAs": "http://localhost:8080/api/access/datafile/8", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/markdown", + "dvcore:filesize": 28, + "dvcore:storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + 
"dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "a2e484d07ee5590cc32182dc2c6ccc83" + } + }, + { + "schema:description": "", + "schema:name": "stata13-auto.dta", + "dvcore:restricted": false, + "dvcore:directoryLabel": "data", + "schema:version": 4, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=9", + "schema:sameAs": "http://localhost:8080/api/access/datafile/9?format=original", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "application/x-stata-13", + "dvcore:filesize": 6443, + "dvcore:storageIdentifier": "local://196d9f15719-2270bfca2b48", + "dvcore:currentIngestedName": "stata13-auto.tab", + "dvcore:UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "7b1201ce6b469796837a835377338c5a" + } + } + ], + "schema:hasPart": [ + "http://localhost:8080/file.xhtml?fileId=7", + "http://localhost:8080/file.xhtml?fileId=8", + "http://localhost:8080/file.xhtml?fileId=9" + ] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/cars/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/cars/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..83f587c5fd7 --- /dev/null +++ b/src/test/resources/croissant/cars/in/datasetSchemaDotOrg.json @@ -0,0 +1,78 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/CY7BWA", + "identifier": "https://doi.org/10.5072/FK2/CY7BWA", + "name": "Cars", + "creator": [ + { + "@type": 
"Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "version": "1", + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + }, + "distribution": [ + { + "@type": "DataDownload", + "name": "compute.py", + "encodingFormat": "text/x-python", + "contentSize": 15, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "DataDownload", + "name": "stata13-auto.tab", + "encodingFormat": "text/tab-separated-values", + "contentSize": 4026, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/9" + }, + { + "@type": "DataDownload", + "name": "README.md", + "encodingFormat": "text/markdown", + "contentSize": 28, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ] +} diff --git a/src/test/resources/croissant/cars/out/croissant.json b/src/test/resources/croissant/cars/out/croissant.json new file mode 100644 index 00000000000..a9c0d48b217 --- /dev/null +++ b/src/test/resources/croissant/cars/out/croissant.json @@ -0,0 +1,302 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + 
"data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Cars", + "url": "https://doi.org/10.5072/FK2/CY7BWA", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/CY7BWA_2025,author = {Durbin, Philip},publisher = {Root},title = {Cars},year = {2025},url = {https://doi.org/10.5072/FK2/CY7BWA}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "code/compute.py", + "name": "compute.py", + "encodingFormat": 
"text/x-python", + "md5": "d84985e94dde671f318076bd7a137f15", + "contentSize": "15", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "cr:FileObject", + "@id": "data/stata13-auto.dta", + "name": "stata13-auto.dta", + "encodingFormat": "application/x-stata-13", + "md5": "7b1201ce6b469796837a835377338c5a", + "contentSize": "6443", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/9?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "contentSize": "28", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "field": [ + { + "@type": "cr:Field", + "name": "make", + "description": "Make and Model", + "dataType": "sc:Text", + "source": { + "@id": "2", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "make" + } + } + }, + { + "@type": "cr:Field", + "name": "price", + "description": "Price", + "dataType": "sc:Integer", + "source": { + "@id": "5", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "price" + } + } + }, + { + "@type": "cr:Field", + "name": "mpg", + "description": "Mileage (mpg)", + "dataType": "sc:Integer", + "source": { + "@id": "3", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "mpg" + } + } + }, + { + "@type": "cr:Field", + "name": "rep78", + "description": "Repair Record 1978", + "dataType": "sc:Integer", + "source": { + "@id": "12", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "rep78" + } + } + }, + { + "@type": "cr:Field", + "name": "headroom", + "description": "Headroom (in.)", + "dataType": "sc:Float", + "source": { + "@id": "1", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": 
"headroom" + } + } + }, + { + "@type": "cr:Field", + "name": "trunk", + "description": "Trunk space (cu. ft.)", + "dataType": "sc:Integer", + "source": { + "@id": "7", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "trunk" + } + } + }, + { + "@type": "cr:Field", + "name": "weight", + "description": "Weight (lbs.)", + "dataType": "sc:Integer", + "source": { + "@id": "4", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "weight" + } + } + }, + { + "@type": "cr:Field", + "name": "length", + "description": "Length (in.)", + "dataType": "sc:Integer", + "source": { + "@id": "8", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "length" + } + } + }, + { + "@type": "cr:Field", + "name": "turn", + "description": "Turn Circle (ft.) ", + "dataType": "sc:Integer", + "source": { + "@id": "9", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "turn" + } + } + }, + { + "@type": "cr:Field", + "name": "displacement", + "description": "Displacement (cu. 
in.)", + "dataType": "sc:Integer", + "source": { + "@id": "10", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "displacement" + } + } + }, + { + "@type": "cr:Field", + "name": "gear_ratio", + "description": "Gear Ratio", + "dataType": "sc:Float", + "source": { + "@id": "6", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "gear_ratio" + } + } + }, + { + "@type": "cr:Field", + "name": "foreign", + "description": "Car type", + "dataType": "sc:Integer", + "source": { + "@id": "11", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "foreign" + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/draft/expected/draft-croissant.json b/src/test/resources/croissant/draft/expected/draft-croissant.json new file mode 100644 index 00000000000..b2065f79195 --- /dev/null +++ b/src/test/resources/croissant/draft/expected/draft-croissant.json @@ -0,0 +1,94 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": 
"cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Draft Dataset", + "url": "https://doi.org/10.5072/FK2/OO7TEP", + "creator": [ + { + "@type": "Person", + "givenName": "Draft", + "familyName": "Punk", + "affiliation": { + "@type": "Organization", + "name": "French house" + }, + "name": "Punk, Draft" + } + ], + "description": "This dataset hasn't been published yet.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "dateModified": "", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "DRAFT", + "citeAs": "@data{FK2/OO7TEP,author = {Punk, Draft},publisher = {Root},title = {Draft Dataset},url = {https://doi.org/10.5072/FK2/OO7TEP}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "data.txt", + "name": "data.txt", + "encodingFormat": "text/plain", + "md5": "050644e853fdfe46a3707695ba2fe736", + "contentSize": "18", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/4" + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/draft/in/dataCiteXml.xml b/src/test/resources/croissant/draft/in/dataCiteXml.xml new file mode 100644 index 00000000000..814f3d365e7 --- /dev/null +++ b/src/test/resources/croissant/draft/in/dataCiteXml.xml @@ -0,0 +1,46 @@ + + + 10.5072/FK2/OO7TEP + + + Punk, Draft + Draft + Punk + French house + + + + Draft Dataset + + Root + 2025 + + Other + + + + Admin, Dataverse + Dataverse + Admin + Dataverse.org + + + + 2025-04-14 + + + + 18 + + + text/plain + + DRAFT + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. 
+ + + This dataset hasn&apos;t been published yet. + + diff --git a/src/test/resources/croissant/draft/in/datasetFileDetails.json b/src/test/resources/croissant/draft/in/datasetFileDetails.json new file mode 100644 index 00000000000..1460aedba00 --- /dev/null +++ b/src/test/resources/croissant/draft/in/datasetFileDetails.json @@ -0,0 +1,23 @@ +[ + { + "id": 4, + "persistentId": "", + "filename": "data.txt", + "contentType": "text/plain", + "friendlyType": "Plain Text", + "filesize": 18, + "storageIdentifier": "local://196347bdb85-7b4820f8e4ef", + "rootDataFileId": -1, + "md5": "050644e853fdfe46a3707695ba2fe736", + "checksum": { + "type": "MD5", + "value": "050644e853fdfe46a3707695ba2fe736" + }, + "tabularData": false, + "creationDate": "2025-04-14", + "fileAccessRequest": false, + "restricted": false, + "fileMetadataId": 1, + "varGroups": [] + } +] diff --git a/src/test/resources/croissant/draft/in/datasetJson.json b/src/test/resources/croissant/draft/in/datasetJson.json new file mode 100644 index 00000000000..bbfd30ed03a --- /dev/null +++ b/src/test/resources/croissant/draft/in/datasetJson.json @@ -0,0 +1,156 @@ +{ + "id": 3, + "identifier": "FK2/OO7TEP", + "persistentUrl": "https://doi.org/10.5072/FK2/OO7TEP", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "storageIdentifier": "local://10.5072/FK2/OO7TEP", + "datasetType": "dataset", + "datasetVersion": { + "id": 1, + "datasetId": 3, + "datasetPersistentId": "doi:10.5072/FK2/OO7TEP", + "storageIdentifier": "local://10.5072/FK2/OO7TEP", + "internalVersionNumber": 2, + "versionState": "DRAFT", + "latestVersionPublishingState": "DRAFT", + "lastUpdateTime": "2025-04-14T13:27:47Z", + "createTime": "2025-04-14T13:26:41Z", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": 
"https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Draft Dataset" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Punk, Draft" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "French house" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": "datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Admin, Dataverse" + }, + "datasetContactAffiliation": { + "typeName": "datasetContactAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Dataverse.org" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "This dataset hasn't been published yet." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Admin, Dataverse" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2025-04-14" + } + ] + } + }, + "files": [ + { + "label": "data.txt", + "restricted": false, + "version": 1, + "datasetVersionId": 1, + "dataFile": { + "id": 4, + "persistentId": "", + "filename": "data.txt", + "contentType": "text/plain", + "friendlyType": "Plain Text", + "filesize": 18, + "storageIdentifier": "local://196347bdb85-7b4820f8e4ef", + "rootDataFileId": -1, + "md5": "050644e853fdfe46a3707695ba2fe736", + "checksum": { + "type": "MD5", + "value": "050644e853fdfe46a3707695ba2fe736" + }, + "tabularData": false, + "creationDate": "2025-04-14", + "fileAccessRequest": false + } + } + ], + "citation": "Punk, Draft, 2025, \"Draft Dataset\", https://doi.org/10.5072/FK2/OO7TEP, Root, DRAFT VERSION" + } +} diff --git a/src/test/resources/croissant/draft/in/datasetORE.json b/src/test/resources/croissant/draft/in/datasetORE.json new file mode 100644 index 00000000000..8f9cfe6fb63 --- /dev/null +++ b/src/test/resources/croissant/draft/in/datasetORE.json @@ -0,0 +1,87 @@ +{ + "dcterms:modified": "2025-04-14", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.6", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/OO7TEP", + "ore:describes": { + "author": { + "citation:authorName": "Punk, Draft", + "citation:authorAffiliation": "French house" + }, + "citation:datasetContact": { + "citation:datasetContactName": "Admin, Dataverse", 
+ "citation:datasetContactAffiliation": "Dataverse.org", + "citation:datasetContactEmail": "dataverse@mailinator.com" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "This dataset hasn't been published yet." + }, + "dateOfDeposit": "2025-04-14", + "citation:depositor": "Admin, Dataverse", + "subject": "Other", + "title": "Draft Dataset", + "@id": "https://doi.org/10.5072/FK2/OO7TEP", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "DRAFT", + "schema:name": "Draft Dataset", + "schema:dateModified": "Mon Apr 14 13:27:47 UTC 2025", + "schema:creativeWorkStatus": "DRAFT", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Draft Collection", + "@id": "http://localhost:8080/dataverse/draft", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." 
+ } + }, + "ore:aggregates": [ + { + "schema:name": "data.txt", + "dvcore:restricted": false, + "schema:version": 1, + "dvcore:datasetVersionId": 1, + "@id": "http://localhost:8080/file.xhtml?fileId=4", + "schema:sameAs": "http://localhost:8080/api/access/datafile/4", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/plain", + "dvcore:filesize": 18, + "dvcore:storageIdentifier": "local://196347bdb85-7b4820f8e4ef", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "050644e853fdfe46a3707695ba2fe736" + } + } + ], + "schema:hasPart": [ + "http://localhost:8080/file.xhtml?fileId=4" + ] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/draft/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/draft/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..62328140af8 --- /dev/null +++ b/src/test/resources/croissant/draft/in/datasetSchemaDotOrg.json @@ -0,0 +1,60 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/OO7TEP", + "identifier": "https://doi.org/10.5072/FK2/OO7TEP", + "name": "Draft Dataset", + "creator": [ + { + "@type": "Person", + "givenName": "Draft", + "familyName": "Punk", + "affiliation": { + "@type": "Organization", + "name": "French house" + }, + "name": "Punk, Draft" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Draft", + "familyName": "Punk", + "affiliation": { + "@type": "Organization", + "name": "French house" + }, + "name": "Punk, Draft" + } + ], + "dateModified": "", + 
"version": "DRAFT", + "description": "This dataset hasn't been published yet.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + }, + "distribution": [ + { + "@type": "DataDownload", + "name": "data.txt", + "encodingFormat": "text/plain", + "contentSize": 18, + "contentUrl": "http://localhost:8080/api/access/datafile/4" + } + ] +} diff --git a/src/test/resources/croissant/draft/out/croissant.json b/src/test/resources/croissant/draft/out/croissant.json new file mode 100644 index 00000000000..b2065f79195 --- /dev/null +++ b/src/test/resources/croissant/draft/out/croissant.json @@ -0,0 +1,94 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + 
"subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Draft Dataset", + "url": "https://doi.org/10.5072/FK2/OO7TEP", + "creator": [ + { + "@type": "Person", + "givenName": "Draft", + "familyName": "Punk", + "affiliation": { + "@type": "Organization", + "name": "French house" + }, + "name": "Punk, Draft" + } + ], + "description": "This dataset hasn't been published yet.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "dateModified": "", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "DRAFT", + "citeAs": "@data{FK2/OO7TEP,author = {Punk, Draft},publisher = {Root},title = {Draft Dataset},url = {https://doi.org/10.5072/FK2/OO7TEP}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "data.txt", + "name": "data.txt", + "encodingFormat": "text/plain", + "md5": "050644e853fdfe46a3707695ba2fe736", + "contentSize": "18", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/4" + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/junk/expected/junk-croissant.json b/src/test/resources/croissant/junk/expected/junk-croissant.json new file mode 100644 index 00000000000..b02bed5694e --- /dev/null +++ b/src/test/resources/croissant/junk/expected/junk-croissant.json @@ -0,0 +1,83 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + 
"examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "</script><script>alert(666)</script>", + "url": "https://doi.org/10.5072/FK2/0CNXUJ", + "creator": [ + { + "@type": "Person", + "givenName": "Sylvester", + "familyName": "Ritter", + "affiliation": { + "@type": "Organization", + "name": "WWF" + }, + "name": "Ritter, Sylvester" + } + ], + "description": "A junk dataset.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-03-13", + "dateModified": "2025-03-13", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/0CNXUJ_2025,author = {Ritter, Sylvester},publisher = {Root},title = {},year = {2025},url = {https://doi.org/10.5072/FK2/0CNXUJ}}" +} \ No newline at end of file diff --git a/src/test/resources/croissant/junk/in/dataCiteXml.xml b/src/test/resources/croissant/junk/in/dataCiteXml.xml new file mode 100644 index 00000000000..d6c11b056e2 --- /dev/null +++ b/src/test/resources/croissant/junk/in/dataCiteXml.xml 
@@ -0,0 +1,33 @@ + + + 10.5072/FK2/0CNXUJ + + + Ritter, Sylvester + Sylvester + Ritter + WWF + + + + :unav + + Root + 2025 + + Other + + + 2025-03-13 + 2025-03-13 + + + 1.0 + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + A junk dataset. + + diff --git a/src/test/resources/croissant/junk/in/datasetFileDetails.json b/src/test/resources/croissant/junk/in/datasetFileDetails.json new file mode 100644 index 00000000000..fe51488c706 --- /dev/null +++ b/src/test/resources/croissant/junk/in/datasetFileDetails.json @@ -0,0 +1 @@ +[] diff --git a/src/test/resources/croissant/junk/in/datasetJson.json b/src/test/resources/croissant/junk/in/datasetJson.json new file mode 100644 index 00000000000..984ae55cb92 --- /dev/null +++ b/src/test/resources/croissant/junk/in/datasetJson.json @@ -0,0 +1,124 @@ +{ + "id": 2, + "identifier": "FK2/0CNXUJ", + "persistentUrl": "https://doi.org/10.5072/FK2/0CNXUJ", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2025-03-13", + "storageIdentifier": "local://10.5072/FK2/0CNXUJ", + "datasetType": "dataset", + "datasetVersion": { + "id": 1, + "datasetId": 2, + "datasetPersistentId": "doi:10.5072/FK2/0CNXUJ", + "storageIdentifier": "local://10.5072/FK2/0CNXUJ", + "versionNumber": 1, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "lastUpdateTime": "2025-03-13T14:56:36Z", + "releaseTime": "2025-03-13T14:56:36Z", + "createTime": "2025-03-13T14:56:26Z", + "publicationDate": "2025-03-13", + "citationDate": "2025-03-13", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation 
Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Ritter, Sylvester" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "WWF" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "A junk dataset." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Admin, Dataverse" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2025-03-13" + } + ] + } + }, + "files": [], + "citation": "Ritter, Sylvester, 2025, https://doi.org/10.5072/FK2/0CNXUJ, Root, V1" + } +} diff --git a/src/test/resources/croissant/junk/in/datasetORE.json b/src/test/resources/croissant/junk/in/datasetORE.json new file mode 100644 index 00000000000..646955bbb17 --- /dev/null +++ b/src/test/resources/croissant/junk/in/datasetORE.json @@ -0,0 +1,62 @@ +{ + "dcterms:modified": "2025-03-13", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.5", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/0CNXUJ", + "ore:describes": { + "citation:dsDescription": { + "citation:dsDescriptionValue": "A junk dataset." 
+ }, + "author": { + "citation:authorName": "Ritter, Sylvester", + "citation:authorAffiliation": "WWF" + }, + "citation:datasetContact": { + "citation:datasetContactEmail": "dataverse@mailinator.com" + }, + "citation:depositor": "Admin, Dataverse", + "subject": "Other", + "title": "", + "dateOfDeposit": "2025-03-13", + "@id": "https://doi.org/10.5072/FK2/0CNXUJ", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "", + "schema:dateModified": "Thu Mar 13 14:56:36 UTC 2025", + "schema:datePublished": "2025-03-13", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." + }, + "ore:aggregates": [], + "schema:hasPart": [] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/junk/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/junk/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..e487f075115 --- /dev/null +++ b/src/test/resources/croissant/junk/in/datasetSchemaDotOrg.json @@ -0,0 +1,52 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/0CNXUJ", + "identifier": "https://doi.org/10.5072/FK2/0CNXUJ", + "name": "", + "creator": [ + { + "@type": "Person", + "givenName": "Sylvester", + 
"familyName": "Ritter", + "affiliation": { + "@type": "Organization", + "name": "WWF" + }, + "name": "Ritter, Sylvester" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Sylvester", + "familyName": "Ritter", + "affiliation": { + "@type": "Organization", + "name": "WWF" + }, + "name": "Ritter, Sylvester" + } + ], + "datePublished": "2025-03-13", + "dateModified": "2025-03-13", + "version": "1", + "description": "A junk dataset.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + } +} diff --git a/src/test/resources/croissant/junk/out/croissant.json b/src/test/resources/croissant/junk/out/croissant.json new file mode 100644 index 00000000000..b02bed5694e --- /dev/null +++ b/src/test/resources/croissant/junk/out/croissant.json @@ -0,0 +1,83 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": 
"cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "</script><script>alert(666)</script>", + "url": "https://doi.org/10.5072/FK2/0CNXUJ", + "creator": [ + { + "@type": "Person", + "givenName": "Sylvester", + "familyName": "Ritter", + "affiliation": { + "@type": "Organization", + "name": "WWF" + }, + "name": "Ritter, Sylvester" + } + ], + "description": "A junk dataset.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-03-13", + "dateModified": "2025-03-13", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/0CNXUJ_2025,author = {Ritter, Sylvester},publisher = {Root},title = {},year = {2025},url = {https://doi.org/10.5072/FK2/0CNXUJ}}" +} \ No newline at end of file diff --git a/src/test/resources/croissant/max/expected/max-croissant.json b/src/test/resources/croissant/max/expected/max-croissant.json new file mode 100644 index 00000000000..bf1941c7289 --- /dev/null +++ b/src/test/resources/croissant/max/expected/max-croissant.json @@ -0,0 +1,196 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": 
"cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Max Schema.org", + "url": "https://doi.org/10.5072/FK2/VQTYHD", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "sameAs": "https://orcid.org/0000-0002-9528-9470", + "@id": "https://orcid.org/0000-0002-9528-9470", + "identifier": "https://orcid.org/0000-0002-9528-9470", + "name": "Durbin, Philip" + }, + { + "@type": "Person", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "name": "IQSS" + } + ], + "description": "Exercising fields used by `schema.org` exporter.", + "keywords": [ + "Social Sciences", + "Other", + "foo", + "bar" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2024-05-01", + "dateModified": "2025-05-21", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "3.0", + "citeAs": "@data{FK2/VQTYHD_2024,author = {Durbin, Philip and IQSS},publisher = {Root},title = {Max Schema.org},year = {2024},url = 
{https://doi.org/10.5072/FK2/VQTYHD}}", + "funder": [ + { + "@type": "Organization", + "name": "NSF" + }, + { + "@type": "Organization", + "name": "NIH" + } + ], + "spatialCoverage": [ + "Cambridge, MA, United States, Harvard Square" + ], + "citation": [ + { + "@type": "CreativeWork", + "name": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. https://doi.org/10.5281/zenodo.10843668", + "@id": "https://doi.org/10.5281/zenodo.10843668", + "identifier": "https://doi.org/10.5281/zenodo.10843668", + "url": "https://doi.org/10.5281/zenodo.10843668" + } + ], + "temporalCoverage": [ + "2023-01-01/2023-12-31" + ], + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "data.tsv", + "name": "data.tsv", + "encodingFormat": "text/tab-separated-values", + "md5": "3663d6a436ac00f5541a7336d6fa18c9", + "contentSize": "33", + "description": "", + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26646?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "ebf050ec8cce5df0a72b100cfc9f442f", + "contentSize": "34", + "description": "Additional documentation.", + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26148" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "field": [ + { + "@type": "cr:Field", + "name": "foo", + "description": "foo", + "dataType": "sc:Text", + "source": { + "@id": "1287", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + "column": "foo" + } + } + }, + { + "@type": "cr:Field", + "name": "bar", + "description": "bar", + "dataType": "sc:Integer", + "source": { + "@id": "1285", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + "column": "bar" + } + } + }, + { + "@type": "cr:Field", + "name": "baz", + "description": "baz", + "dataType": "sc:Integer", + "source": { + "@id": "1286", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + 
"column": "baz" + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/max/in/dataCiteXml.xml b/src/test/resources/croissant/max/in/dataCiteXml.xml new file mode 100644 index 00000000000..e91c0583b71 --- /dev/null +++ b/src/test/resources/croissant/max/in/dataCiteXml.xml @@ -0,0 +1,77 @@ + + + 10.5072/FK2/VQTYHD + + + Durbin, Philip + Philip + Durbin + https://orcid.org/0000-0002-9528-9470 + Harvard University + + + IQSS + Harvard University + + + + Max Schema.org + + Root + 2024 + + Social Sciences + Other + foo + bar + + + + Durbin, Philip + Philip + Durbin + + + + 2024-05-01 + 2024-05-01 + 2025-05-21 + 2023-01-01/2023-12-31 + + + + 10.5281/ZENODO.10843668 + + + 34 + 21865 + 27 + + + text/markdown + text/tab-separated-values + text/tab-separated-values + + 3.0 + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + Exercising fields used by `schema.org` exporter. + + + + United States, MA,, Cambridge,, Harvard Square, + + + + + NSF + + + NIH + 3OT2DB000004-01S3 + + + diff --git a/src/test/resources/croissant/max/in/datasetFileDetails.json b/src/test/resources/croissant/max/in/datasetFileDetails.json new file mode 100644 index 00000000000..35881e3eae1 --- /dev/null +++ b/src/test/resources/croissant/max/in/datasetFileDetails.json @@ -0,0 +1,117 @@ +[ + { + "id": 26646, + "persistentId": "", + "filename": "data.tab", + "contentType": "text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 27, + "storageIdentifier": "s3://beta-dataverse-direct:196f44cc758-4d710ffac5d8", + "originalFileFormat": "text/tsv", + "originalFormatLabel": "Tab-Separated Values", + "originalFileSize": 33, + "originalFileName": "data.tsv", + "UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "rootDataFileId": -1, + "md5": "3663d6a436ac00f5541a7336d6fa18c9", + "checksum": { + "type": "MD5", + "value": "3663d6a436ac00f5541a7336d6fa18c9" + }, + "tabularData": true, + "creationDate": "2025-05-21", + "publicationDate": 
"2025-05-21", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 32509, + "dataTables": [ + { + "varQuantity": 3, + "caseQuantity": 3, + "UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "dataVariables": [ + { + "id": 1287, + "name": "foo", + "label": "foo", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "CHARACTER", + "isOrderedCategorical": false, + "fileOrder": 0, + "UNF": "UNF:6:FWBO/a1GcxDnM3fNLdzrHw==", + "variableMetadata": [] + }, + { + "id": 1285, + "name": "bar", + "label": "bar", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 1, + "UNF": "UNF:6:AvELPR5QTaBbnq6S22Msow==", + "variableMetadata": [], + "summaryStatistics": { + "mode": ".", + "invd": "0.0", + "min": "1.0", + "stdev": "1.0", + "max": "3.0", + "vald": "3.0", + "mean": "2.0", + "medn": "2.0" + } + }, + { + "id": 1286, + "name": "baz", + "label": "baz", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 2, + "UNF": "UNF:6:WkRUZjFbozW1nFYiqMGWeQ==", + "variableMetadata": [], + "summaryStatistics": { + "mean": "20.0", + "mode": ".", + "min": "10.0", + "max": "30.0", + "invd": "0.0", + "stdev": "10.0", + "vald": "3.0", + "medn": "20.0" + } + } + ] + } + ], + "varGroups": [] + }, + { + "id": 26148, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 34, + "description": "Additional documentation.", + "storageIdentifier": "s3://beta-dataverse-direct:18f35bee76a-f45ece0b0fcc", + "rootDataFileId": -1, + "md5": "ebf050ec8cce5df0a72b100cfc9f442f", + "checksum": { + "type": "MD5", + "value": "ebf050ec8cce5df0a72b100cfc9f442f" + }, + "tabularData": false, + "creationDate": "2024-05-01", + "publicationDate": "2024-05-01", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 
32511, + "varGroups": [] + } +] diff --git a/src/test/resources/croissant/max/in/datasetJson.json b/src/test/resources/croissant/max/in/datasetJson.json new file mode 100644 index 00000000000..a0ddaa54436 --- /dev/null +++ b/src/test/resources/croissant/max/in/datasetJson.json @@ -0,0 +1,376 @@ +{ + "id": 26147, + "identifier": "FK2/VQTYHD", + "persistentUrl": "https://doi.org/10.5072/FK2/VQTYHD", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2024-05-01", + "storageIdentifier": "s3://10.5072/FK2/VQTYHD", + "datasetType": "dataset", + "datasetVersion": { + "id": 266, + "datasetId": 26147, + "datasetPersistentId": "doi:10.5072/FK2/VQTYHD", + "storageIdentifier": "s3://10.5072/FK2/VQTYHD", + "versionNumber": 3, + "internalVersionNumber": 7, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "lastUpdateTime": "2025-05-21T19:25:29Z", + "releaseTime": "2025-05-21T19:25:29Z", + "createTime": "2025-05-21T19:23:21Z", + "publicationDate": "2024-05-01", + "citationDate": "2024-05-01", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Max Schema.org" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": 
false, + "typeClass": "primitive", + "value": "Harvard University" + }, + "authorIdentifierScheme": { + "typeName": "authorIdentifierScheme", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "ORCID" + }, + "authorIdentifier": { + "typeName": "authorIdentifier", + "multiple": false, + "typeClass": "primitive", + "value": "0000-0002-9528-9470" + } + }, + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "IQSS" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard University" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": "datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "philip_durbin@harvard.edu" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "Exercising fields used by `schema.org` exporter." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Social Sciences", + "Other" + ] + }, + { + "typeName": "keyword", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "keywordValue": { + "typeName": "keywordValue", + "multiple": false, + "typeClass": "primitive", + "value": "foo" + } + }, + { + "keywordValue": { + "typeName": "keywordValue", + "multiple": false, + "typeClass": "primitive", + "value": "bar" + } + } + ] + }, + { + "typeName": "publication", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "publicationCitation": { + "typeName": "publicationCitation", + "multiple": false, + "typeClass": "primitive", + "value": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. https://doi.org/10.5281/zenodo.10843668" + }, + "publicationIDType": { + "typeName": "publicationIDType", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "doi" + }, + "publicationIDNumber": { + "typeName": "publicationIDNumber", + "multiple": false, + "typeClass": "primitive", + "value": "10.5281/zenodo.10843668" + }, + "publicationURL": { + "typeName": "publicationURL", + "multiple": false, + "typeClass": "primitive", + "value": "https://doi.org/10.5281/zenodo.10843668" + } + } + ] + }, + { + "typeName": "contributor", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "contributorType": { + "typeName": "contributorType", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "Funder" + }, + "contributorName": { + "typeName": "contributorName", + "multiple": false, + "typeClass": "primitive", + "value": "NSF" + } + } + ] + }, + { + "typeName": "grantNumber", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "grantNumberAgency": { + "typeName": "grantNumberAgency", + "multiple": false, + "typeClass": "primitive", + "value": "NIH" + }, + "grantNumberValue": 
{ + "typeName": "grantNumberValue", + "multiple": false, + "typeClass": "primitive", + "value": "3OT2DB000004-01S3" + } + } + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2024-05-01" + }, + { + "typeName": "timePeriodCovered", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "timePeriodCoveredStart": { + "typeName": "timePeriodCoveredStart", + "multiple": false, + "typeClass": "primitive", + "value": "2023-01-01" + }, + "timePeriodCoveredEnd": { + "typeName": "timePeriodCoveredEnd", + "multiple": false, + "typeClass": "primitive", + "value": "2023-12-31" + } + } + ] + } + ] + }, + "geospatial": { + "displayName": "Geospatial Metadata", + "name": "geospatial", + "fields": [ + { + "typeName": "geographicCoverage", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "country": { + "typeName": "country", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "United States" + }, + "state": { + "typeName": "state", + "multiple": false, + "typeClass": "primitive", + "value": "MA" + }, + "city": { + "typeName": "city", + "multiple": false, + "typeClass": "primitive", + "value": "Cambridge" + }, + "otherGeographicCoverage": { + "typeName": "otherGeographicCoverage", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard Square" + } + } + ] + } + ] + } + }, + "files": [ + { + "description": "Additional documentation.", + "label": "README.md", + "restricted": false, + "directoryLabel": "doc", + "version": 1, + "datasetVersionId": 266, + "dataFile": { + "id": 26148, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 34, + "description": "Additional documentation.", + "storageIdentifier": "s3://beta-dataverse-direct:18f35bee76a-f45ece0b0fcc", + "rootDataFileId": -1, + 
"md5": "ebf050ec8cce5df0a72b100cfc9f442f", + "checksum": { + "type": "MD5", + "value": "ebf050ec8cce5df0a72b100cfc9f442f" + }, + "tabularData": false, + "creationDate": "2024-05-01", + "publicationDate": "2024-05-01", + "fileAccessRequest": true + } + }, + { + "label": "data.tab", + "restricted": false, + "version": 3, + "datasetVersionId": 266, + "dataFile": { + "id": 26646, + "persistentId": "", + "filename": "data.tab", + "contentType": "text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 27, + "storageIdentifier": "s3://beta-dataverse-direct:196f44cc758-4d710ffac5d8", + "originalFileFormat": "text/tsv", + "originalFormatLabel": "Tab-Separated Values", + "originalFileSize": 33, + "originalFileName": "data.tsv", + "UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "rootDataFileId": -1, + "md5": "3663d6a436ac00f5541a7336d6fa18c9", + "checksum": { + "type": "MD5", + "value": "3663d6a436ac00f5541a7336d6fa18c9" + }, + "tabularData": true, + "creationDate": "2025-05-21", + "publicationDate": "2025-05-21", + "fileAccessRequest": true + } + } + ], + "citation": "Durbin, Philip; IQSS, 2024, \"Max Schema.org\", https://doi.org/10.5072/FK2/VQTYHD, Root, V3, UNF:6:ngOUmEnfm08jahzBYqStQA== [fileUNF]" + } +} diff --git a/src/test/resources/croissant/max/in/datasetORE.json b/src/test/resources/croissant/max/in/datasetORE.json new file mode 100644 index 00000000000..2c3cce7ab6a --- /dev/null +++ b/src/test/resources/croissant/max/in/datasetORE.json @@ -0,0 +1,163 @@ +{ + "dcterms:modified": "2025-05-21", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.6 build develop-c4379a0", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "https://beta.dataverse.org/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/VQTYHD", + "ore:describes": 
{ + "author": [ + { + "citation:authorName": "Durbin, Philip", + "citation:authorAffiliation": "Harvard University", + "authorIdentifierScheme": "ORCID", + "authorIdentifier": "0000-0002-9528-9470" + }, + { + "citation:authorName": "IQSS", + "citation:authorAffiliation": "Harvard University" + } + ], + "citation:keyword": [ + { + "citation:keywordValue": "foo" + }, + { + "citation:keywordValue": "bar" + } + ], + "timePeriodCovered": { + "citation:timePeriodCoveredStart": "2023-01-01", + "citation:timePeriodCoveredEnd": "2023-12-31" + }, + "contributor": { + "citation:contributorType": "Funder", + "citation:contributorName": "NSF" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "Exercising fields used by `schema.org` exporter." + }, + "publication": { + "publicationCitation": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. https://doi.org/10.5281/zenodo.10843668", + "publicationIDType": "doi", + "publicationIDNumber": "10.5281/zenodo.10843668", + "publicationURL": "https://doi.org/10.5281/zenodo.10843668" + }, + "grantNumber": { + "citation:grantNumberAgency": "NIH", + "citation:grantNumberValue": "3OT2DB000004-01S3" + }, + "geospatial:geographicCoverage": { + "geospatial:country": "United States", + "geospatial:state": "MA", + "geospatial:city": "Cambridge", + "geospatial:otherGeographicCoverage": "Harvard Square" + }, + "citation:datasetContact": { + "citation:datasetContactName": "Durbin, Philip", + "citation:datasetContactEmail": "philip_durbin@harvard.edu" + }, + "dateOfDeposit": "2024-05-01", + "subject": [ + "Social Sciences", + "Other" + ], + "citation:depositor": "Durbin, Philip", + "title": "Max Schema.org", + "@id": "https://doi.org/10.5072/FK2/VQTYHD", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "3.0", + "schema:name": "Max Schema.org", + "schema:dateModified": "2025-05-21 19:25:29.653", + "schema:datePublished": "2024-05-01", + 
"schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Philip Durbin Dataverse", + "@id": "https://beta.dataverse.org/dataverse/pdurbin", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "https://beta.dataverse.org/dataverse/root", + "schema:description": "The root dataverse." + } + }, + "ore:aggregates": [ + { + "schema:description": "Additional documentation.", + "schema:name": "README.md", + "dvcore:restricted": false, + "dvcore:directoryLabel": "doc", + "schema:version": 1, + "dvcore:datasetVersionId": 266, + "@id": "https://beta.dataverse.org/file.xhtml?fileId=26148", + "schema:sameAs": "https://beta.dataverse.org/api/access/datafile/26148", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/markdown", + "dvcore:filesize": 34, + "dvcore:storageIdentifier": "s3://beta-dataverse-direct:18f35bee76a-f45ece0b0fcc", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "ebf050ec8cce5df0a72b100cfc9f442f" + } + }, + { + "schema:name": "data.tsv", + "dvcore:restricted": false, + "schema:version": 3, + "dvcore:datasetVersionId": 266, + "@id": "https://beta.dataverse.org/file.xhtml?fileId=26646", + "schema:sameAs": "https://beta.dataverse.org/api/access/datafile/26646?format=original", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/tsv", + "dvcore:filesize": 33, + "dvcore:storageIdentifier": "s3://beta-dataverse-direct:196f44cc758-4d710ffac5d8", + "dvcore:currentIngestedName": "data.tab", + "dvcore:UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "3663d6a436ac00f5541a7336d6fa18c9" + } + } + ], + "schema:hasPart": [ + "https://beta.dataverse.org/file.xhtml?fileId=26148", + 
"https://beta.dataverse.org/file.xhtml?fileId=26646" + ] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "authorIdentifier": "http://purl.org/spar/datacite/AgentIdentifier", + "authorIdentifierScheme": "http://purl.org/spar/datacite/AgentIdentifierScheme", + "citation": "https://dataverse.org/schema/citation/", + "contributor": "http://purl.org/dc/terms/contributor", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "geospatial": "https://beta.dataverse.org/schema/geospatial#", + "grantNumber": "https://schema.org/sponsor", + "ore": "http://www.openarchives.org/ore/terms/", + "publication": "http://purl.org/dc/terms/isReferencedBy", + "publicationCitation": "http://purl.org/dc/terms/bibliographicCitation", + "publicationIDNumber": "http://purl.org/spar/datacite/ResourceIdentifier", + "publicationIDType": "http://purl.org/spar/datacite/ResourceIdentifierScheme", + "publicationURL": "https://schema.org/distribution", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "timePeriodCovered": "https://schema.org/temporalCoverage", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/max/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/max/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..d3f764255e8 --- /dev/null +++ b/src/test/resources/croissant/max/in/datasetSchemaDotOrg.json @@ -0,0 +1,119 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/VQTYHD", + "identifier": "https://doi.org/10.5072/FK2/VQTYHD", + "name": "Max Schema.org", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "sameAs": "https://orcid.org/0000-0002-9528-9470", + "@id": 
"https://orcid.org/0000-0002-9528-9470", + "identifier": "https://orcid.org/0000-0002-9528-9470", + "name": "Durbin, Philip" + }, + { + "@type": "Person", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "name": "IQSS" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "sameAs": "https://orcid.org/0000-0002-9528-9470", + "@id": "https://orcid.org/0000-0002-9528-9470", + "identifier": "https://orcid.org/0000-0002-9528-9470", + "name": "Durbin, Philip" + }, + { + "@type": "Person", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "name": "IQSS" + } + ], + "datePublished": "2024-05-01", + "dateModified": "2025-05-21", + "version": "3", + "description": "Exercising fields used by `schema.org` exporter.", + "keywords": [ + "Social Sciences", + "Other", + "foo", + "bar" + ], + "citation": [ + { + "@type": "CreativeWork", + "name": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. 
https://doi.org/10.5281/zenodo.10843668", + "@id": "https://doi.org/10.5281/zenodo.10843668", + "identifier": "https://doi.org/10.5281/zenodo.10843668", + "url": "https://doi.org/10.5281/zenodo.10843668" + } + ], + "temporalCoverage": [ + "2023-01-01/2023-12-31" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + }, + "funder": [ + { + "@type": "Organization", + "name": "NSF" + }, + { + "@type": "Organization", + "name": "NIH" + } + ], + "spatialCoverage": [ + "Cambridge, MA, United States, Harvard Square" + ], + "distribution": [ + { + "@type": "DataDownload", + "name": "data.tab", + "encodingFormat": "text/tab-separated-values", + "contentSize": 27, + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26646" + }, + { + "@type": "DataDownload", + "name": "README.md", + "encodingFormat": "text/markdown", + "contentSize": 34, + "description": "Additional documentation.", + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26148" + } + ] +} diff --git a/src/test/resources/croissant/max/out/croissant.json b/src/test/resources/croissant/max/out/croissant.json new file mode 100644 index 00000000000..bf1941c7289 --- /dev/null +++ b/src/test/resources/croissant/max/out/croissant.json @@ -0,0 +1,196 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", 
+ "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Max Schema.org", + "url": "https://doi.org/10.5072/FK2/VQTYHD", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "sameAs": "https://orcid.org/0000-0002-9528-9470", + "@id": "https://orcid.org/0000-0002-9528-9470", + "identifier": "https://orcid.org/0000-0002-9528-9470", + "name": "Durbin, Philip" + }, + { + "@type": "Person", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "name": "IQSS" + } + ], + "description": "Exercising fields used by `schema.org` exporter.", + "keywords": [ + "Social Sciences", + "Other", + "foo", + "bar" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2024-05-01", + "dateModified": "2025-05-21", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "3.0", + "citeAs": "@data{FK2/VQTYHD_2024,author = {Durbin, Philip and IQSS},publisher = {Root},title = {Max Schema.org},year = {2024},url = 
{https://doi.org/10.5072/FK2/VQTYHD}}", + "funder": [ + { + "@type": "Organization", + "name": "NSF" + }, + { + "@type": "Organization", + "name": "NIH" + } + ], + "spatialCoverage": [ + "Cambridge, MA, United States, Harvard Square" + ], + "citation": [ + { + "@type": "CreativeWork", + "name": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. https://doi.org/10.5281/zenodo.10843668", + "@id": "https://doi.org/10.5281/zenodo.10843668", + "identifier": "https://doi.org/10.5281/zenodo.10843668", + "url": "https://doi.org/10.5281/zenodo.10843668" + } + ], + "temporalCoverage": [ + "2023-01-01/2023-12-31" + ], + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "data.tsv", + "name": "data.tsv", + "encodingFormat": "text/tab-separated-values", + "md5": "3663d6a436ac00f5541a7336d6fa18c9", + "contentSize": "33", + "description": "", + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26646?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "ebf050ec8cce5df0a72b100cfc9f442f", + "contentSize": "34", + "description": "Additional documentation.", + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26148" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "field": [ + { + "@type": "cr:Field", + "name": "foo", + "description": "foo", + "dataType": "sc:Text", + "source": { + "@id": "1287", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + "column": "foo" + } + } + }, + { + "@type": "cr:Field", + "name": "bar", + "description": "bar", + "dataType": "sc:Integer", + "source": { + "@id": "1285", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + "column": "bar" + } + } + }, + { + "@type": "cr:Field", + "name": "baz", + "description": "baz", + "dataType": "sc:Integer", + "source": { + "@id": "1286", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + 
"column": "baz" + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/minimal/expected/minimal-croissant.json b/src/test/resources/croissant/minimal/expected/minimal-croissant.json new file mode 100644 index 00000000000..7c47afc1485 --- /dev/null +++ b/src/test/resources/croissant/minimal/expected/minimal-croissant.json @@ -0,0 +1,79 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Minimal", + "url": "https://doi.org/10.5072/FK2/4C0JYC", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "name": "Durbin, Philip" + } + ], + "description": "Minimal metadata and no files.", + "keywords": [ + "Other" + ], + "license": 
"http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2024-05-01", + "dateModified": "2024-05-01", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/4C0JYC_2024,author = {Durbin, Philip},publisher = {Root},title = {Minimal},year = {2024},url = {https://doi.org/10.5072/FK2/4C0JYC}}" +} \ No newline at end of file diff --git a/src/test/resources/croissant/minimal/in/dataCiteXml.xml b/src/test/resources/croissant/minimal/in/dataCiteXml.xml new file mode 100644 index 00000000000..14feafba53d --- /dev/null +++ b/src/test/resources/croissant/minimal/in/dataCiteXml.xml @@ -0,0 +1,17 @@ + + + 10.5072/FK2/4C0JYC + Durbin, Philip + + Minimal + + Root + 2024 + + + Minimal metadata and no files. + + + diff --git a/src/test/resources/croissant/minimal/in/datasetFileDetails.json b/src/test/resources/croissant/minimal/in/datasetFileDetails.json new file mode 100644 index 00000000000..fe51488c706 --- /dev/null +++ b/src/test/resources/croissant/minimal/in/datasetFileDetails.json @@ -0,0 +1 @@ +[] diff --git a/src/test/resources/croissant/minimal/in/datasetJson.json b/src/test/resources/croissant/minimal/in/datasetJson.json new file mode 100644 index 00000000000..cedd4723dd5 --- /dev/null +++ b/src/test/resources/croissant/minimal/in/datasetJson.json @@ -0,0 +1,100 @@ +{ + "id": 26146, + "identifier": "FK2/4C0JYC", + "persistentUrl": "https://doi.org/10.5072/FK2/4C0JYC", + "protocol": "doi", + "authority": "10.5072", + "publisher": "Root", + "publicationDate": "2024-05-01", + "storageIdentifier": "s3://10.5072/FK2/4C0JYC", + "datasetVersion": { + "id": 108, + "datasetId": 26146, + "datasetPersistentId": "doi:10.5072/FK2/4C0JYC", + "storageIdentifier": "s3://10.5072/FK2/4C0JYC", + "versionNumber": 1, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": 
"RELEASED", + "lastUpdateTime": "2024-05-01T14:27:17Z", + "releaseTime": "2024-05-01T14:27:17Z", + "createTime": "2024-05-01T14:26:54Z", + "publicationDate": "2024-05-01", + "citationDate": "2024-05-01", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Minimal" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "philip_durbin@harvard.edu" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "Minimal metadata and no files." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + } + ] + } + }, + "files": [], + "citation": "Durbin, Philip, 2024, \"Minimal\", https://doi.org/10.5072/FK2/4C0JYC, Root, V1" + } +} diff --git a/src/test/resources/croissant/minimal/in/datasetORE.json b/src/test/resources/croissant/minimal/in/datasetORE.json new file mode 100644 index 00000000000..a76ec9ea0ac --- /dev/null +++ b/src/test/resources/croissant/minimal/in/datasetORE.json @@ -0,0 +1,62 @@ +{ + "dcterms:modified": "2024-05-01", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.0", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.2 build develop-e615050", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "https://beta.dataverse.org/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/4C0JYC", + "ore:describes": { + "citation:dsDescription": { + "citation:dsDescriptionValue": "Minimal metadata and no files." 
+ }, + "author": { + "citation:authorName": "Durbin, Philip" + }, + "citation:datasetContact": { + "citation:datasetContactEmail": "philip_durbin@harvard.edu" + }, + "title": "Minimal", + "subject": "Other", + "@id": "https://doi.org/10.5072/FK2/4C0JYC", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "Minimal", + "schema:dateModified": "2024-05-01 14:27:17.719", + "schema:datePublished": "2024-05-01", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Philip Durbin Dataverse", + "@id": "https://beta.dataverse.org/dataverse/pdurbin", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "https://beta.dataverse.org/dataverse/root", + "schema:description": "The root dataverse." + } + }, + "ore:aggregates": [], + "schema:hasPart": [] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/minimal/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/minimal/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..36dcab588a3 --- /dev/null +++ b/src/test/resources/croissant/minimal/in/datasetSchemaDotOrg.json @@ -0,0 +1,44 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/4C0JYC", + "identifier": "https://doi.org/10.5072/FK2/4C0JYC", + "name": "Minimal", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + 
"name": "Durbin, Philip" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "name": "Durbin, Philip" + } + ], + "datePublished": "2024-05-01", + "dateModified": "2024-05-01", + "version": "1", + "description": "Minimal metadata and no files.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + } +} diff --git a/src/test/resources/croissant/minimal/out/croissant.json b/src/test/resources/croissant/minimal/out/croissant.json new file mode 100644 index 00000000000..7c47afc1485 --- /dev/null +++ b/src/test/resources/croissant/minimal/out/croissant.json @@ -0,0 +1,79 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + 
"source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Minimal", + "url": "https://doi.org/10.5072/FK2/4C0JYC", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "name": "Durbin, Philip" + } + ], + "description": "Minimal metadata and no files.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2024-05-01", + "dateModified": "2024-05-01", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/4C0JYC_2024,author = {Durbin, Philip},publisher = {Root},title = {Minimal},year = {2024},url = {https://doi.org/10.5072/FK2/4C0JYC}}" +} \ No newline at end of file diff --git a/src/test/resources/croissant/restricted/expected/restricted-croissant.json b/src/test/resources/croissant/restricted/expected/restricted-croissant.json new file mode 100644 index 00000000000..19d970d1bbb --- /dev/null +++ b/src/test/resources/croissant/restricted/expected/restricted-croissant.json @@ -0,0 +1,115 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": 
"cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Cars", + "url": "https://doi.org/10.5072/FK2/CY7BWA", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/CY7BWA_2025,author = {Durbin, Philip},publisher = {Root},title = {Cars},year = {2025},url = {https://doi.org/10.5072/FK2/CY7BWA}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "code/compute.py", + "name": "compute.py", + "encodingFormat": "text/x-python", + "md5": "d84985e94dde671f318076bd7a137f15", + "contentSize": "15", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "cr:FileObject", + "@id": "data/stata13-auto.dta", + "name": "stata13-auto.dta", + "encodingFormat": "application/x-stata-13", + "md5": "7b1201ce6b469796837a835377338c5a", + "contentSize": "6443", + "description": "", + "contentUrl": 
"http://localhost:8080/api/access/datafile/9?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "contentSize": "28", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/restricted/in/dataCiteXml.xml b/src/test/resources/croissant/restricted/in/dataCiteXml.xml new file mode 100644 index 00000000000..7c6c89385fd --- /dev/null +++ b/src/test/resources/croissant/restricted/in/dataCiteXml.xml @@ -0,0 +1,51 @@ + + + 10.5072/FK2/CY7BWA + + + Durbin, Philip + Philip + Durbin + Harvard + + + + Cars + + Root + 2025 + + Other + + + + Durbin, Philip + Philip + Durbin + Harvard + + + + 2024-03-13 + 2025-05-16 + + + + 15 + 28 + 4026 + + + text/x-python + text/markdown + text/tab-separated-values + + 1.0 + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + This dataset is about cars. 
+ + diff --git a/src/test/resources/croissant/restricted/in/datasetFileDetails.json b/src/test/resources/croissant/restricted/in/datasetFileDetails.json new file mode 100644 index 00000000000..f2cdff072da --- /dev/null +++ b/src/test/resources/croissant/restricted/in/datasetFileDetails.json @@ -0,0 +1,355 @@ +[ + { + "id": 7, + "persistentId": "", + "filename": "compute.py", + "contentType": "text/x-python", + "friendlyType": "Python Source Code", + "filesize": 15, + "description": "", + "storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "rootDataFileId": -1, + "md5": "d84985e94dde671f318076bd7a137f15", + "checksum": { + "type": "MD5", + "value": "d84985e94dde671f318076bd7a137f15" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 1, + "varGroups": [] + }, + { + "id": 9, + "persistentId": "", + "filename": "stata13-auto.tab", + "contentType": "text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 4026, + "description": "", + "storageIdentifier": "local://196d9f15719-2270bfca2b48", + "originalFileFormat": "application/x-stata-13", + "originalFormatLabel": "Stata 13 Binary", + "originalFileSize": 6443, + "originalFileName": "stata13-auto.dta", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "rootDataFileId": -1, + "md5": "7b1201ce6b469796837a835377338c5a", + "checksum": { + "type": "MD5", + "value": "7b1201ce6b469796837a835377338c5a" + }, + "tabularData": true, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": true, + "fileMetadataId": 3, + "dataTables": [ + { + "varQuantity": 12, + "caseQuantity": 74, + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "dataVariables": [ + { + "id": 2, + "name": "make", + "label": "Make and Model", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "CHARACTER", + "isOrderedCategorical": false, + 
"fileOrder": 0, + "UNF": "UNF:6:Oo4vwiL8ffhSECOcjsKk2g==", + "variableMetadata": [] + }, + { + "id": 5, + "name": "price", + "label": "Price", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 1, + "UNF": "UNF:6:rvfkkdA36AaCSqCQciybfA==", + "variableMetadata": [], + "summaryStatistics": { + "min": "3291.0", + "medn": "5006.5", + "mean": "6165.256756756757", + "max": "15906.0", + "vald": "74.0", + "mode": ".", + "stdev": "2949.4958847689186", + "invd": "0.0" + } + }, + { + "id": 3, + "name": "mpg", + "label": "Mileage (mpg)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 2, + "UNF": "UNF:6:vVr3w8CgeZq1KpDfJQudOg==", + "variableMetadata": [], + "summaryStatistics": { + "max": "41.0", + "vald": "74.0", + "medn": "20.0", + "min": "12.0", + "stdev": "5.785503209735141", + "mean": "21.2972972972973", + "invd": "0.0", + "mode": "." 
+ } + }, + { + "id": 12, + "name": "rep78", + "label": "Repair Record 1978", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 3, + "UNF": "UNF:6:gbFI98swTWNhAjCRyi2cdA==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "0.989932270109041", + "mode": ".", + "min": "1.0", + "max": "5.0", + "medn": "3.0", + "mean": "3.4057971014492754", + "vald": "69.0", + "invd": "5.0" + } + }, + { + "id": 1, + "name": "headroom", + "label": "Headroom (in.)", + "weighted": false, + "variableIntervalType": "contin", + "variableFormatType": "NUMERIC", + "format": "float", + "isOrderedCategorical": false, + "fileOrder": 4, + "UNF": "UNF:6:g4Pl3T0Oz2e/OKJ64WiTnA==", + "variableMetadata": [], + "summaryStatistics": { + "mean": "2.993243243243243", + "mode": ".", + "vald": "74.0", + "invd": "0.0", + "stdev": "0.845994766828771", + "min": "1.5", + "medn": "3.0", + "max": "5.0" + } + }, + { + "id": 7, + "name": "trunk", + "label": "Trunk space (cu. 
ft.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 5, + "UNF": "UNF:6:iab0POsE3By7dQfgX/TY4g==", + "variableMetadata": [], + "summaryStatistics": { + "vald": "74.0", + "mode": ".", + "mean": "13.756756756756756", + "max": "23.0", + "min": "5.0", + "medn": "14.0", + "invd": "0.0", + "stdev": "4.277404189173201" + } + }, + { + "id": 4, + "name": "weight", + "label": "Weight (lbs.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 6, + "UNF": "UNF:6:cdoTdfUNeYWHHFEBCDxg+w==", + "variableMetadata": [], + "summaryStatistics": { + "invd": "0.0", + "min": "1760.0", + "vald": "74.0", + "max": "4840.0", + "stdev": "777.1935671373664", + "mean": "3019.459459459459", + "mode": ".", + "medn": "3190.0" + } + }, + { + "id": 8, + "name": "length", + "label": "Length (in.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 7, + "UNF": "UNF:6:8z1rjwhqBN4meYIiKI4P1A==", + "variableMetadata": [], + "summaryStatistics": { + "mode": ".", + "vald": "74.0", + "invd": "0.0", + "stdev": "22.266339902021585", + "max": "233.0", + "medn": "192.5", + "mean": "187.93243243243245", + "min": "142.0" + } + }, + { + "id": 9, + "name": "turn", + "label": "Turn Circle (ft.) ", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 8, + "UNF": "UNF:6:QxhjrrNtVz4qA8RulQ2MuQ==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "4.399353727233908", + "vald": "74.0", + "max": "51.0", + "min": "31.0", + "medn": "40.0", + "mean": "39.648648648648646", + "invd": "0.0", + "mode": "." + } + }, + { + "id": 10, + "name": "displacement", + "label": "Displacement (cu. 
in.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 9, + "UNF": "UNF:6:ftk+RAQpTCT1/y6G/rLWfA==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "91.83721896440396", + "invd": "0.0", + "min": "79.0", + "medn": "196.0", + "mode": ".", + "vald": "74.0", + "mean": "197.2972972972973", + "max": "425.0" + } + }, + { + "id": 6, + "name": "gear_ratio", + "label": "Gear Ratio", + "weighted": false, + "variableIntervalType": "contin", + "variableFormatType": "NUMERIC", + "format": "float", + "isOrderedCategorical": false, + "fileOrder": 10, + "UNF": "UNF:6:qjnY/qbx26FTepoPqRZ6lw==", + "variableMetadata": [], + "summaryStatistics": { + "medn": "2.9550000429153442", + "stdev": "0.45628709670763035", + "mean": "3.0148648667979883", + "min": "2.190000057220459", + "max": "3.890000104904175", + "mode": ".", + "vald": "74.0", + "invd": "0.0" + } + }, + { + "id": 11, + "name": "foreign", + "label": "Car type", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 11, + "UNF": "UNF:6:nbjzgh3wfAFqKpaoFnHalA==", + "variableMetadata": [], + "summaryStatistics": { + "max": "1.0", + "invd": "0.0", + "mode": ".", + "medn": "0.0", + "stdev": "0.46018845840901884", + "min": "0.0", + "mean": "0.2972972972972975", + "vald": "74.0" + }, + "variableCategories": [ + { + "label": "Domestic", + "value": "0", + "isMissing": false, + "frequency": 52.0 + }, + { + "label": "Foreign", + "value": "1", + "isMissing": false, + "frequency": 22.0 + } + ] + } + ] + } + ], + "varGroups": [] + }, + { + "id": 8, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 28, + "description": "", + "storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + "rootDataFileId": -1, + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "checksum": { + 
"type": "MD5", + "value": "a2e484d07ee5590cc32182dc2c6ccc83" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 2, + "varGroups": [] + } +] diff --git a/src/test/resources/croissant/restricted/in/datasetJson.json b/src/test/resources/croissant/restricted/in/datasetJson.json new file mode 100644 index 00000000000..3234579cddd --- /dev/null +++ b/src/test/resources/croissant/restricted/in/datasetJson.json @@ -0,0 +1,228 @@ +{ + "id": 6, + "identifier": "FK2/CY7BWA", + "persistentUrl": "https://doi.org/10.5072/FK2/CY7BWA", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2025-05-16", + "storageIdentifier": "local://10.5072/FK2/CY7BWA", + "datasetType": "dataset", + "datasetVersion": { + "id": 3, + "datasetId": 6, + "datasetPersistentId": "doi:10.5072/FK2/CY7BWA", + "storageIdentifier": "local://10.5072/FK2/CY7BWA", + "versionNumber": 1, + "internalVersionNumber": 10, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "lastUpdateTime": "2025-05-16T16:33:18Z", + "releaseTime": "2025-05-16T16:33:18Z", + "createTime": "2025-05-16T16:33:13Z", + "publicationDate": "2025-05-16", + "citationDate": "2025-05-16", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Cars" + }, + { + "typeName": "author", + "multiple": true, + 
"typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": "datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "datasetContactAffiliation": { + "typeName": "datasetContactAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "This dataset is about cars." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2024-03-13" + } + ] + } + }, + "files": [ + { + "description": "", + "label": "compute.py", + "restricted": false, + "directoryLabel": "code", + "version": 2, + "datasetVersionId": 3, + "dataFile": { + "id": 7, + "persistentId": "", + "filename": "compute.py", + "contentType": "text/x-python", + "friendlyType": "Python Source Code", + "filesize": 15, + "description": "", + "storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "rootDataFileId": -1, + "md5": "d84985e94dde671f318076bd7a137f15", + "checksum": { + "type": "MD5", + "value": "d84985e94dde671f318076bd7a137f15" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + }, + { + "description": "", + "label": "README.md", + "restricted": false, + "directoryLabel": "doc", + "version": 2, + "datasetVersionId": 3, + "dataFile": { + "id": 8, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 28, + "description": "", + "storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + "rootDataFileId": -1, + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "checksum": { + "type": "MD5", + "value": "a2e484d07ee5590cc32182dc2c6ccc83" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + }, + { + "description": "", + "label": "stata13-auto.tab", + "restricted": true, + "directoryLabel": "data", + "version": 4, + "datasetVersionId": 3, + "dataFile": { + "id": 9, + "persistentId": "", + "filename": "stata13-auto.tab", + "contentType": "text/tab-separated-values", 
+ "friendlyType": "Tab-Delimited", + "filesize": 4026, + "description": "", + "storageIdentifier": "local://196d9f15719-2270bfca2b48", + "originalFileFormat": "application/x-stata-13", + "originalFormatLabel": "Stata 13 Binary", + "originalFileSize": 6443, + "originalFileName": "stata13-auto.dta", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "rootDataFileId": -1, + "md5": "7b1201ce6b469796837a835377338c5a", + "checksum": { + "type": "MD5", + "value": "7b1201ce6b469796837a835377338c5a" + }, + "tabularData": true, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + } + ], + "citation": "Durbin, Philip, 2025, \"Cars\", https://doi.org/10.5072/FK2/CY7BWA, Root, V1, UNF:6:RPd9EWHSZwqUvRZuKTJMqg== [fileUNF]" + } +} diff --git a/src/test/resources/croissant/restricted/in/datasetORE.json b/src/test/resources/croissant/restricted/in/datasetORE.json new file mode 100644 index 00000000000..8e6c5b93507 --- /dev/null +++ b/src/test/resources/croissant/restricted/in/datasetORE.json @@ -0,0 +1,133 @@ +{ + "dcterms:modified": "2025-05-19", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.6", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/CY7BWA", + "ore:describes": { + "citation:datasetContact": { + "citation:datasetContactName": "Durbin, Philip", + "citation:datasetContactAffiliation": "Harvard", + "citation:datasetContactEmail": "dataverse@mailinator.com" + }, + "author": { + "citation:authorName": "Durbin, Philip", + "citation:authorAffiliation": "Harvard" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "This dataset is about cars." 
+ }, + "dateOfDeposit": "2024-03-13", + "title": "Cars", + "citation:depositor": "Durbin, Philip", + "subject": "Other", + "@id": "https://doi.org/10.5072/FK2/CY7BWA", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "Cars", + "schema:dateModified": "Fri May 16 16:33:18 UTC 2025", + "schema:datePublished": "2025-05-16", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Cars", + "@id": "http://localhost:8080/dataverse/cars", + "schema:description": "Data about cars.", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." + } + }, + "ore:aggregates": [ + { + "schema:description": "", + "schema:name": "compute.py", + "dvcore:restricted": false, + "dvcore:directoryLabel": "code", + "schema:version": 2, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=7", + "schema:sameAs": "http://localhost:8080/api/access/datafile/7", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/x-python", + "dvcore:filesize": 15, + "dvcore:storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "d84985e94dde671f318076bd7a137f15" + } + }, + { + "schema:description": "", + "schema:name": "README.md", + "dvcore:restricted": false, + "dvcore:directoryLabel": "doc", + "schema:version": 2, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=8", + "schema:sameAs": "http://localhost:8080/api/access/datafile/8", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/markdown", + "dvcore:filesize": 28, + "dvcore:storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + 
"dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "a2e484d07ee5590cc32182dc2c6ccc83" + } + }, + { + "schema:description": "", + "schema:name": "stata13-auto.dta", + "dvcore:restricted": true, + "dvcore:directoryLabel": "data", + "schema:version": 4, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=9", + "schema:sameAs": "http://localhost:8080/api/access/datafile/9?format=original", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "application/x-stata-13", + "dvcore:filesize": 6443, + "dvcore:storageIdentifier": "local://196d9f15719-2270bfca2b48", + "dvcore:currentIngestedName": "stata13-auto.tab", + "dvcore:UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "7b1201ce6b469796837a835377338c5a" + } + } + ], + "schema:hasPart": [ + "http://localhost:8080/file.xhtml?fileId=7", + "http://localhost:8080/file.xhtml?fileId=8", + "http://localhost:8080/file.xhtml?fileId=9" + ] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/restricted/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/restricted/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..83f587c5fd7 --- /dev/null +++ b/src/test/resources/croissant/restricted/in/datasetSchemaDotOrg.json @@ -0,0 +1,78 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/CY7BWA", + "identifier": "https://doi.org/10.5072/FK2/CY7BWA", + "name": "Cars", + "creator": [ + { + 
"@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "version": "1", + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + }, + "distribution": [ + { + "@type": "DataDownload", + "name": "compute.py", + "encodingFormat": "text/x-python", + "contentSize": 15, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "DataDownload", + "name": "stata13-auto.tab", + "encodingFormat": "text/tab-separated-values", + "contentSize": 4026, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/9" + }, + { + "@type": "DataDownload", + "name": "README.md", + "encodingFormat": "text/markdown", + "contentSize": 28, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ] +} diff --git a/src/test/resources/croissant/restricted/out/croissant.json b/src/test/resources/croissant/restricted/out/croissant.json new file mode 100644 index 00000000000..19d970d1bbb --- /dev/null +++ b/src/test/resources/croissant/restricted/out/croissant.json @@ -0,0 +1,115 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": 
"http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Cars", + "url": "https://doi.org/10.5072/FK2/CY7BWA", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/CY7BWA_2025,author = {Durbin, Philip},publisher = {Root},title = {Cars},year = {2025},url = {https://doi.org/10.5072/FK2/CY7BWA}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "code/compute.py", + "name": 
"compute.py", + "encodingFormat": "text/x-python", + "md5": "d84985e94dde671f318076bd7a137f15", + "contentSize": "15", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "cr:FileObject", + "@id": "data/stata13-auto.dta", + "name": "stata13-auto.dta", + "encodingFormat": "application/x-stata-13", + "md5": "7b1201ce6b469796837a835377338c5a", + "contentSize": "6443", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/9?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "contentSize": "28", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ] +} \ No newline at end of file From d13cd9d99ed78425d76788aaa90e629ddab33fe1 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Thu, 29 Jan 2026 14:00:05 -0500 Subject: [PATCH 2/9] add spotless config, limit it to croissant for now --- .../source/developers/coding-style.rst | 15 ++++- pom.xml | 24 +++++++ .../export/CroissantExporterTest.java | 66 ++++++++++++------- 3 files changed, 82 insertions(+), 23 deletions(-) diff --git a/doc/sphinx-guides/source/developers/coding-style.rst b/doc/sphinx-guides/source/developers/coding-style.rst index 2a1c0d5d232..f3935aae30f 100755 --- a/doc/sphinx-guides/source/developers/coding-style.rst +++ b/doc/sphinx-guides/source/developers/coding-style.rst @@ -13,6 +13,8 @@ Java Formatting Code ~~~~~~~~~~~~~~~ +How to format Java code is being discussed on `Zulip `_ and the `dev mailing list `_. + Tabs vs. Spaces ^^^^^^^^^^^^^^^ @@ -59,10 +61,21 @@ Place curly braces according to the style below, which is an example you can see } } +Format Code with Spotless +^^^^^^^^^^^^^^^^^^^^^^^^^ + +In some of our libraries we've had success formatting code with `Spotless `_. See https://github.com/gdcc/xoai/issues/35 for an early discussion. 
+ +We've added Spotless to the main repo but have limited it to certain files. If you'd like to use Spotless on files you're editing, update the config in pom.xml to include them. + +To run Spotless on your code: + +``mvn spotless:apply`` + Format Code You Changed with Netbeans ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -IQSS has standardized on Netbeans. It is much appreciated when you format your code (but only the code you touched!) using the out-of-the-box Netbeans configuration. If you have created an entirely new Java class, you can just click Source -> Format. If you are adjusting code in an existing class, highlight the code you changed and then click Source -> Format. Keeping the "diff" in your pull requests small makes them easier to code review. +For a long time IQSS standardized on Netbeans. For files not included in the Spotless config mentioned above, it is much appreciated when you format your code (but only the code you touched!) using the out-of-the-box Netbeans configuration. If you have created an entirely new Java class, you can just click Source -> Format. If you are adjusting code in an existing class, highlight the code you changed and then click Source -> Format. Keeping the "diff" in your pull requests small makes them easier to code review. 
Checking Your Formatting With Checkstyle ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pom.xml b/pom.xml index 011ee25a7fc..ec24d89b0e5 100644 --- a/pom.xml +++ b/pom.xml @@ -1116,6 +1116,30 @@ + + com.diffplug.spotless + spotless-maven-plugin + 3.2.1 + + + + src/main/java/edu/harvard/iq/dataverse/export/CroissantExporter.java + src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java + + + false + + + google-java-format + + + 1.17.0 + + true + + + + diff --git a/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java b/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java index 59e7b2cc329..6c6da792d4e 100644 --- a/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java @@ -52,7 +52,8 @@ public static void setUp() { new ExportDataProvider() { @Override public JsonObject getDatasetJson() { - String pathToJsonFile = "src/test/resources/croissant/minimal/in/datasetJson.json"; + String pathToJsonFile = + "src/test/resources/croissant/minimal/in/datasetJson.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -63,7 +64,8 @@ public JsonObject getDatasetJson() { @Override public JsonObject getDatasetORE() { - String pathToJsonFile = "src/test/resources/croissant/minimal/in/datasetORE.json"; + String pathToJsonFile = + "src/test/resources/croissant/minimal/in/datasetORE.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -100,7 +102,8 @@ public JsonObject getDatasetSchemaDotOrg() { public String getDataCiteXml() { try { return Files.readString( - Paths.get("src/test/resources/croissant/minimal/in/dataCiteXml.xml"), + Paths.get( + "src/test/resources/croissant/minimal/in/dataCiteXml.xml"), StandardCharsets.UTF_8); } catch (IOException ex) { return null; @@ -113,7 +116,8 @@ public 
String getDataCiteXml() { new ExportDataProvider() { @Override public JsonObject getDatasetJson() { - String pathToJsonFile = "src/test/resources/croissant/max/in/datasetJson.json"; + String pathToJsonFile = + "src/test/resources/croissant/max/in/datasetJson.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -124,7 +128,8 @@ public JsonObject getDatasetJson() { @Override public JsonObject getDatasetORE() { - String pathToJsonFile = "src/test/resources/croissant/max/in/datasetORE.json"; + String pathToJsonFile = + "src/test/resources/croissant/max/in/datasetORE.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -135,7 +140,8 @@ public JsonObject getDatasetORE() { @Override public JsonArray getDatasetFileDetails() { - String pathToJsonFile = "src/test/resources/croissant/max/in/datasetFileDetails.json"; + String pathToJsonFile = + "src/test/resources/croissant/max/in/datasetFileDetails.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readArray(); @@ -160,7 +166,8 @@ public JsonObject getDatasetSchemaDotOrg() { public String getDataCiteXml() { try { return Files.readString( - Paths.get("src/test/resources/croissant/max/in/dataCiteXml.xml"), + Paths.get( + "src/test/resources/croissant/max/in/dataCiteXml.xml"), StandardCharsets.UTF_8); } catch (IOException ex) { return null; @@ -173,7 +180,8 @@ public String getDataCiteXml() { new ExportDataProvider() { @Override public JsonObject getDatasetJson() { - String pathToJsonFile = "src/test/resources/croissant/cars/in/datasetJson.json"; + String pathToJsonFile = + "src/test/resources/croissant/cars/in/datasetJson.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -184,7 +192,8 @@ public JsonObject getDatasetJson() { @Override public JsonObject 
getDatasetORE() { - String pathToJsonFile = "src/test/resources/croissant/cars/in/datasetORE.json"; + String pathToJsonFile = + "src/test/resources/croissant/cars/in/datasetORE.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -221,7 +230,8 @@ public JsonObject getDatasetSchemaDotOrg() { public String getDataCiteXml() { try { return Files.readString( - Paths.get("src/test/resources/croissant/cars/in/dataCiteXml.xml"), + Paths.get( + "src/test/resources/croissant/cars/in/dataCiteXml.xml"), StandardCharsets.UTF_8); } catch (IOException ex) { return null; @@ -234,7 +244,8 @@ public String getDataCiteXml() { new ExportDataProvider() { @Override public JsonObject getDatasetJson() { - String pathToJsonFile = "src/test/resources/croissant/restricted/in/datasetJson.json"; + String pathToJsonFile = + "src/test/resources/croissant/restricted/in/datasetJson.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -245,7 +256,8 @@ public JsonObject getDatasetJson() { @Override public JsonObject getDatasetORE() { - String pathToJsonFile = "src/test/resources/croissant/restricted/in/datasetORE.json"; + String pathToJsonFile = + "src/test/resources/croissant/restricted/in/datasetORE.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -282,7 +294,8 @@ public JsonObject getDatasetSchemaDotOrg() { public String getDataCiteXml() { try { return Files.readString( - Paths.get("src/test/resources/croissant/restricted/in/dataCiteXml.xml"), + Paths.get( + "src/test/resources/croissant/restricted/in/dataCiteXml.xml"), StandardCharsets.UTF_8); } catch (IOException ex) { return null; @@ -295,7 +308,8 @@ public String getDataCiteXml() { new ExportDataProvider() { @Override public JsonObject getDatasetJson() { - String pathToJsonFile = 
"src/test/resources/croissant/junk/in/datasetJson.json"; + String pathToJsonFile = + "src/test/resources/croissant/junk/in/datasetJson.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -306,7 +320,8 @@ public JsonObject getDatasetJson() { @Override public JsonObject getDatasetORE() { - String pathToJsonFile = "src/test/resources/croissant/junk/in/datasetORE.json"; + String pathToJsonFile = + "src/test/resources/croissant/junk/in/datasetORE.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -343,7 +358,8 @@ public JsonObject getDatasetSchemaDotOrg() { public String getDataCiteXml() { try { return Files.readString( - Paths.get("src/test/resources/croissant/junk/in/dataCiteXml.xml"), + Paths.get( + "src/test/resources/croissant/junk/in/dataCiteXml.xml"), StandardCharsets.UTF_8); } catch (IOException ex) { return null; @@ -356,7 +372,8 @@ public String getDataCiteXml() { new ExportDataProvider() { @Override public JsonObject getDatasetJson() { - String pathToJsonFile = "src/test/resources/croissant/draft/in/datasetJson.json"; + String pathToJsonFile = + "src/test/resources/croissant/draft/in/datasetJson.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -367,7 +384,8 @@ public JsonObject getDatasetJson() { @Override public JsonObject getDatasetORE() { - String pathToJsonFile = "src/test/resources/croissant/draft/in/datasetORE.json"; + String pathToJsonFile = + "src/test/resources/croissant/draft/in/datasetORE.json"; try (JsonReader jsonReader = Json.createReader(new FileReader(pathToJsonFile))) { return jsonReader.readObject(); @@ -404,7 +422,8 @@ public JsonObject getDatasetSchemaDotOrg() { public String getDataCiteXml() { try { return Files.readString( - Paths.get("src/test/resources/croissant/draft/in/dataCiteXml.xml"), + Paths.get( + 
"src/test/resources/croissant/draft/in/dataCiteXml.xml"), StandardCharsets.UTF_8); } catch (IOException ex) { return null; @@ -448,7 +467,8 @@ public void testExportDatasetMinimal() throws Exception { writeCroissantFile(actual, "minimal"); String expected = Files.readString( - Paths.get("src/test/resources/croissant/minimal/expected/minimal-croissant.json"), + Paths.get( + "src/test/resources/croissant/minimal/expected/minimal-croissant.json"), StandardCharsets.UTF_8); JSONAssert.assertEquals(expected, actual, true); assertEquals(prettyPrint(expected), prettyPrint(outputStreamMinimal.toString())); @@ -542,14 +562,16 @@ public void testExportDatasetDraft() throws Exception { writeCroissantFile(actual, "draft"); String expected = Files.readString( - Paths.get("src/test/resources/croissant/draft/expected/draft-croissant.json"), + Paths.get( + "src/test/resources/croissant/draft/expected/draft-croissant.json"), StandardCharsets.UTF_8); JSONAssert.assertEquals(expected, actual, true); assertEquals(prettyPrint(expected), prettyPrint(outputStreamDraft.toString())); } private void writeCroissantFile(String actual, String name) throws IOException { - Path dir = Files.createDirectories(Paths.get("src/test/resources/croissant/" + name + "/out")); + Path dir = + Files.createDirectories(Paths.get("src/test/resources/croissant/" + name + "/out")); Path out = Paths.get(dir + "/croissant.json"); Files.writeString(out, prettyPrint(actual), StandardCharsets.UTF_8); } From f1d52f137925dd5d01465b069adb7f4734997bdb Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 30 Jan 2026 10:05:11 -0500 Subject: [PATCH 3/9] put Croissant in by default, add flag for old behavior #11254 --- .../source/admin/discoverability.rst | 17 +++++++++-------- doc/sphinx-guides/source/api/native-api.rst | 3 ++- .../source/installation/advanced.rst | 3 +-- .../source/installation/config.rst | 6 ++++++ .../source/user/dataset-management.rst | 4 ++-- .../iq/dataverse/settings/FeatureFlags.java | 6 ++++++ 
src/main/webapp/dataset.xhtml | 10 ++++++---- 7 files changed, 32 insertions(+), 17 deletions(-) diff --git a/doc/sphinx-guides/source/admin/discoverability.rst b/doc/sphinx-guides/source/admin/discoverability.rst index 22ff66246f0..285e7f248a7 100644 --- a/doc/sphinx-guides/source/admin/discoverability.rst +++ b/doc/sphinx-guides/source/admin/discoverability.rst @@ -30,21 +30,22 @@ The HTML source of a dataset landing page includes "DC" (Dublin Core) ```` `` of Dataset Landing Pages ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -The ```` of the HTML source of a dataset landing page includes Schema.org JSON-LD metadata like this:: +`Croissant `_ is a metadata format for machine learning datasets. +In Dataverse, the ```` of the HTML source of a dataset landing page includes Croissant metadata like this:: - From 3571d2900ac1ed6b4ce7561815199649e325b2f0 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 30 Jan 2026 10:35:58 -0500 Subject: [PATCH 4/9] add release note #11254 --- doc/release-notes/11254-croissant-builtin.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 doc/release-notes/11254-croissant-builtin.md diff --git a/doc/release-notes/11254-croissant-builtin.md b/doc/release-notes/11254-croissant-builtin.md new file mode 100644 index 00000000000..e25927a7220 --- /dev/null +++ b/doc/release-notes/11254-croissant-builtin.md @@ -0,0 +1,9 @@ +## Croissant Support Is Now Built In + +Croissant is a metadata export format for machine learning datasets that (until this release) was optional and implemented as an external exporter. The code has been merged into the main Dataverse code base which means the Croissant format is automatically available in your installation of Dataverse, alongside older formats like Dublin Core and DDI. If you were using the external Croissant exporter, the merged code is equivalent to version 0.1.6. 
Croissant bugs and feature requests should now be filed against the main Dataverse repo (https://github.com/IQSS/dataverse) and the old repo (https://github.com/gdcc/exporter-croissant) should be considered retired. + +As described in the [Discoverability](https://dataverse-guide--12130.org.readthedocs.build/en/12130/admin/discoverability.html#id6) section of the Admin Guide, Croissant is inserted into the "head" of the HTML of dataset landing pages, as requested by the [Google Dataset Search](https://datasetsearch.research.google.com) team so that their tool can filter by datasets that support Croissant. In previous versions of Dataverse, when Croissant was optional and hadn't been enabled, we used the older "Schema.org JSON-LD" format in the "head". If you'd like to keep this behavior, you can use the feature flag [dataverse.feature.legacy-format-in-head](https://dataverse-guide--12130.org.readthedocs.build/en/12130/installation/config.html#dataverse-feature-legacy-format-in-head). + +We are aware that the amount of data in the "head" of the HTML can grow quite large for both Croissant and Schema.org JSON-LD. This is especially true of Croissant which exposes variable-level information. We plan to address this in https://github.com/IQSS/dataverse/issues/12123 . We also plan to support Croissant 1.1 in the future and are tracking this at https://github.com/IQSS/dataverse/issues/12014 . + +See also #11254 and #12130. 
From 2e392ceb20bf07e658ad93c90be6569a95f764af Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 30 Jan 2026 10:49:00 -0500 Subject: [PATCH 5/9] remove generated files --- .../croissant/cars/out/croissant.json | 302 ------------------ .../croissant/draft/out/croissant.json | 94 ------ .../croissant/junk/out/croissant.json | 83 ----- .../croissant/max/out/croissant.json | 196 ------------ .../croissant/minimal/out/croissant.json | 79 ----- .../croissant/restricted/out/croissant.json | 115 ------- 6 files changed, 869 deletions(-) delete mode 100644 src/test/resources/croissant/cars/out/croissant.json delete mode 100644 src/test/resources/croissant/draft/out/croissant.json delete mode 100644 src/test/resources/croissant/junk/out/croissant.json delete mode 100644 src/test/resources/croissant/max/out/croissant.json delete mode 100644 src/test/resources/croissant/minimal/out/croissant.json delete mode 100644 src/test/resources/croissant/restricted/out/croissant.json diff --git a/src/test/resources/croissant/cars/out/croissant.json b/src/test/resources/croissant/cars/out/croissant.json deleted file mode 100644 index a9c0d48b217..00000000000 --- a/src/test/resources/croissant/cars/out/croissant.json +++ /dev/null @@ -1,302 +0,0 @@ -{ - "@context": { - "@language": "en", - "@vocab": "https://schema.org/", - "citeAs": "cr:citeAs", - "column": "cr:column", - "conformsTo": "dct:conformsTo", - "cr": "http://mlcommons.org/croissant/", - "rai": "http://mlcommons.org/croissant/RAI/", - "data": { - "@id": "cr:data", - "@type": "@json" - }, - "dataType": { - "@id": "cr:dataType", - "@type": "@vocab" - }, - "dct": "http://purl.org/dc/terms/", - "examples": { - "@id": "cr:examples", - "@type": "@json" - }, - "extract": "cr:extract", - "field": "cr:field", - "fileProperty": "cr:fileProperty", - "fileObject": "cr:fileObject", - "fileSet": "cr:fileSet", - "format": "cr:format", - "includes": "cr:includes", - "isLiveDataset": "cr:isLiveDataset", - "jsonPath": "cr:jsonPath", - "key": 
"cr:key", - "md5": "cr:md5", - "parentField": "cr:parentField", - "path": "cr:path", - "recordSet": "cr:recordSet", - "references": "cr:references", - "regex": "cr:regex", - "repeated": "cr:repeated", - "replace": "cr:replace", - "samplingRate": "cr:samplingRate", - "sc": "https://schema.org/", - "separator": "cr:separator", - "source": "cr:source", - "subField": "cr:subField", - "transform": "cr:transform", - "wd": "https://www.wikidata.org/wiki/" - }, - "@type": "sc:Dataset", - "conformsTo": "http://mlcommons.org/croissant/1.0", - "name": "Cars", - "url": "https://doi.org/10.5072/FK2/CY7BWA", - "creator": [ - { - "@type": "Person", - "givenName": "Philip", - "familyName": "Durbin", - "affiliation": { - "@type": "Organization", - "name": "Harvard" - }, - "name": "Durbin, Philip" - } - ], - "description": "This dataset is about cars.", - "keywords": [ - "Other" - ], - "license": "http://creativecommons.org/publicdomain/zero/1.0", - "datePublished": "2025-05-16", - "dateModified": "2025-05-16", - "includedInDataCatalog": { - "@type": "DataCatalog", - "name": "Root", - "url": "http://localhost:8080" - }, - "publisher": { - "@type": "Organization", - "name": "Root" - }, - "version": "1.0", - "citeAs": "@data{FK2/CY7BWA_2025,author = {Durbin, Philip},publisher = {Root},title = {Cars},year = {2025},url = {https://doi.org/10.5072/FK2/CY7BWA}}", - "distribution": [ - { - "@type": "cr:FileObject", - "@id": "code/compute.py", - "name": "compute.py", - "encodingFormat": "text/x-python", - "md5": "d84985e94dde671f318076bd7a137f15", - "contentSize": "15", - "description": "", - "contentUrl": "http://localhost:8080/api/access/datafile/7" - }, - { - "@type": "cr:FileObject", - "@id": "data/stata13-auto.dta", - "name": "stata13-auto.dta", - "encodingFormat": "application/x-stata-13", - "md5": "7b1201ce6b469796837a835377338c5a", - "contentSize": "6443", - "description": "", - "contentUrl": "http://localhost:8080/api/access/datafile/9?format=original" - }, - { - "@type": 
"cr:FileObject", - "@id": "doc/README.md", - "name": "README.md", - "encodingFormat": "text/markdown", - "md5": "a2e484d07ee5590cc32182dc2c6ccc83", - "contentSize": "28", - "description": "", - "contentUrl": "http://localhost:8080/api/access/datafile/8" - } - ], - "recordSet": [ - { - "@type": "cr:RecordSet", - "field": [ - { - "@type": "cr:Field", - "name": "make", - "description": "Make and Model", - "dataType": "sc:Text", - "source": { - "@id": "2", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "make" - } - } - }, - { - "@type": "cr:Field", - "name": "price", - "description": "Price", - "dataType": "sc:Integer", - "source": { - "@id": "5", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "price" - } - } - }, - { - "@type": "cr:Field", - "name": "mpg", - "description": "Mileage (mpg)", - "dataType": "sc:Integer", - "source": { - "@id": "3", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "mpg" - } - } - }, - { - "@type": "cr:Field", - "name": "rep78", - "description": "Repair Record 1978", - "dataType": "sc:Integer", - "source": { - "@id": "12", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "rep78" - } - } - }, - { - "@type": "cr:Field", - "name": "headroom", - "description": "Headroom (in.)", - "dataType": "sc:Float", - "source": { - "@id": "1", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "headroom" - } - } - }, - { - "@type": "cr:Field", - "name": "trunk", - "description": "Trunk space (cu. 
ft.)", - "dataType": "sc:Integer", - "source": { - "@id": "7", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "trunk" - } - } - }, - { - "@type": "cr:Field", - "name": "weight", - "description": "Weight (lbs.)", - "dataType": "sc:Integer", - "source": { - "@id": "4", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "weight" - } - } - }, - { - "@type": "cr:Field", - "name": "length", - "description": "Length (in.)", - "dataType": "sc:Integer", - "source": { - "@id": "8", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "length" - } - } - }, - { - "@type": "cr:Field", - "name": "turn", - "description": "Turn Circle (ft.) ", - "dataType": "sc:Integer", - "source": { - "@id": "9", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "turn" - } - } - }, - { - "@type": "cr:Field", - "name": "displacement", - "description": "Displacement (cu. in.)", - "dataType": "sc:Integer", - "source": { - "@id": "10", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "displacement" - } - } - }, - { - "@type": "cr:Field", - "name": "gear_ratio", - "description": "Gear Ratio", - "dataType": "sc:Float", - "source": { - "@id": "6", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "gear_ratio" - } - } - }, - { - "@type": "cr:Field", - "name": "foreign", - "description": "Car type", - "dataType": "sc:Integer", - "source": { - "@id": "11", - "fileObject": { - "@id": "data/stata13-auto.dta" - }, - "extract": { - "column": "foreign" - } - } - } - ] - } - ] -} \ No newline at end of file diff --git a/src/test/resources/croissant/draft/out/croissant.json b/src/test/resources/croissant/draft/out/croissant.json deleted file mode 100644 index b2065f79195..00000000000 --- a/src/test/resources/croissant/draft/out/croissant.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "@context": { - "@language": "en", - 
"@vocab": "https://schema.org/", - "citeAs": "cr:citeAs", - "column": "cr:column", - "conformsTo": "dct:conformsTo", - "cr": "http://mlcommons.org/croissant/", - "rai": "http://mlcommons.org/croissant/RAI/", - "data": { - "@id": "cr:data", - "@type": "@json" - }, - "dataType": { - "@id": "cr:dataType", - "@type": "@vocab" - }, - "dct": "http://purl.org/dc/terms/", - "examples": { - "@id": "cr:examples", - "@type": "@json" - }, - "extract": "cr:extract", - "field": "cr:field", - "fileProperty": "cr:fileProperty", - "fileObject": "cr:fileObject", - "fileSet": "cr:fileSet", - "format": "cr:format", - "includes": "cr:includes", - "isLiveDataset": "cr:isLiveDataset", - "jsonPath": "cr:jsonPath", - "key": "cr:key", - "md5": "cr:md5", - "parentField": "cr:parentField", - "path": "cr:path", - "recordSet": "cr:recordSet", - "references": "cr:references", - "regex": "cr:regex", - "repeated": "cr:repeated", - "replace": "cr:replace", - "samplingRate": "cr:samplingRate", - "sc": "https://schema.org/", - "separator": "cr:separator", - "source": "cr:source", - "subField": "cr:subField", - "transform": "cr:transform", - "wd": "https://www.wikidata.org/wiki/" - }, - "@type": "sc:Dataset", - "conformsTo": "http://mlcommons.org/croissant/1.0", - "name": "Draft Dataset", - "url": "https://doi.org/10.5072/FK2/OO7TEP", - "creator": [ - { - "@type": "Person", - "givenName": "Draft", - "familyName": "Punk", - "affiliation": { - "@type": "Organization", - "name": "French house" - }, - "name": "Punk, Draft" - } - ], - "description": "This dataset hasn't been published yet.", - "keywords": [ - "Other" - ], - "license": "http://creativecommons.org/publicdomain/zero/1.0", - "dateModified": "", - "includedInDataCatalog": { - "@type": "DataCatalog", - "name": "Root", - "url": "http://localhost:8080" - }, - "publisher": { - "@type": "Organization", - "name": "Root" - }, - "version": "DRAFT", - "citeAs": "@data{FK2/OO7TEP,author = {Punk, Draft},publisher = {Root},title = {Draft Dataset},url = 
{https://doi.org/10.5072/FK2/OO7TEP}}", - "distribution": [ - { - "@type": "cr:FileObject", - "@id": "data.txt", - "name": "data.txt", - "encodingFormat": "text/plain", - "md5": "050644e853fdfe46a3707695ba2fe736", - "contentSize": "18", - "description": "", - "contentUrl": "http://localhost:8080/api/access/datafile/4" - } - ] -} \ No newline at end of file diff --git a/src/test/resources/croissant/junk/out/croissant.json b/src/test/resources/croissant/junk/out/croissant.json deleted file mode 100644 index b02bed5694e..00000000000 --- a/src/test/resources/croissant/junk/out/croissant.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "@context": { - "@language": "en", - "@vocab": "https://schema.org/", - "citeAs": "cr:citeAs", - "column": "cr:column", - "conformsTo": "dct:conformsTo", - "cr": "http://mlcommons.org/croissant/", - "rai": "http://mlcommons.org/croissant/RAI/", - "data": { - "@id": "cr:data", - "@type": "@json" - }, - "dataType": { - "@id": "cr:dataType", - "@type": "@vocab" - }, - "dct": "http://purl.org/dc/terms/", - "examples": { - "@id": "cr:examples", - "@type": "@json" - }, - "extract": "cr:extract", - "field": "cr:field", - "fileProperty": "cr:fileProperty", - "fileObject": "cr:fileObject", - "fileSet": "cr:fileSet", - "format": "cr:format", - "includes": "cr:includes", - "isLiveDataset": "cr:isLiveDataset", - "jsonPath": "cr:jsonPath", - "key": "cr:key", - "md5": "cr:md5", - "parentField": "cr:parentField", - "path": "cr:path", - "recordSet": "cr:recordSet", - "references": "cr:references", - "regex": "cr:regex", - "repeated": "cr:repeated", - "replace": "cr:replace", - "samplingRate": "cr:samplingRate", - "sc": "https://schema.org/", - "separator": "cr:separator", - "source": "cr:source", - "subField": "cr:subField", - "transform": "cr:transform", - "wd": "https://www.wikidata.org/wiki/" - }, - "@type": "sc:Dataset", - "conformsTo": "http://mlcommons.org/croissant/1.0", - "name": "</script><script>alert(666)</script>", - "url": 
"https://doi.org/10.5072/FK2/0CNXUJ", - "creator": [ - { - "@type": "Person", - "givenName": "Sylvester", - "familyName": "Ritter", - "affiliation": { - "@type": "Organization", - "name": "WWF" - }, - "name": "Ritter, Sylvester" - } - ], - "description": "A junk dataset.", - "keywords": [ - "Other" - ], - "license": "http://creativecommons.org/publicdomain/zero/1.0", - "datePublished": "2025-03-13", - "dateModified": "2025-03-13", - "includedInDataCatalog": { - "@type": "DataCatalog", - "name": "Root", - "url": "http://localhost:8080" - }, - "publisher": { - "@type": "Organization", - "name": "Root" - }, - "version": "1.0", - "citeAs": "@data{FK2/0CNXUJ_2025,author = {Ritter, Sylvester},publisher = {Root},title = {},year = {2025},url = {https://doi.org/10.5072/FK2/0CNXUJ}}" -} \ No newline at end of file diff --git a/src/test/resources/croissant/max/out/croissant.json b/src/test/resources/croissant/max/out/croissant.json deleted file mode 100644 index bf1941c7289..00000000000 --- a/src/test/resources/croissant/max/out/croissant.json +++ /dev/null @@ -1,196 +0,0 @@ -{ - "@context": { - "@language": "en", - "@vocab": "https://schema.org/", - "citeAs": "cr:citeAs", - "column": "cr:column", - "conformsTo": "dct:conformsTo", - "cr": "http://mlcommons.org/croissant/", - "rai": "http://mlcommons.org/croissant/RAI/", - "data": { - "@id": "cr:data", - "@type": "@json" - }, - "dataType": { - "@id": "cr:dataType", - "@type": "@vocab" - }, - "dct": "http://purl.org/dc/terms/", - "examples": { - "@id": "cr:examples", - "@type": "@json" - }, - "extract": "cr:extract", - "field": "cr:field", - "fileProperty": "cr:fileProperty", - "fileObject": "cr:fileObject", - "fileSet": "cr:fileSet", - "format": "cr:format", - "includes": "cr:includes", - "isLiveDataset": "cr:isLiveDataset", - "jsonPath": "cr:jsonPath", - "key": "cr:key", - "md5": "cr:md5", - "parentField": "cr:parentField", - "path": "cr:path", - "recordSet": "cr:recordSet", - "references": "cr:references", - "regex": 
"cr:regex", - "repeated": "cr:repeated", - "replace": "cr:replace", - "samplingRate": "cr:samplingRate", - "sc": "https://schema.org/", - "separator": "cr:separator", - "source": "cr:source", - "subField": "cr:subField", - "transform": "cr:transform", - "wd": "https://www.wikidata.org/wiki/" - }, - "@type": "sc:Dataset", - "conformsTo": "http://mlcommons.org/croissant/1.0", - "name": "Max Schema.org", - "url": "https://doi.org/10.5072/FK2/VQTYHD", - "creator": [ - { - "@type": "Person", - "givenName": "Philip", - "familyName": "Durbin", - "affiliation": { - "@type": "Organization", - "name": "Harvard University" - }, - "sameAs": "https://orcid.org/0000-0002-9528-9470", - "@id": "https://orcid.org/0000-0002-9528-9470", - "identifier": "https://orcid.org/0000-0002-9528-9470", - "name": "Durbin, Philip" - }, - { - "@type": "Person", - "affiliation": { - "@type": "Organization", - "name": "Harvard University" - }, - "name": "IQSS" - } - ], - "description": "Exercising fields used by `schema.org` exporter.", - "keywords": [ - "Social Sciences", - "Other", - "foo", - "bar" - ], - "license": "http://creativecommons.org/publicdomain/zero/1.0", - "datePublished": "2024-05-01", - "dateModified": "2025-05-21", - "includedInDataCatalog": { - "@type": "DataCatalog", - "name": "Root", - "url": "https://beta.dataverse.org" - }, - "publisher": { - "@type": "Organization", - "name": "Root" - }, - "version": "3.0", - "citeAs": "@data{FK2/VQTYHD_2024,author = {Durbin, Philip and IQSS},publisher = {Root},title = {Max Schema.org},year = {2024},url = {https://doi.org/10.5072/FK2/VQTYHD}}", - "funder": [ - { - "@type": "Organization", - "name": "NSF" - }, - { - "@type": "Organization", - "name": "NIH" - } - ], - "spatialCoverage": [ - "Cambridge, MA, United States, Harvard Square" - ], - "citation": [ - { - "@type": "CreativeWork", - "name": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. 
https://doi.org/10.5281/zenodo.10843668", - "@id": "https://doi.org/10.5281/zenodo.10843668", - "identifier": "https://doi.org/10.5281/zenodo.10843668", - "url": "https://doi.org/10.5281/zenodo.10843668" - } - ], - "temporalCoverage": [ - "2023-01-01/2023-12-31" - ], - "distribution": [ - { - "@type": "cr:FileObject", - "@id": "data.tsv", - "name": "data.tsv", - "encodingFormat": "text/tab-separated-values", - "md5": "3663d6a436ac00f5541a7336d6fa18c9", - "contentSize": "33", - "description": "", - "contentUrl": "https://beta.dataverse.org/api/access/datafile/26646?format=original" - }, - { - "@type": "cr:FileObject", - "@id": "doc/README.md", - "name": "README.md", - "encodingFormat": "text/markdown", - "md5": "ebf050ec8cce5df0a72b100cfc9f442f", - "contentSize": "34", - "description": "Additional documentation.", - "contentUrl": "https://beta.dataverse.org/api/access/datafile/26148" - } - ], - "recordSet": [ - { - "@type": "cr:RecordSet", - "field": [ - { - "@type": "cr:Field", - "name": "foo", - "description": "foo", - "dataType": "sc:Text", - "source": { - "@id": "1287", - "fileObject": { - "@id": "data.tsv" - }, - "extract": { - "column": "foo" - } - } - }, - { - "@type": "cr:Field", - "name": "bar", - "description": "bar", - "dataType": "sc:Integer", - "source": { - "@id": "1285", - "fileObject": { - "@id": "data.tsv" - }, - "extract": { - "column": "bar" - } - } - }, - { - "@type": "cr:Field", - "name": "baz", - "description": "baz", - "dataType": "sc:Integer", - "source": { - "@id": "1286", - "fileObject": { - "@id": "data.tsv" - }, - "extract": { - "column": "baz" - } - } - } - ] - } - ] -} \ No newline at end of file diff --git a/src/test/resources/croissant/minimal/out/croissant.json b/src/test/resources/croissant/minimal/out/croissant.json deleted file mode 100644 index 7c47afc1485..00000000000 --- a/src/test/resources/croissant/minimal/out/croissant.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "@context": { - "@language": "en", - "@vocab": 
"https://schema.org/", - "citeAs": "cr:citeAs", - "column": "cr:column", - "conformsTo": "dct:conformsTo", - "cr": "http://mlcommons.org/croissant/", - "rai": "http://mlcommons.org/croissant/RAI/", - "data": { - "@id": "cr:data", - "@type": "@json" - }, - "dataType": { - "@id": "cr:dataType", - "@type": "@vocab" - }, - "dct": "http://purl.org/dc/terms/", - "examples": { - "@id": "cr:examples", - "@type": "@json" - }, - "extract": "cr:extract", - "field": "cr:field", - "fileProperty": "cr:fileProperty", - "fileObject": "cr:fileObject", - "fileSet": "cr:fileSet", - "format": "cr:format", - "includes": "cr:includes", - "isLiveDataset": "cr:isLiveDataset", - "jsonPath": "cr:jsonPath", - "key": "cr:key", - "md5": "cr:md5", - "parentField": "cr:parentField", - "path": "cr:path", - "recordSet": "cr:recordSet", - "references": "cr:references", - "regex": "cr:regex", - "repeated": "cr:repeated", - "replace": "cr:replace", - "samplingRate": "cr:samplingRate", - "sc": "https://schema.org/", - "separator": "cr:separator", - "source": "cr:source", - "subField": "cr:subField", - "transform": "cr:transform", - "wd": "https://www.wikidata.org/wiki/" - }, - "@type": "sc:Dataset", - "conformsTo": "http://mlcommons.org/croissant/1.0", - "name": "Minimal", - "url": "https://doi.org/10.5072/FK2/4C0JYC", - "creator": [ - { - "@type": "Person", - "givenName": "Philip", - "familyName": "Durbin", - "name": "Durbin, Philip" - } - ], - "description": "Minimal metadata and no files.", - "keywords": [ - "Other" - ], - "license": "http://creativecommons.org/publicdomain/zero/1.0", - "datePublished": "2024-05-01", - "dateModified": "2024-05-01", - "includedInDataCatalog": { - "@type": "DataCatalog", - "name": "Root", - "url": "https://beta.dataverse.org" - }, - "publisher": { - "@type": "Organization", - "name": "Root" - }, - "version": "1.0", - "citeAs": "@data{FK2/4C0JYC_2024,author = {Durbin, Philip},publisher = {Root},title = {Minimal},year = {2024},url = 
{https://doi.org/10.5072/FK2/4C0JYC}}" -} \ No newline at end of file diff --git a/src/test/resources/croissant/restricted/out/croissant.json b/src/test/resources/croissant/restricted/out/croissant.json deleted file mode 100644 index 19d970d1bbb..00000000000 --- a/src/test/resources/croissant/restricted/out/croissant.json +++ /dev/null @@ -1,115 +0,0 @@ -{ - "@context": { - "@language": "en", - "@vocab": "https://schema.org/", - "citeAs": "cr:citeAs", - "column": "cr:column", - "conformsTo": "dct:conformsTo", - "cr": "http://mlcommons.org/croissant/", - "rai": "http://mlcommons.org/croissant/RAI/", - "data": { - "@id": "cr:data", - "@type": "@json" - }, - "dataType": { - "@id": "cr:dataType", - "@type": "@vocab" - }, - "dct": "http://purl.org/dc/terms/", - "examples": { - "@id": "cr:examples", - "@type": "@json" - }, - "extract": "cr:extract", - "field": "cr:field", - "fileProperty": "cr:fileProperty", - "fileObject": "cr:fileObject", - "fileSet": "cr:fileSet", - "format": "cr:format", - "includes": "cr:includes", - "isLiveDataset": "cr:isLiveDataset", - "jsonPath": "cr:jsonPath", - "key": "cr:key", - "md5": "cr:md5", - "parentField": "cr:parentField", - "path": "cr:path", - "recordSet": "cr:recordSet", - "references": "cr:references", - "regex": "cr:regex", - "repeated": "cr:repeated", - "replace": "cr:replace", - "samplingRate": "cr:samplingRate", - "sc": "https://schema.org/", - "separator": "cr:separator", - "source": "cr:source", - "subField": "cr:subField", - "transform": "cr:transform", - "wd": "https://www.wikidata.org/wiki/" - }, - "@type": "sc:Dataset", - "conformsTo": "http://mlcommons.org/croissant/1.0", - "name": "Cars", - "url": "https://doi.org/10.5072/FK2/CY7BWA", - "creator": [ - { - "@type": "Person", - "givenName": "Philip", - "familyName": "Durbin", - "affiliation": { - "@type": "Organization", - "name": "Harvard" - }, - "name": "Durbin, Philip" - } - ], - "description": "This dataset is about cars.", - "keywords": [ - "Other" - ], - "license": 
"http://creativecommons.org/publicdomain/zero/1.0", - "datePublished": "2025-05-16", - "dateModified": "2025-05-16", - "includedInDataCatalog": { - "@type": "DataCatalog", - "name": "Root", - "url": "http://localhost:8080" - }, - "publisher": { - "@type": "Organization", - "name": "Root" - }, - "version": "1.0", - "citeAs": "@data{FK2/CY7BWA_2025,author = {Durbin, Philip},publisher = {Root},title = {Cars},year = {2025},url = {https://doi.org/10.5072/FK2/CY7BWA}}", - "distribution": [ - { - "@type": "cr:FileObject", - "@id": "code/compute.py", - "name": "compute.py", - "encodingFormat": "text/x-python", - "md5": "d84985e94dde671f318076bd7a137f15", - "contentSize": "15", - "description": "", - "contentUrl": "http://localhost:8080/api/access/datafile/7" - }, - { - "@type": "cr:FileObject", - "@id": "data/stata13-auto.dta", - "name": "stata13-auto.dta", - "encodingFormat": "application/x-stata-13", - "md5": "7b1201ce6b469796837a835377338c5a", - "contentSize": "6443", - "description": "", - "contentUrl": "http://localhost:8080/api/access/datafile/9?format=original" - }, - { - "@type": "cr:FileObject", - "@id": "doc/README.md", - "name": "README.md", - "encodingFormat": "text/markdown", - "md5": "a2e484d07ee5590cc32182dc2c6ccc83", - "contentSize": "28", - "description": "", - "contentUrl": "http://localhost:8080/api/access/datafile/8" - } - ] -} \ No newline at end of file From 74e0ee6cf4050b7e320e00687e1eedb2508c609d Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 30 Jan 2026 10:50:38 -0500 Subject: [PATCH 6/9] gitignore files generated by tests --- src/test/resources/croissant/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 src/test/resources/croissant/.gitignore diff --git a/src/test/resources/croissant/.gitignore b/src/test/resources/croissant/.gitignore new file mode 100644 index 00000000000..7aa31745061 --- /dev/null +++ b/src/test/resources/croissant/.gitignore @@ -0,0 +1,2 @@ +# these "out" files are generated when running tests 
+/*/out/croissant.json From fc254a53c3b8d06e9b5a0b7920e95946e887d141 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Fri, 30 Jan 2026 21:39:53 -0500 Subject: [PATCH 7/9] add croissant to expected export formats #11254 --- src/test/resources/json/export-formats.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/test/resources/json/export-formats.json b/src/test/resources/json/export-formats.json index 65fc746ee23..ab8f64f9076 100644 --- a/src/test/resources/json/export-formats.json +++ b/src/test/resources/json/export-formats.json @@ -49,6 +49,12 @@ "XMLSchemaLocation": "https://ddialliance.org/Specification/DDI-Codebook/2.5/XMLSchema/codebook.xsd", "XMLSchemaVersion": "2.5" }, + "croissant": { + "displayName": "Croissant", + "mediaType": "application/json", + "isHarvestable": false, + "isVisibleInUserInterface": true + }, "dcterms": { "displayName": "Dublin Core", "mediaType": "application/xml", From 1f09c918751a93600c5963e07780393b76048214 Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Mon, 2 Feb 2026 10:22:53 -0500 Subject: [PATCH 8/9] list new setting and fix typo #11254 --- doc/release-notes/11254-croissant-builtin.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/release-notes/11254-croissant-builtin.md b/doc/release-notes/11254-croissant-builtin.md index e25927a7220..f7bf20c510b 100644 --- a/doc/release-notes/11254-croissant-builtin.md +++ b/doc/release-notes/11254-croissant-builtin.md @@ -1,9 +1,13 @@ ## Croissant Support Is Now Built In -Croissant is a metadata export format for machine learning datasets that (until this release) was optional and implemented as external exporter. The code has been merged into the main Dataverse code base which means the Croissant format is automatically available in your installation of Dataverse, alongside older formats like Dublin Core and DDI. If you were using the external Croissant exporter, the merged code is equivalent to verion 0.1.6. 
Croissant bugs and feature requests should now be filed against the main Dataverse repo (https://github.com/IQSS/dataverse) and the old repo (https://github.com/gdcc/exporter-croissant) should be considered retired. +Croissant is a metadata export format for machine learning datasets that (until this release) was optional and implemented as external exporter. The code has been merged into the main Dataverse code base which means the Croissant format is automatically available in your installation of Dataverse, alongside older formats like Dublin Core and DDI. If you were using the external Croissant exporter, the merged code is equivalent to version 0.1.6. Croissant bugs and feature requests should now be filed against the main Dataverse repo (https://github.com/IQSS/dataverse) and the old repo (https://github.com/gdcc/exporter-croissant) should be considered retired. As described in the [Discoverability](https://dataverse-guide--12130.org.readthedocs.build/en/12130/admin/discoverability.html#id6) section of the Admin Guide, Croissant is inserted into the "head" of the HTML of dataset landing pages, as requested by the [Google Dataset Search](https://datasetsearch.research.google.com) team so that their tool can filter by datasets that support Croissant. In previous versions of Dataverse, when Croissant was optional and hadn't been enabled, we used the older "Schema.org JSON-LD" format in the "head". If you'd like to keep this behavior, you can use the feature flag [dataverse.feature.legacy-format-in-head](https://dataverse-guide--12130.org.readthedocs.build/en/12130/installation/config.html#dataverse-feature-legacy-format-in-head). We are aware that the amount of data in the "head" of the HTML can grow quite large for both Croissant and Schema.org JSON-LD. This is especially true of Croissant which exposes variable-level information. We plan to address this in https://github.com/IQSS/dataverse/issues/12123 . 
We also plan to support Croissant 1.1 in the future and are tracking this at https://github.com/IQSS/dataverse/issues/12014 . See also #11254 and #12130. + +## New Settings + +- dataverse.feature.legacy-format-in-head From a3d12476980280727e541077ff42119c241fe8cc Mon Sep 17 00:00:00 2001 From: Philip Durbin Date: Wed, 11 Feb 2026 12:07:58 -0500 Subject: [PATCH 9/9] convert feature flag to jvm option #11254 --- doc/release-notes/11254-croissant-builtin.md | 4 ++-- .../source/admin/discoverability.rst | 2 +- doc/sphinx-guides/source/installation/config.rst | 16 +++++++++------- .../edu/harvard/iq/dataverse/DatasetPage.java | 4 ++++ .../iq/dataverse/settings/FeatureFlags.java | 6 ------ .../iq/dataverse/settings/JvmSettings.java | 4 ++++ src/main/webapp/dataset.xhtml | 3 +-- 7 files changed, 21 insertions(+), 18 deletions(-) diff --git a/doc/release-notes/11254-croissant-builtin.md b/doc/release-notes/11254-croissant-builtin.md index f7bf20c510b..4e3af70da4f 100644 --- a/doc/release-notes/11254-croissant-builtin.md +++ b/doc/release-notes/11254-croissant-builtin.md @@ -2,7 +2,7 @@ Croissant is a metadata export format for machine learning datasets that (until this release) was optional and implemented as external exporter. The code has been merged into the main Dataverse code base which means the Croissant format is automatically available in your installation of Dataverse, alongside older formats like Dublin Core and DDI. If you were using the external Croissant exporter, the merged code is equivalent to version 0.1.6. Croissant bugs and feature requests should now be filed against the main Dataverse repo (https://github.com/IQSS/dataverse) and the old repo (https://github.com/gdcc/exporter-croissant) should be considered retired. 
-As described in the [Discoverability](https://dataverse-guide--12130.org.readthedocs.build/en/12130/admin/discoverability.html#id6) section of the Admin Guide, Croissant is inserted into the "head" of the HTML of dataset landing pages, as requested by the [Google Dataset Search](https://datasetsearch.research.google.com) team so that their tool can filter by datasets that support Croissant. In previous versions of Dataverse, when Croissant was optional and hadn't been enabled, we used the older "Schema.org JSON-LD" format in the "head". If you'd like to keep this behavior, you can use the feature flag [dataverse.feature.legacy-format-in-head](https://dataverse-guide--12130.org.readthedocs.build/en/12130/installation/config.html#dataverse-feature-legacy-format-in-head). +As described in the [Discoverability](https://dataverse-guide--12130.org.readthedocs.build/en/12130/admin/discoverability.html#id6) section of the Admin Guide, Croissant is inserted into the "head" of the HTML of dataset landing pages, as requested by the [Google Dataset Search](https://datasetsearch.research.google.com) team so that their tool can filter by datasets that support Croissant. In previous versions of Dataverse, when Croissant was optional and hadn't been enabled, we used the older "Schema.org JSON-LD" format in the "head". If you'd like to keep this behavior, you can use the JVM option [dataverse.legacy.schemaorg-in-html-head](https://dataverse-guide--12130.org.readthedocs.build/en/12130/installation/config.html#dataverse-legacy-schemaorg-in-html-head). We are aware that the amount of data in the "head" of the HTML can grow quite large for both Croissant and Schema.org JSON-LD. This is especially true of Croissant which exposes variable-level information. We plan to address this in https://github.com/IQSS/dataverse/issues/12123 . We also plan to support Croissant 1.1 in the future and are tracking this at https://github.com/IQSS/dataverse/issues/12014 .
@@ -10,4 +10,4 @@ See also #11254 and #12130. ## New Settings -- dataverse.feature.legacy-format-in-head +- dataverse.legacy.schemaorg-in-html-head diff --git a/doc/sphinx-guides/source/admin/discoverability.rst b/doc/sphinx-guides/source/admin/discoverability.rst index 285e7f248a7..cd38a35457a 100644 --- a/doc/sphinx-guides/source/admin/discoverability.rst +++ b/doc/sphinx-guides/source/admin/discoverability.rst @@ -45,7 +45,7 @@ This is the same Croissant file you can download from a dataset landing page by We include Croissant in the ```` because it's `recommended `_ by Google for `Google Dataset Search `_, where they offer a filter to narrow results to only datasets with support for Croissant. -Before Croissant was invented, Google recommended a different format that Dataverse refers to as "Schema.org JSON-LD" in the user interface (and ``schema.org`` in the API). If you prefer to put that older format in the ````, which was the behavior in older versions of Dataverse, see :ref:`dataverse.feature.legacy-format-in-head`. +Before Croissant was invented, Google recommended a different format that Dataverse refers to as "Schema.org JSON-LD" in the user interface (and ``schema.org`` in the API). If you prefer to put that older format in the ````, which was the behavior in older versions of Dataverse, see :ref:`dataverse.legacy.schemaorg-in-html-head`. .. _discovery-sign-posting: diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index f1bfdcc32f4..d84d0bc625a 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -3851,6 +3851,15 @@ Example: ``dataverse.api.mdc.min-delay-ms=100`` (enforces a minimum 100ms delay Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_API_MDC_MIN_DELAY_MS``. +.. 
_dataverse.legacy.schemaorg-in-html-head: + +dataverse.legacy.schemaorg-in-html-head ++++++++++++++++++++++++++++++++++++++++ + +Instead of Croissant, use the legacy format (Schema.org JSON-LD) in the head of dataset landing pages by setting ``dataverse.legacy.schemaorg-in-html-head=true``. See :ref:`croissant-head`. + +Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_LEGACY_SCHEMAORG_IN_HTML_HEAD``. + .. dataverse.ldn Linked Data Notifications (LDN) Allowed Hosts @@ -4031,13 +4040,6 @@ dataverse.feature.only-update-datacite-when-needed Only contact DataCite to update a DOI after checking to see if DataCite has outdated information (for efficiency, lighter load on DataCite, especially when using file DOIs). -.. _dataverse.feature.legacy-format-in-head: - -dataverse.feature.legacy-format-in-head -+++++++++++++++++++++++++++++++++++++++ - -Instead of Croissant, use the legacy format (Schema.org JSON-LD) in the head of dataset landing pages. See :ref:`croissant-head`. - .. _:ApplicationServerSettings: diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index d0c1eb239f6..4894bfe09a6 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -1485,6 +1485,10 @@ public boolean canSeeCurationStatus() { } } + public boolean isUseLegacyFormatInHead() { + return JvmSettings.SCHEMAORG_IN_HTML_HEAD.lookupOptional(Boolean.class).orElse(false); + } + /* * 4.2.1 optimization. * HOWEVER, this doesn't appear to be saving us anything! 
diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java index 74f3b471408..2e86fae610e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java @@ -250,12 +250,6 @@ public enum FeatureFlags { */ ONLY_UPDATE_DATACITE_WHEN_NEEDED("only-update-datacite-when-needed"), - /** - * Instead of Croissant, use the legacy format (Schema.org JSON-LD) in the head - * of dataset landing pages. By default this is false. - */ - LEGACY_FORMAT_IN_HEAD("legacy-format-in-head"), - ; final String flag; diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java index 05390ba8a8c..1ef6238a28b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/JvmSettings.java @@ -302,6 +302,10 @@ public enum JvmSettings { SCOPE_LOCALCONTEXTS(PREFIX, "localcontexts"), LOCALCONTEXTS_URL(SCOPE_LOCALCONTEXTS, "url"), LOCALCONTEXTS_API_KEY(SCOPE_LOCALCONTEXTS, "api-key"), + + // LEGACY SETTINGS + SCOPE_LEGACY(PREFIX, "legacy"), + SCHEMAORG_IN_HTML_HEAD(SCOPE_LEGACY, "schemaorg-in-html-head"), // LinkedDataNotification SCOPE_LINKEDDATANOTIFICATION(PREFIX, "ldn"), diff --git a/src/main/webapp/dataset.xhtml b/src/main/webapp/dataset.xhtml index ef7a7acf52b..9bb6fac9bec 100644 --- a/src/main/webapp/dataset.xhtml +++ b/src/main/webapp/dataset.xhtml @@ -95,8 +95,7 @@ - - +