diff --git a/doc/release-notes/11254-croissant-builtin.md b/doc/release-notes/11254-croissant-builtin.md new file mode 100644 index 00000000000..4e3af70da4f --- /dev/null +++ b/doc/release-notes/11254-croissant-builtin.md @@ -0,0 +1,13 @@ +## Croissant Support Is Now Built In + +Croissant is a metadata export format for machine learning datasets that (until this release) was optional and implemented as an external exporter. The code has been merged into the main Dataverse code base, which means the Croissant format is automatically available in your installation of Dataverse, alongside older formats like Dublin Core and DDI. If you were using the external Croissant exporter, the merged code is equivalent to version 0.1.6. Croissant bugs and feature requests should now be filed against the main Dataverse repo (https://github.com/IQSS/dataverse) and the old repo (https://github.com/gdcc/exporter-croissant) should be considered retired. + +As described in the [Discoverability](https://dataverse-guide--12130.org.readthedocs.build/en/12130/admin/discoverability.html#id6) section of the Admin Guide, Croissant is inserted into the "head" of the HTML of dataset landing pages, as requested by the [Google Dataset Search](https://datasetsearch.research.google.com) team so that their tool can filter by datasets that support Croissant. In previous versions of Dataverse, when Croissant was optional and hadn't been enabled, we used the older "Schema.org JSON-LD" format in the "head". If you'd like to keep this behavior, you can use the feature flag [dataverse.legacy.schemaorg-in-html-head](https://dataverse-guide--12130.org.readthedocs.build/en/12130/installation/config.html#dataverse.legacy.schemaorg-in-html-head). + +We are aware that the amount of data in the "head" of the HTML can grow quite large for both Croissant and Schema.org JSON-LD. This is especially true of Croissant, which exposes variable-level information. 
We plan to address this in https://github.com/IQSS/dataverse/issues/12123 . We also plan to support Croissant 1.1 in the future and are tracking this at https://github.com/IQSS/dataverse/issues/12014 . + +See also #11254 and #12130. + +## New Settings + +- dataverse.legacy.schemaorg-in-html-head diff --git a/doc/sphinx-guides/source/admin/discoverability.rst b/doc/sphinx-guides/source/admin/discoverability.rst index 22ff66246f0..cd38a35457a 100644 --- a/doc/sphinx-guides/source/admin/discoverability.rst +++ b/doc/sphinx-guides/source/admin/discoverability.rst @@ -30,21 +30,22 @@ The HTML source of a dataset landing page includes "DC" (Dublin Core) ```` `` of Dataset Landing Pages ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -The ```` of the HTML source of a dataset landing page includes Schema.org JSON-LD metadata like this:: +`Croissant `_ is a metadata format for machine learning datasets. +In Dataverse, the ```` of the HTML source of a dataset landing page includes Croissant metadata like this:: - diff --git a/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java b/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java new file mode 100644 index 00000000000..6c6da792d4e --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/export/CroissantExporterTest.java @@ -0,0 +1,311 @@
+package edu.harvard.iq.dataverse.export;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import io.gdcc.spi.export.ExportDataProvider;
+import jakarta.json.Json;
+import jakarta.json.JsonArray;
+import jakarta.json.JsonObject;
+import jakarta.json.JsonReader;
+import jakarta.json.JsonWriter;
+import jakarta.json.JsonWriterFactory;
+import jakarta.json.stream.JsonGenerator;
+import java.io.ByteArrayOutputStream;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Function;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.skyscreamer.jsonassert.JSONAssert;
+
+/**
+ * Tests {@link CroissantExporter} against the fixture sets under
+ * {@code src/test/resources/croissant}.
+ *
+ * <p>Each fixture set {@code <name>} supplies the exporter inputs in {@code <name>/in} and the
+ * expected Croissant document in {@code <name>/expected/<name>-croissant.json}. Every export test
+ * also writes its pretty-printed output to {@code <name>/out/croissant.json} so that a failing
+ * comparison can be inspected by hand.
+ */
+public class CroissantExporterTest {
+
+    /** Root directory of all Croissant fixture sets, relative to the working directory. */
+    private static final Path FIXTURE_ROOT = Paths.get("src/test/resources/croissant");
+
+    static CroissantExporter exporter;
+    static ByteArrayOutputStream outputStreamMinimal;
+    static ExportDataProvider dataProviderMinimal;
+    static ByteArrayOutputStream outputStreamMax;
+    static ExportDataProvider dataProviderMax;
+    static ByteArrayOutputStream outputStreamCars;
+    static ExportDataProvider dataProviderCars;
+    static ByteArrayOutputStream outputStreamRestricted;
+    static ExportDataProvider dataProviderRestricted;
+    static ByteArrayOutputStream outputStreamJunk;
+    static ExportDataProvider dataProviderJunk;
+    static ByteArrayOutputStream outputStreamDraft;
+    static ExportDataProvider dataProviderDraft;
+
+    /** Creates the shared exporter plus one output buffer and data provider per fixture set. */
+    @BeforeAll
+    public static void setUp() {
+        exporter = new CroissantExporter();
+
+        outputStreamMinimal = new ByteArrayOutputStream();
+        dataProviderMinimal = new FixtureDataProvider("minimal");
+
+        outputStreamMax = new ByteArrayOutputStream();
+        dataProviderMax = new FixtureDataProvider("max");
+
+        outputStreamCars = new ByteArrayOutputStream();
+        dataProviderCars = new FixtureDataProvider("cars");
+
+        outputStreamRestricted = new ByteArrayOutputStream();
+        dataProviderRestricted = new FixtureDataProvider("restricted");
+
+        outputStreamJunk = new ByteArrayOutputStream();
+        dataProviderJunk = new FixtureDataProvider("junk");
+
+        outputStreamDraft = new ByteArrayOutputStream();
+        dataProviderDraft = new FixtureDataProvider("draft");
+    }
+
+    @Test
+    public void testGetFormatName() {
+        assertEquals("croissant", exporter.getFormatName());
+    }
+
+    @Test
+    public void testGetDisplayName() {
+        assertEquals("Croissant", exporter.getDisplayName(null));
+    }
+
+    @Test
+    public void testIsHarvestable() {
+        assertFalse(exporter.isHarvestable());
+    }
+
+    @Test
+    public void testIsAvailableToUsers() {
+        assertTrue(exporter.isAvailableToUsers());
+    }
+
+    @Test
+    public void testGetMediaType() {
+        assertEquals("application/json", exporter.getMediaType());
+    }
+
+    @Test
+    public void testExportDatasetMinimal() throws Exception {
+        assertExportMatchesFixture("minimal", dataProviderMinimal, outputStreamMinimal);
+    }
+
+    /*
+    To compare the output with pyDataverse, first install pyDataverse from Dans-labs, the
+    "croissant" branch:
+    pip3 install --upgrade --no-cache-dir git+https://github.com/Dans-labs/pyDataverse@croissant#egg=pyDataverse
+    You can use this script to export Croissant from a dataset:
+    ---
+    from pyDataverse.Croissant import Croissant
+    import json
+    #host = "https://dataverse.nl"
+    #PID = "doi:10.34894/KMRAYH"
+    host = "https://beta.dataverse.org"
+    PID = "doi:10.5072/FK2/VQTYHD"
+    croissant = Croissant(host, PID)
+    print(json.dumps(croissant.get_record(), indent=4, default=str))
+    ---
+    Finally, save that output as /tmp/pyDataverse.json and temporarily add the lines below
+    after the export in assertExportMatchesFixture to check for differences:
+    String pyDataverse =
+            Files.readString(Paths.get("/tmp/pyDataverse.json"), StandardCharsets.UTF_8);
+    JSONAssert.assertEquals(actual, pyDataverse, true);
+    */
+    @Test
+    public void testExportDatasetMax() throws Exception {
+        assertExportMatchesFixture("max", dataProviderMax, outputStreamMax);
+    }
+
+    /*
+    The data in stata13-auto.dta looks something like this:
+    make price mpg rep78 headroom trunk weight length turn displacement gear_ratio foreign
+    "AMC Concord" 4099 22 3 2.5 11 2930 186 40 121 3.58 0
+    "AMC Pacer" 4749 17 3 3.0 11 3350 173 40 258 2.53 0
+    "AMC Spirit" 3799 22 3.0 12 2640 168 35 121 3.08 0
+    */
+    @Test
+    public void testExportDatasetCars() throws Exception {
+        assertExportMatchesFixture("cars", dataProviderCars, outputStreamCars);
+    }
+
+    /** Same as the cars data but the stata13-auto.dta file is restricted. */
+    @Test
+    public void testExportDatasetRestricted() throws Exception {
+        assertExportMatchesFixture("restricted", dataProviderRestricted, outputStreamRestricted);
+    }
+
+    @Test
+    public void testExportDatasetJunk() throws Exception {
+        assertExportMatchesFixture("junk", dataProviderJunk, outputStreamJunk);
+    }
+
+    @Test
+    public void testExportDatasetDraft() throws Exception {
+        assertExportMatchesFixture("draft", dataProviderDraft, outputStreamDraft);
+    }
+
+    /**
+     * Exports one fixture set and compares the result with its expected Croissant document.
+     *
+     * <p>The export is written to {@code <fixture>/out/croissant.json} before any assertion runs,
+     * so the file is available even when the comparison fails.
+     *
+     * @param fixture name of the fixture directory under {@link #FIXTURE_ROOT}
+     * @param provider supplies the exporter inputs for that fixture
+     * @param sink buffer the exporter writes into
+     * @throws Exception if the export, the file access, or the JSON comparison fails
+     */
+    private static void assertExportMatchesFixture(
+            String fixture, ExportDataProvider provider, ByteArrayOutputStream sink)
+            throws Exception {
+        exporter.exportDataset(provider, sink);
+        // Decode as UTF-8, like the expected file, not with the platform default charset.
+        String actual = sink.toString(StandardCharsets.UTF_8);
+        writeCroissantFile(actual, fixture);
+        Path expectedFile =
+                FIXTURE_ROOT.resolve(fixture + "/expected/" + fixture + "-croissant.json");
+        String expected = Files.readString(expectedFile, StandardCharsets.UTF_8);
+        // Strict mode: no extra fields are tolerated and array order matters.
+        JSONAssert.assertEquals(expected, actual, true);
+        assertEquals(prettyPrint(expected), prettyPrint(actual));
+    }
+
+    /** Writes the pretty-printed export to {@code <name>/out/croissant.json} for inspection. */
+    private static void writeCroissantFile(String actual, String name) throws IOException {
+        Path dir = Files.createDirectories(FIXTURE_ROOT.resolve(name).resolve("out"));
+        Path out = dir.resolve("croissant.json");
+        Files.writeString(out, prettyPrint(actual), StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Pretty-prints a serialized JSON object.
+     *
+     * @param jsonObject JSON text to reformat
+     * @return the reformatted text, or the input unchanged if it is not a parsable JSON object
+     */
+    public static String prettyPrint(String jsonObject) {
+        try {
+            return prettyPrint(getJsonObject(jsonObject));
+        } catch (Exception ignored) {
+            // Deliberate fallback: malformed input is handed back as-is.
+            return jsonObject;
+        }
+    }
+
+    /**
+     * Serializes a JSON object with pretty-printing enabled.
+     *
+     * @param jsonObject object to serialize
+     * @return the pretty-printed JSON text
+     */
+    public static String prettyPrint(JsonObject jsonObject) {
+        Map<String, Object> config = new HashMap<>();
+        config.put(JsonGenerator.PRETTY_PRINTING, true);
+        JsonWriterFactory jsonWriterFactory = Json.createWriterFactory(config);
+        StringWriter stringWriter = new StringWriter();
+        try (JsonWriter jsonWriter = jsonWriterFactory.createWriter(stringWriter)) {
+            jsonWriter.writeObject(jsonObject);
+        }
+        return stringWriter.toString();
+    }
+
+    /**
+     * Parses JSON text into a {@link JsonObject}.
+     *
+     * @param serializedJson JSON text whose top-level value is an object
+     * @return the parsed object
+     */
+    public static JsonObject getJsonObject(String serializedJson) {
+        try (StringReader rdr = new StringReader(serializedJson);
+                JsonReader jsonReader = Json.createReader(rdr)) {
+            return jsonReader.readObject();
+        }
+    }
+
+    /**
+     * {@link ExportDataProvider} backed by the files in {@code <fixture>/in} of one fixture set.
+     *
+     * <p>A JSON input that cannot be opened, or DataCite XML that cannot be read, is reported as
+     * {@code null} instead of an exception. NOTE(review): failing fast would be stricter; confirm
+     * that every fixture set ships all five inputs before tightening this.
+     */
+    private static final class FixtureDataProvider implements ExportDataProvider {
+
+        private final Path inputDir;
+
+        FixtureDataProvider(String fixture) {
+            this.inputDir = FIXTURE_ROOT.resolve(fixture).resolve("in");
+        }
+
+        @Override
+        public JsonObject getDatasetJson() {
+            return readJson("datasetJson.json", JsonReader::readObject);
+        }
+
+        @Override
+        public JsonObject getDatasetORE() {
+            return readJson("datasetORE.json", JsonReader::readObject);
+        }
+
+        @Override
+        public JsonArray getDatasetFileDetails() {
+            return readJson("datasetFileDetails.json", JsonReader::readArray);
+        }
+
+        @Override
+        public JsonObject getDatasetSchemaDotOrg() {
+            return readJson("datasetSchemaDotOrg.json", JsonReader::readObject);
+        }
+
+        @Override
+        public String getDataCiteXml() {
+            Path xmlFile = inputDir.resolve("dataCiteXml.xml");
+            try {
+                return Files.readString(xmlFile, StandardCharsets.UTF_8);
+            } catch (IOException ignored) {
+                // Lenient on purpose: an unreadable file means "no DataCite XML".
+                return null;
+            }
+        }
+
+        /**
+         * Reads one JSON input file of this fixture set as UTF-8.
+         *
+         * @param fileName file name inside the fixture's {@code in} directory
+         * @param parser extracts the wanted top-level value from the reader
+         * @return the parsed value, or {@code null} if the file cannot be opened
+         */
+        private <T> T readJson(String fileName, Function<JsonReader, T> parser) {
+            try (FileReader fileReader =
+                            new FileReader(
+                                    inputDir.resolve(fileName).toFile(), StandardCharsets.UTF_8);
+                    JsonReader jsonReader = Json.createReader(fileReader)) {
+                return parser.apply(jsonReader);
+            } catch (FileNotFoundException ignored) {
+                return null;
+            } catch (IOException ex) {
+                throw new UncheckedIOException("Cannot read fixture file " + fileName, ex);
+            }
+        }
+    }
+}
diff --git a/src/test/resources/croissant/.gitignore b/src/test/resources/croissant/.gitignore new file mode 100644 index 00000000000..7aa31745061 --- /dev/null +++ b/src/test/resources/croissant/.gitignore @@ -0,0 +1,2 @@ +# these "out" files are generated when running tests +/*/out/croissant.json diff --git a/src/test/resources/croissant/cars/expected/cars-croissant.json b/src/test/resources/croissant/cars/expected/cars-croissant.json new file mode 100644 index 00000000000..a9c0d48b217 --- /dev/null +++ b/src/test/resources/croissant/cars/expected/cars-croissant.json @@ -0,0 +1,302 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": 
"cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Cars", + "url": "https://doi.org/10.5072/FK2/CY7BWA", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/CY7BWA_2025,author = {Durbin, Philip},publisher = {Root},title = {Cars},year = {2025},url = {https://doi.org/10.5072/FK2/CY7BWA}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "code/compute.py", + "name": "compute.py", + "encodingFormat": "text/x-python", + "md5": "d84985e94dde671f318076bd7a137f15", + "contentSize": "15", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "cr:FileObject", + "@id": "data/stata13-auto.dta", + "name": "stata13-auto.dta", + "encodingFormat": "application/x-stata-13", + "md5": "7b1201ce6b469796837a835377338c5a", + "contentSize": "6443", + "description": "", + "contentUrl": 
"http://localhost:8080/api/access/datafile/9?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "contentSize": "28", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "field": [ + { + "@type": "cr:Field", + "name": "make", + "description": "Make and Model", + "dataType": "sc:Text", + "source": { + "@id": "2", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "make" + } + } + }, + { + "@type": "cr:Field", + "name": "price", + "description": "Price", + "dataType": "sc:Integer", + "source": { + "@id": "5", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "price" + } + } + }, + { + "@type": "cr:Field", + "name": "mpg", + "description": "Mileage (mpg)", + "dataType": "sc:Integer", + "source": { + "@id": "3", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "mpg" + } + } + }, + { + "@type": "cr:Field", + "name": "rep78", + "description": "Repair Record 1978", + "dataType": "sc:Integer", + "source": { + "@id": "12", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "rep78" + } + } + }, + { + "@type": "cr:Field", + "name": "headroom", + "description": "Headroom (in.)", + "dataType": "sc:Float", + "source": { + "@id": "1", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "headroom" + } + } + }, + { + "@type": "cr:Field", + "name": "trunk", + "description": "Trunk space (cu. 
ft.)", + "dataType": "sc:Integer", + "source": { + "@id": "7", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "trunk" + } + } + }, + { + "@type": "cr:Field", + "name": "weight", + "description": "Weight (lbs.)", + "dataType": "sc:Integer", + "source": { + "@id": "4", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "weight" + } + } + }, + { + "@type": "cr:Field", + "name": "length", + "description": "Length (in.)", + "dataType": "sc:Integer", + "source": { + "@id": "8", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "length" + } + } + }, + { + "@type": "cr:Field", + "name": "turn", + "description": "Turn Circle (ft.) ", + "dataType": "sc:Integer", + "source": { + "@id": "9", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "turn" + } + } + }, + { + "@type": "cr:Field", + "name": "displacement", + "description": "Displacement (cu. in.)", + "dataType": "sc:Integer", + "source": { + "@id": "10", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "displacement" + } + } + }, + { + "@type": "cr:Field", + "name": "gear_ratio", + "description": "Gear Ratio", + "dataType": "sc:Float", + "source": { + "@id": "6", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "gear_ratio" + } + } + }, + { + "@type": "cr:Field", + "name": "foreign", + "description": "Car type", + "dataType": "sc:Integer", + "source": { + "@id": "11", + "fileObject": { + "@id": "data/stata13-auto.dta" + }, + "extract": { + "column": "foreign" + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/cars/in/dataCiteXml.xml b/src/test/resources/croissant/cars/in/dataCiteXml.xml new file mode 100644 index 00000000000..7c6c89385fd --- /dev/null +++ b/src/test/resources/croissant/cars/in/dataCiteXml.xml @@ -0,0 +1,51 @@ + + + 10.5072/FK2/CY7BWA + + + Durbin, Philip + 
Philip + Durbin + Harvard + + + + Cars + + Root + 2025 + + Other + + + + Durbin, Philip + Philip + Durbin + Harvard + + + + 2024-03-13 + 2025-05-16 + + + + 15 + 28 + 4026 + + + text/x-python + text/markdown + text/tab-separated-values + + 1.0 + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + This dataset is about cars. + + diff --git a/src/test/resources/croissant/cars/in/datasetFileDetails.json b/src/test/resources/croissant/cars/in/datasetFileDetails.json new file mode 100644 index 00000000000..2ce12a4abe9 --- /dev/null +++ b/src/test/resources/croissant/cars/in/datasetFileDetails.json @@ -0,0 +1,355 @@ +[ + { + "id": 7, + "persistentId": "", + "filename": "compute.py", + "contentType": "text/x-python", + "friendlyType": "Python Source Code", + "filesize": 15, + "description": "", + "storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "rootDataFileId": -1, + "md5": "d84985e94dde671f318076bd7a137f15", + "checksum": { + "type": "MD5", + "value": "d84985e94dde671f318076bd7a137f15" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 1, + "varGroups": [] + }, + { + "id": 9, + "persistentId": "", + "filename": "stata13-auto.tab", + "contentType": "text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 4026, + "description": "", + "storageIdentifier": "local://196d9f15719-2270bfca2b48", + "originalFileFormat": "application/x-stata-13", + "originalFormatLabel": "Stata 13 Binary", + "originalFileSize": 6443, + "originalFileName": "stata13-auto.dta", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "rootDataFileId": -1, + "md5": "7b1201ce6b469796837a835377338c5a", + "checksum": { + "type": "MD5", + "value": "7b1201ce6b469796837a835377338c5a" + }, + "tabularData": true, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 3, + 
"dataTables": [ + { + "varQuantity": 12, + "caseQuantity": 74, + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "dataVariables": [ + { + "id": 2, + "name": "make", + "label": "Make and Model", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "CHARACTER", + "isOrderedCategorical": false, + "fileOrder": 0, + "UNF": "UNF:6:Oo4vwiL8ffhSECOcjsKk2g==", + "variableMetadata": [] + }, + { + "id": 5, + "name": "price", + "label": "Price", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 1, + "UNF": "UNF:6:rvfkkdA36AaCSqCQciybfA==", + "variableMetadata": [], + "summaryStatistics": { + "min": "3291.0", + "medn": "5006.5", + "mean": "6165.256756756757", + "max": "15906.0", + "vald": "74.0", + "mode": ".", + "stdev": "2949.4958847689186", + "invd": "0.0" + } + }, + { + "id": 3, + "name": "mpg", + "label": "Mileage (mpg)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 2, + "UNF": "UNF:6:vVr3w8CgeZq1KpDfJQudOg==", + "variableMetadata": [], + "summaryStatistics": { + "max": "41.0", + "vald": "74.0", + "medn": "20.0", + "min": "12.0", + "stdev": "5.785503209735141", + "mean": "21.2972972972973", + "invd": "0.0", + "mode": "." 
+ } + }, + { + "id": 12, + "name": "rep78", + "label": "Repair Record 1978", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 3, + "UNF": "UNF:6:gbFI98swTWNhAjCRyi2cdA==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "0.989932270109041", + "mode": ".", + "min": "1.0", + "max": "5.0", + "medn": "3.0", + "mean": "3.4057971014492754", + "vald": "69.0", + "invd": "5.0" + } + }, + { + "id": 1, + "name": "headroom", + "label": "Headroom (in.)", + "weighted": false, + "variableIntervalType": "contin", + "variableFormatType": "NUMERIC", + "format": "float", + "isOrderedCategorical": false, + "fileOrder": 4, + "UNF": "UNF:6:g4Pl3T0Oz2e/OKJ64WiTnA==", + "variableMetadata": [], + "summaryStatistics": { + "mean": "2.993243243243243", + "mode": ".", + "vald": "74.0", + "invd": "0.0", + "stdev": "0.845994766828771", + "min": "1.5", + "medn": "3.0", + "max": "5.0" + } + }, + { + "id": 7, + "name": "trunk", + "label": "Trunk space (cu. 
ft.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 5, + "UNF": "UNF:6:iab0POsE3By7dQfgX/TY4g==", + "variableMetadata": [], + "summaryStatistics": { + "vald": "74.0", + "mode": ".", + "mean": "13.756756756756756", + "max": "23.0", + "min": "5.0", + "medn": "14.0", + "invd": "0.0", + "stdev": "4.277404189173201" + } + }, + { + "id": 4, + "name": "weight", + "label": "Weight (lbs.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 6, + "UNF": "UNF:6:cdoTdfUNeYWHHFEBCDxg+w==", + "variableMetadata": [], + "summaryStatistics": { + "invd": "0.0", + "min": "1760.0", + "vald": "74.0", + "max": "4840.0", + "stdev": "777.1935671373664", + "mean": "3019.459459459459", + "mode": ".", + "medn": "3190.0" + } + }, + { + "id": 8, + "name": "length", + "label": "Length (in.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 7, + "UNF": "UNF:6:8z1rjwhqBN4meYIiKI4P1A==", + "variableMetadata": [], + "summaryStatistics": { + "mode": ".", + "vald": "74.0", + "invd": "0.0", + "stdev": "22.266339902021585", + "max": "233.0", + "medn": "192.5", + "mean": "187.93243243243245", + "min": "142.0" + } + }, + { + "id": 9, + "name": "turn", + "label": "Turn Circle (ft.) ", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 8, + "UNF": "UNF:6:QxhjrrNtVz4qA8RulQ2MuQ==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "4.399353727233908", + "vald": "74.0", + "max": "51.0", + "min": "31.0", + "medn": "40.0", + "mean": "39.648648648648646", + "invd": "0.0", + "mode": "." + } + }, + { + "id": 10, + "name": "displacement", + "label": "Displacement (cu. 
in.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 9, + "UNF": "UNF:6:ftk+RAQpTCT1/y6G/rLWfA==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "91.83721896440396", + "invd": "0.0", + "min": "79.0", + "medn": "196.0", + "mode": ".", + "vald": "74.0", + "mean": "197.2972972972973", + "max": "425.0" + } + }, + { + "id": 6, + "name": "gear_ratio", + "label": "Gear Ratio", + "weighted": false, + "variableIntervalType": "contin", + "variableFormatType": "NUMERIC", + "format": "float", + "isOrderedCategorical": false, + "fileOrder": 10, + "UNF": "UNF:6:qjnY/qbx26FTepoPqRZ6lw==", + "variableMetadata": [], + "summaryStatistics": { + "medn": "2.9550000429153442", + "stdev": "0.45628709670763035", + "mean": "3.0148648667979883", + "min": "2.190000057220459", + "max": "3.890000104904175", + "mode": ".", + "vald": "74.0", + "invd": "0.0" + } + }, + { + "id": 11, + "name": "foreign", + "label": "Car type", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 11, + "UNF": "UNF:6:nbjzgh3wfAFqKpaoFnHalA==", + "variableMetadata": [], + "summaryStatistics": { + "max": "1.0", + "invd": "0.0", + "mode": ".", + "medn": "0.0", + "stdev": "0.46018845840901884", + "min": "0.0", + "mean": "0.2972972972972975", + "vald": "74.0" + }, + "variableCategories": [ + { + "label": "Domestic", + "value": "0", + "isMissing": false, + "frequency": 52.0 + }, + { + "label": "Foreign", + "value": "1", + "isMissing": false, + "frequency": 22.0 + } + ] + } + ] + } + ], + "varGroups": [] + }, + { + "id": 8, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 28, + "description": "", + "storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + "rootDataFileId": -1, + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "checksum": { + 
"type": "MD5", + "value": "a2e484d07ee5590cc32182dc2c6ccc83" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 2, + "varGroups": [] + } +] diff --git a/src/test/resources/croissant/cars/in/datasetJson.json b/src/test/resources/croissant/cars/in/datasetJson.json new file mode 100644 index 00000000000..96aa26c9228 --- /dev/null +++ b/src/test/resources/croissant/cars/in/datasetJson.json @@ -0,0 +1,228 @@ +{ + "id": 6, + "identifier": "FK2/CY7BWA", + "persistentUrl": "https://doi.org/10.5072/FK2/CY7BWA", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2025-05-16", + "storageIdentifier": "local://10.5072/FK2/CY7BWA", + "datasetType": "dataset", + "datasetVersion": { + "id": 3, + "datasetId": 6, + "datasetPersistentId": "doi:10.5072/FK2/CY7BWA", + "storageIdentifier": "local://10.5072/FK2/CY7BWA", + "versionNumber": 1, + "internalVersionNumber": 10, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "lastUpdateTime": "2025-05-16T16:33:18Z", + "releaseTime": "2025-05-16T16:33:18Z", + "createTime": "2025-05-16T16:33:13Z", + "publicationDate": "2025-05-16", + "citationDate": "2025-05-16", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Cars" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + 
"value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": "datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "datasetContactAffiliation": { + "typeName": "datasetContactAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "This dataset is about cars." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2024-03-13" + } + ] + } + }, + "files": [ + { + "description": "", + "label": "compute.py", + "restricted": false, + "directoryLabel": "code", + "version": 2, + "datasetVersionId": 3, + "dataFile": { + "id": 7, + "persistentId": "", + "filename": "compute.py", + "contentType": "text/x-python", + "friendlyType": "Python Source Code", + "filesize": 15, + "description": "", + "storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "rootDataFileId": -1, + "md5": "d84985e94dde671f318076bd7a137f15", + "checksum": { + "type": "MD5", + "value": "d84985e94dde671f318076bd7a137f15" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + }, + { + "description": "", + "label": "README.md", + "restricted": false, + "directoryLabel": "doc", + "version": 2, + "datasetVersionId": 3, + "dataFile": { + "id": 8, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 28, + "description": "", + "storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + "rootDataFileId": -1, + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "checksum": { + "type": "MD5", + "value": "a2e484d07ee5590cc32182dc2c6ccc83" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + }, + { + "description": "", + "label": "stata13-auto.tab", + "restricted": false, + "directoryLabel": "data", + "version": 4, + "datasetVersionId": 3, + "dataFile": { + "id": 9, + "persistentId": "", + "filename": "stata13-auto.tab", + "contentType": 
"text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 4026, + "description": "", + "storageIdentifier": "local://196d9f15719-2270bfca2b48", + "originalFileFormat": "application/x-stata-13", + "originalFormatLabel": "Stata 13 Binary", + "originalFileSize": 6443, + "originalFileName": "stata13-auto.dta", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "rootDataFileId": -1, + "md5": "7b1201ce6b469796837a835377338c5a", + "checksum": { + "type": "MD5", + "value": "7b1201ce6b469796837a835377338c5a" + }, + "tabularData": true, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + } + ], + "citation": "Durbin, Philip, 2025, \"Cars\", https://doi.org/10.5072/FK2/CY7BWA, Root, V1, UNF:6:RPd9EWHSZwqUvRZuKTJMqg== [fileUNF]" + } +} diff --git a/src/test/resources/croissant/cars/in/datasetORE.json b/src/test/resources/croissant/cars/in/datasetORE.json new file mode 100644 index 00000000000..0b244ada0c1 --- /dev/null +++ b/src/test/resources/croissant/cars/in/datasetORE.json @@ -0,0 +1,133 @@ +{ + "dcterms:modified": "2025-05-19", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.6", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/CY7BWA", + "ore:describes": { + "citation:datasetContact": { + "citation:datasetContactName": "Durbin, Philip", + "citation:datasetContactAffiliation": "Harvard", + "citation:datasetContactEmail": "dataverse@mailinator.com" + }, + "author": { + "citation:authorName": "Durbin, Philip", + "citation:authorAffiliation": "Harvard" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "This dataset is about cars." 
+ }, + "dateOfDeposit": "2024-03-13", + "title": "Cars", + "citation:depositor": "Durbin, Philip", + "subject": "Other", + "@id": "https://doi.org/10.5072/FK2/CY7BWA", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "Cars", + "schema:dateModified": "Fri May 16 16:33:18 UTC 2025", + "schema:datePublished": "2025-05-16", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Cars", + "@id": "http://localhost:8080/dataverse/cars", + "schema:description": "Data about cars.", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." + } + }, + "ore:aggregates": [ + { + "schema:description": "", + "schema:name": "compute.py", + "dvcore:restricted": false, + "dvcore:directoryLabel": "code", + "schema:version": 2, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=7", + "schema:sameAs": "http://localhost:8080/api/access/datafile/7", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/x-python", + "dvcore:filesize": 15, + "dvcore:storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "d84985e94dde671f318076bd7a137f15" + } + }, + { + "schema:description": "", + "schema:name": "README.md", + "dvcore:restricted": false, + "dvcore:directoryLabel": "doc", + "schema:version": 2, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=8", + "schema:sameAs": "http://localhost:8080/api/access/datafile/8", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/markdown", + "dvcore:filesize": 28, + "dvcore:storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + 
"dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "a2e484d07ee5590cc32182dc2c6ccc83" + } + }, + { + "schema:description": "", + "schema:name": "stata13-auto.dta", + "dvcore:restricted": false, + "dvcore:directoryLabel": "data", + "schema:version": 4, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=9", + "schema:sameAs": "http://localhost:8080/api/access/datafile/9?format=original", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "application/x-stata-13", + "dvcore:filesize": 6443, + "dvcore:storageIdentifier": "local://196d9f15719-2270bfca2b48", + "dvcore:currentIngestedName": "stata13-auto.tab", + "dvcore:UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "7b1201ce6b469796837a835377338c5a" + } + } + ], + "schema:hasPart": [ + "http://localhost:8080/file.xhtml?fileId=7", + "http://localhost:8080/file.xhtml?fileId=8", + "http://localhost:8080/file.xhtml?fileId=9" + ] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/cars/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/cars/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..83f587c5fd7 --- /dev/null +++ b/src/test/resources/croissant/cars/in/datasetSchemaDotOrg.json @@ -0,0 +1,78 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/CY7BWA", + "identifier": "https://doi.org/10.5072/FK2/CY7BWA", + "name": "Cars", + "creator": [ + { + "@type": 
"Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "version": "1", + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + }, + "distribution": [ + { + "@type": "DataDownload", + "name": "compute.py", + "encodingFormat": "text/x-python", + "contentSize": 15, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "DataDownload", + "name": "stata13-auto.tab", + "encodingFormat": "text/tab-separated-values", + "contentSize": 4026, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/9" + }, + { + "@type": "DataDownload", + "name": "README.md", + "encodingFormat": "text/markdown", + "contentSize": 28, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ] +} diff --git a/src/test/resources/croissant/draft/expected/draft-croissant.json b/src/test/resources/croissant/draft/expected/draft-croissant.json new file mode 100644 index 00000000000..b2065f79195 --- /dev/null +++ b/src/test/resources/croissant/draft/expected/draft-croissant.json @@ -0,0 +1,94 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": 
"http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Draft Dataset", + "url": "https://doi.org/10.5072/FK2/OO7TEP", + "creator": [ + { + "@type": "Person", + "givenName": "Draft", + "familyName": "Punk", + "affiliation": { + "@type": "Organization", + "name": "French house" + }, + "name": "Punk, Draft" + } + ], + "description": "This dataset hasn't been published yet.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "dateModified": "", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "DRAFT", + "citeAs": "@data{FK2/OO7TEP,author = {Punk, Draft},publisher = {Root},title = {Draft Dataset},url = {https://doi.org/10.5072/FK2/OO7TEP}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "data.txt", + "name": "data.txt", + "encodingFormat": "text/plain", + 
"md5": "050644e853fdfe46a3707695ba2fe736", + "contentSize": "18", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/4" + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/draft/in/dataCiteXml.xml b/src/test/resources/croissant/draft/in/dataCiteXml.xml new file mode 100644 index 00000000000..814f3d365e7 --- /dev/null +++ b/src/test/resources/croissant/draft/in/dataCiteXml.xml @@ -0,0 +1,46 @@ + + + 10.5072/FK2/OO7TEP + + + Punk, Draft + Draft + Punk + French house + + + + Draft Dataset + + Root + 2025 + + Other + + + + Admin, Dataverse + Dataverse + Admin + Dataverse.org + + + + 2025-04-14 + + + + 18 + + + text/plain + + DRAFT + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + This dataset hasn&apos;t been published yet. + + diff --git a/src/test/resources/croissant/draft/in/datasetFileDetails.json b/src/test/resources/croissant/draft/in/datasetFileDetails.json new file mode 100644 index 00000000000..1460aedba00 --- /dev/null +++ b/src/test/resources/croissant/draft/in/datasetFileDetails.json @@ -0,0 +1,23 @@ +[ + { + "id": 4, + "persistentId": "", + "filename": "data.txt", + "contentType": "text/plain", + "friendlyType": "Plain Text", + "filesize": 18, + "storageIdentifier": "local://196347bdb85-7b4820f8e4ef", + "rootDataFileId": -1, + "md5": "050644e853fdfe46a3707695ba2fe736", + "checksum": { + "type": "MD5", + "value": "050644e853fdfe46a3707695ba2fe736" + }, + "tabularData": false, + "creationDate": "2025-04-14", + "fileAccessRequest": false, + "restricted": false, + "fileMetadataId": 1, + "varGroups": [] + } +] diff --git a/src/test/resources/croissant/draft/in/datasetJson.json b/src/test/resources/croissant/draft/in/datasetJson.json new file mode 100644 index 00000000000..bbfd30ed03a --- /dev/null +++ b/src/test/resources/croissant/draft/in/datasetJson.json @@ -0,0 +1,156 @@ +{ + "id": 3, + "identifier": "FK2/OO7TEP", + "persistentUrl": "https://doi.org/10.5072/FK2/OO7TEP", + 
"protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "storageIdentifier": "local://10.5072/FK2/OO7TEP", + "datasetType": "dataset", + "datasetVersion": { + "id": 1, + "datasetId": 3, + "datasetPersistentId": "doi:10.5072/FK2/OO7TEP", + "storageIdentifier": "local://10.5072/FK2/OO7TEP", + "internalVersionNumber": 2, + "versionState": "DRAFT", + "latestVersionPublishingState": "DRAFT", + "lastUpdateTime": "2025-04-14T13:27:47Z", + "createTime": "2025-04-14T13:26:41Z", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Draft Dataset" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Punk, Draft" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "French house" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": "datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Admin, Dataverse" + }, + "datasetContactAffiliation": { + "typeName": "datasetContactAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Dataverse.org" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, 
+ { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "This dataset hasn't been published yet." + } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Admin, Dataverse" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2025-04-14" + } + ] + } + }, + "files": [ + { + "label": "data.txt", + "restricted": false, + "version": 1, + "datasetVersionId": 1, + "dataFile": { + "id": 4, + "persistentId": "", + "filename": "data.txt", + "contentType": "text/plain", + "friendlyType": "Plain Text", + "filesize": 18, + "storageIdentifier": "local://196347bdb85-7b4820f8e4ef", + "rootDataFileId": -1, + "md5": "050644e853fdfe46a3707695ba2fe736", + "checksum": { + "type": "MD5", + "value": "050644e853fdfe46a3707695ba2fe736" + }, + "tabularData": false, + "creationDate": "2025-04-14", + "fileAccessRequest": false + } + } + ], + "citation": "Punk, Draft, 2025, \"Draft Dataset\", https://doi.org/10.5072/FK2/OO7TEP, Root, DRAFT VERSION" + } +} diff --git a/src/test/resources/croissant/draft/in/datasetORE.json b/src/test/resources/croissant/draft/in/datasetORE.json new file mode 100644 index 00000000000..8f9cfe6fb63 --- /dev/null +++ b/src/test/resources/croissant/draft/in/datasetORE.json @@ -0,0 +1,87 @@ +{ + "dcterms:modified": "2025-04-14", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.6", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": 
"http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/OO7TEP", + "ore:describes": { + "author": { + "citation:authorName": "Punk, Draft", + "citation:authorAffiliation": "French house" + }, + "citation:datasetContact": { + "citation:datasetContactName": "Admin, Dataverse", + "citation:datasetContactAffiliation": "Dataverse.org", + "citation:datasetContactEmail": "dataverse@mailinator.com" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "This dataset hasn't been published yet." + }, + "dateOfDeposit": "2025-04-14", + "citation:depositor": "Admin, Dataverse", + "subject": "Other", + "title": "Draft Dataset", + "@id": "https://doi.org/10.5072/FK2/OO7TEP", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "DRAFT", + "schema:name": "Draft Dataset", + "schema:dateModified": "Mon Apr 14 13:27:47 UTC 2025", + "schema:creativeWorkStatus": "DRAFT", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Draft Collection", + "@id": "http://localhost:8080/dataverse/draft", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." 
+ } + }, + "ore:aggregates": [ + { + "schema:name": "data.txt", + "dvcore:restricted": false, + "schema:version": 1, + "dvcore:datasetVersionId": 1, + "@id": "http://localhost:8080/file.xhtml?fileId=4", + "schema:sameAs": "http://localhost:8080/api/access/datafile/4", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/plain", + "dvcore:filesize": 18, + "dvcore:storageIdentifier": "local://196347bdb85-7b4820f8e4ef", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "050644e853fdfe46a3707695ba2fe736" + } + } + ], + "schema:hasPart": [ + "http://localhost:8080/file.xhtml?fileId=4" + ] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/draft/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/draft/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..62328140af8 --- /dev/null +++ b/src/test/resources/croissant/draft/in/datasetSchemaDotOrg.json @@ -0,0 +1,60 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/OO7TEP", + "identifier": "https://doi.org/10.5072/FK2/OO7TEP", + "name": "Draft Dataset", + "creator": [ + { + "@type": "Person", + "givenName": "Draft", + "familyName": "Punk", + "affiliation": { + "@type": "Organization", + "name": "French house" + }, + "name": "Punk, Draft" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Draft", + "familyName": "Punk", + "affiliation": { + "@type": "Organization", + "name": "French house" + }, + "name": "Punk, Draft" + } + ], + "dateModified": "", + 
"version": "DRAFT", + "description": "This dataset hasn't been published yet.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + }, + "distribution": [ + { + "@type": "DataDownload", + "name": "data.txt", + "encodingFormat": "text/plain", + "contentSize": 18, + "contentUrl": "http://localhost:8080/api/access/datafile/4" + } + ] +} diff --git a/src/test/resources/croissant/junk/expected/junk-croissant.json b/src/test/resources/croissant/junk/expected/junk-croissant.json new file mode 100644 index 00000000000..b02bed5694e --- /dev/null +++ b/src/test/resources/croissant/junk/expected/junk-croissant.json @@ -0,0 +1,83 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + 
"source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "</script><script>alert(666)</script>", + "url": "https://doi.org/10.5072/FK2/0CNXUJ", + "creator": [ + { + "@type": "Person", + "givenName": "Sylvester", + "familyName": "Ritter", + "affiliation": { + "@type": "Organization", + "name": "WWF" + }, + "name": "Ritter, Sylvester" + } + ], + "description": "A junk dataset.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-03-13", + "dateModified": "2025-03-13", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/0CNXUJ_2025,author = {Ritter, Sylvester},publisher = {Root},title = {},year = {2025},url = {https://doi.org/10.5072/FK2/0CNXUJ}}" +} \ No newline at end of file diff --git a/src/test/resources/croissant/junk/in/dataCiteXml.xml b/src/test/resources/croissant/junk/in/dataCiteXml.xml new file mode 100644 index 00000000000..d6c11b056e2 --- /dev/null +++ b/src/test/resources/croissant/junk/in/dataCiteXml.xml @@ -0,0 +1,33 @@ + + + 10.5072/FK2/0CNXUJ + + + Ritter, Sylvester + Sylvester + Ritter + WWF + + + + :unav + + Root + 2025 + + Other + + + 2025-03-13 + 2025-03-13 + + + 1.0 + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + A junk dataset. 
+ + diff --git a/src/test/resources/croissant/junk/in/datasetFileDetails.json b/src/test/resources/croissant/junk/in/datasetFileDetails.json new file mode 100644 index 00000000000..fe51488c706 --- /dev/null +++ b/src/test/resources/croissant/junk/in/datasetFileDetails.json @@ -0,0 +1 @@ +[] diff --git a/src/test/resources/croissant/junk/in/datasetJson.json b/src/test/resources/croissant/junk/in/datasetJson.json new file mode 100644 index 00000000000..984ae55cb92 --- /dev/null +++ b/src/test/resources/croissant/junk/in/datasetJson.json @@ -0,0 +1,124 @@ +{ + "id": 2, + "identifier": "FK2/0CNXUJ", + "persistentUrl": "https://doi.org/10.5072/FK2/0CNXUJ", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2025-03-13", + "storageIdentifier": "local://10.5072/FK2/0CNXUJ", + "datasetType": "dataset", + "datasetVersion": { + "id": 1, + "datasetId": 2, + "datasetPersistentId": "doi:10.5072/FK2/0CNXUJ", + "storageIdentifier": "local://10.5072/FK2/0CNXUJ", + "versionNumber": 1, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "lastUpdateTime": "2025-03-13T14:56:36Z", + "releaseTime": "2025-03-13T14:56:36Z", + "createTime": "2025-03-13T14:56:26Z", + "publicationDate": "2025-03-13", + "citationDate": "2025-03-13", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + 
"typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Ritter, Sylvester" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "WWF" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "A junk dataset." + } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Admin, Dataverse" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2025-03-13" + } + ] + } + }, + "files": [], + "citation": "Ritter, Sylvester, 2025, https://doi.org/10.5072/FK2/0CNXUJ, Root, V1" + } +} diff --git a/src/test/resources/croissant/junk/in/datasetORE.json b/src/test/resources/croissant/junk/in/datasetORE.json new file mode 100644 index 00000000000..646955bbb17 --- /dev/null +++ b/src/test/resources/croissant/junk/in/datasetORE.json @@ -0,0 +1,62 @@ +{ + "dcterms:modified": "2025-03-13", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.5", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/0CNXUJ", + "ore:describes": { + 
"citation:dsDescription": { + "citation:dsDescriptionValue": "A junk dataset." + }, + "author": { + "citation:authorName": "Ritter, Sylvester", + "citation:authorAffiliation": "WWF" + }, + "citation:datasetContact": { + "citation:datasetContactEmail": "dataverse@mailinator.com" + }, + "citation:depositor": "Admin, Dataverse", + "subject": "Other", + "title": "", + "dateOfDeposit": "2025-03-13", + "@id": "https://doi.org/10.5072/FK2/0CNXUJ", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "", + "schema:dateModified": "Thu Mar 13 14:56:36 UTC 2025", + "schema:datePublished": "2025-03-13", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." 
+ }, + "ore:aggregates": [], + "schema:hasPart": [] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/junk/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/junk/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..e487f075115 --- /dev/null +++ b/src/test/resources/croissant/junk/in/datasetSchemaDotOrg.json @@ -0,0 +1,52 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/0CNXUJ", + "identifier": "https://doi.org/10.5072/FK2/0CNXUJ", + "name": "", + "creator": [ + { + "@type": "Person", + "givenName": "Sylvester", + "familyName": "Ritter", + "affiliation": { + "@type": "Organization", + "name": "WWF" + }, + "name": "Ritter, Sylvester" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Sylvester", + "familyName": "Ritter", + "affiliation": { + "@type": "Organization", + "name": "WWF" + }, + "name": "Ritter, Sylvester" + } + ], + "datePublished": "2025-03-13", + "dateModified": "2025-03-13", + "version": "1", + "description": "A junk dataset.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + } +} diff --git a/src/test/resources/croissant/max/expected/max-croissant.json b/src/test/resources/croissant/max/expected/max-croissant.json new file mode 100644 
index 00000000000..bf1941c7289 --- /dev/null +++ b/src/test/resources/croissant/max/expected/max-croissant.json @@ -0,0 +1,196 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Max Schema.org", + "url": "https://doi.org/10.5072/FK2/VQTYHD", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "sameAs": "https://orcid.org/0000-0002-9528-9470", + "@id": "https://orcid.org/0000-0002-9528-9470", + "identifier": "https://orcid.org/0000-0002-9528-9470", + "name": "Durbin, Philip" + }, + { + "@type": "Person", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "name": "IQSS" + } + ], + 
"description": "Exercising fields used by `schema.org` exporter.", + "keywords": [ + "Social Sciences", + "Other", + "foo", + "bar" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2024-05-01", + "dateModified": "2025-05-21", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "3.0", + "citeAs": "@data{FK2/VQTYHD_2024,author = {Durbin, Philip and IQSS},publisher = {Root},title = {Max Schema.org},year = {2024},url = {https://doi.org/10.5072/FK2/VQTYHD}}", + "funder": [ + { + "@type": "Organization", + "name": "NSF" + }, + { + "@type": "Organization", + "name": "NIH" + } + ], + "spatialCoverage": [ + "Cambridge, MA, United States, Harvard Square" + ], + "citation": [ + { + "@type": "CreativeWork", + "name": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. 
https://doi.org/10.5281/zenodo.10843668", + "@id": "https://doi.org/10.5281/zenodo.10843668", + "identifier": "https://doi.org/10.5281/zenodo.10843668", + "url": "https://doi.org/10.5281/zenodo.10843668" + } + ], + "temporalCoverage": [ + "2023-01-01/2023-12-31" + ], + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "data.tsv", + "name": "data.tsv", + "encodingFormat": "text/tab-separated-values", + "md5": "3663d6a436ac00f5541a7336d6fa18c9", + "contentSize": "33", + "description": "", + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26646?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "ebf050ec8cce5df0a72b100cfc9f442f", + "contentSize": "34", + "description": "Additional documentation.", + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26148" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "field": [ + { + "@type": "cr:Field", + "name": "foo", + "description": "foo", + "dataType": "sc:Text", + "source": { + "@id": "1287", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + "column": "foo" + } + } + }, + { + "@type": "cr:Field", + "name": "bar", + "description": "bar", + "dataType": "sc:Integer", + "source": { + "@id": "1285", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + "column": "bar" + } + } + }, + { + "@type": "cr:Field", + "name": "baz", + "description": "baz", + "dataType": "sc:Integer", + "source": { + "@id": "1286", + "fileObject": { + "@id": "data.tsv" + }, + "extract": { + "column": "baz" + } + } + } + ] + } + ] +} \ No newline at end of file diff --git a/src/test/resources/croissant/max/in/dataCiteXml.xml b/src/test/resources/croissant/max/in/dataCiteXml.xml new file mode 100644 index 00000000000..e91c0583b71 --- /dev/null +++ b/src/test/resources/croissant/max/in/dataCiteXml.xml @@ -0,0 +1,77 @@ + + + 10.5072/FK2/VQTYHD + + + Durbin, Philip + Philip + Durbin + 
https://orcid.org/0000-0002-9528-9470 + Harvard University + + + IQSS + Harvard University + + + + Max Schema.org + + Root + 2024 + + Social Sciences + Other + foo + bar + + + + Durbin, Philip + Philip + Durbin + + + + 2024-05-01 + 2024-05-01 + 2025-05-21 + 2023-01-01/2023-12-31 + + + + 10.5281/ZENODO.10843668 + + + 34 + 21865 + 27 + + + text/markdown + text/tab-separated-values + text/tab-separated-values + + 3.0 + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + Exercising fields used by `schema.org` exporter. + + + + United States, MA,, Cambridge,, Harvard Square, + + + + + NSF + + + NIH + 3OT2DB000004-01S3 + + + diff --git a/src/test/resources/croissant/max/in/datasetFileDetails.json b/src/test/resources/croissant/max/in/datasetFileDetails.json new file mode 100644 index 00000000000..35881e3eae1 --- /dev/null +++ b/src/test/resources/croissant/max/in/datasetFileDetails.json @@ -0,0 +1,117 @@ +[ + { + "id": 26646, + "persistentId": "", + "filename": "data.tab", + "contentType": "text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 27, + "storageIdentifier": "s3://beta-dataverse-direct:196f44cc758-4d710ffac5d8", + "originalFileFormat": "text/tsv", + "originalFormatLabel": "Tab-Separated Values", + "originalFileSize": 33, + "originalFileName": "data.tsv", + "UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "rootDataFileId": -1, + "md5": "3663d6a436ac00f5541a7336d6fa18c9", + "checksum": { + "type": "MD5", + "value": "3663d6a436ac00f5541a7336d6fa18c9" + }, + "tabularData": true, + "creationDate": "2025-05-21", + "publicationDate": "2025-05-21", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 32509, + "dataTables": [ + { + "varQuantity": 3, + "caseQuantity": 3, + "UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "dataVariables": [ + { + "id": 1287, + "name": "foo", + "label": "foo", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "CHARACTER", + "isOrderedCategorical": false, + 
"fileOrder": 0, + "UNF": "UNF:6:FWBO/a1GcxDnM3fNLdzrHw==", + "variableMetadata": [] + }, + { + "id": 1285, + "name": "bar", + "label": "bar", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 1, + "UNF": "UNF:6:AvELPR5QTaBbnq6S22Msow==", + "variableMetadata": [], + "summaryStatistics": { + "mode": ".", + "invd": "0.0", + "min": "1.0", + "stdev": "1.0", + "max": "3.0", + "vald": "3.0", + "mean": "2.0", + "medn": "2.0" + } + }, + { + "id": 1286, + "name": "baz", + "label": "baz", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 2, + "UNF": "UNF:6:WkRUZjFbozW1nFYiqMGWeQ==", + "variableMetadata": [], + "summaryStatistics": { + "mean": "20.0", + "mode": ".", + "min": "10.0", + "max": "30.0", + "invd": "0.0", + "stdev": "10.0", + "vald": "3.0", + "medn": "20.0" + } + } + ] + } + ], + "varGroups": [] + }, + { + "id": 26148, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 34, + "description": "Additional documentation.", + "storageIdentifier": "s3://beta-dataverse-direct:18f35bee76a-f45ece0b0fcc", + "rootDataFileId": -1, + "md5": "ebf050ec8cce5df0a72b100cfc9f442f", + "checksum": { + "type": "MD5", + "value": "ebf050ec8cce5df0a72b100cfc9f442f" + }, + "tabularData": false, + "creationDate": "2024-05-01", + "publicationDate": "2024-05-01", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 32511, + "varGroups": [] + } +] diff --git a/src/test/resources/croissant/max/in/datasetJson.json b/src/test/resources/croissant/max/in/datasetJson.json new file mode 100644 index 00000000000..a0ddaa54436 --- /dev/null +++ b/src/test/resources/croissant/max/in/datasetJson.json @@ -0,0 +1,376 @@ +{ + "id": 26147, + "identifier": "FK2/VQTYHD", + "persistentUrl": "https://doi.org/10.5072/FK2/VQTYHD", + 
"protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2024-05-01", + "storageIdentifier": "s3://10.5072/FK2/VQTYHD", + "datasetType": "dataset", + "datasetVersion": { + "id": 266, + "datasetId": 26147, + "datasetPersistentId": "doi:10.5072/FK2/VQTYHD", + "storageIdentifier": "s3://10.5072/FK2/VQTYHD", + "versionNumber": 3, + "internalVersionNumber": 7, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "lastUpdateTime": "2025-05-21T19:25:29Z", + "releaseTime": "2025-05-21T19:25:29Z", + "createTime": "2025-05-21T19:23:21Z", + "publicationDate": "2024-05-01", + "citationDate": "2024-05-01", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Max Schema.org" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard University" + }, + "authorIdentifierScheme": { + "typeName": "authorIdentifierScheme", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "ORCID" + }, + "authorIdentifier": { + "typeName": "authorIdentifier", + "multiple": false, + "typeClass": "primitive", + "value": "0000-0002-9528-9470" + } + }, + { + "authorName": { + 
"typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "IQSS" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard University" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": "datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "philip_durbin@harvard.edu" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "Exercising fields used by `schema.org` exporter." + } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Social Sciences", + "Other" + ] + }, + { + "typeName": "keyword", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "keywordValue": { + "typeName": "keywordValue", + "multiple": false, + "typeClass": "primitive", + "value": "foo" + } + }, + { + "keywordValue": { + "typeName": "keywordValue", + "multiple": false, + "typeClass": "primitive", + "value": "bar" + } + } + ] + }, + { + "typeName": "publication", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "publicationCitation": { + "typeName": "publicationCitation", + "multiple": false, + "typeClass": "primitive", + "value": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. 
https://doi.org/10.5281/zenodo.10843668" + }, + "publicationIDType": { + "typeName": "publicationIDType", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "doi" + }, + "publicationIDNumber": { + "typeName": "publicationIDNumber", + "multiple": false, + "typeClass": "primitive", + "value": "10.5281/zenodo.10843668" + }, + "publicationURL": { + "typeName": "publicationURL", + "multiple": false, + "typeClass": "primitive", + "value": "https://doi.org/10.5281/zenodo.10843668" + } + } + ] + }, + { + "typeName": "contributor", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "contributorType": { + "typeName": "contributorType", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "Funder" + }, + "contributorName": { + "typeName": "contributorName", + "multiple": false, + "typeClass": "primitive", + "value": "NSF" + } + } + ] + }, + { + "typeName": "grantNumber", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "grantNumberAgency": { + "typeName": "grantNumberAgency", + "multiple": false, + "typeClass": "primitive", + "value": "NIH" + }, + "grantNumberValue": { + "typeName": "grantNumberValue", + "multiple": false, + "typeClass": "primitive", + "value": "3OT2DB000004-01S3" + } + } + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2024-05-01" + }, + { + "typeName": "timePeriodCovered", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "timePeriodCoveredStart": { + "typeName": "timePeriodCoveredStart", + "multiple": false, + "typeClass": "primitive", + "value": "2023-01-01" + }, + "timePeriodCoveredEnd": { + "typeName": "timePeriodCoveredEnd", + "multiple": false, + "typeClass": "primitive", + "value": "2023-12-31" + } + } + ] + } + ] + }, + "geospatial": { + "displayName": "Geospatial Metadata", + "name": "geospatial", 
+ "fields": [ + { + "typeName": "geographicCoverage", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "country": { + "typeName": "country", + "multiple": false, + "typeClass": "controlledVocabulary", + "value": "United States" + }, + "state": { + "typeName": "state", + "multiple": false, + "typeClass": "primitive", + "value": "MA" + }, + "city": { + "typeName": "city", + "multiple": false, + "typeClass": "primitive", + "value": "Cambridge" + }, + "otherGeographicCoverage": { + "typeName": "otherGeographicCoverage", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard Square" + } + } + ] + } + ] + } + }, + "files": [ + { + "description": "Additional documentation.", + "label": "README.md", + "restricted": false, + "directoryLabel": "doc", + "version": 1, + "datasetVersionId": 266, + "dataFile": { + "id": 26148, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 34, + "description": "Additional documentation.", + "storageIdentifier": "s3://beta-dataverse-direct:18f35bee76a-f45ece0b0fcc", + "rootDataFileId": -1, + "md5": "ebf050ec8cce5df0a72b100cfc9f442f", + "checksum": { + "type": "MD5", + "value": "ebf050ec8cce5df0a72b100cfc9f442f" + }, + "tabularData": false, + "creationDate": "2024-05-01", + "publicationDate": "2024-05-01", + "fileAccessRequest": true + } + }, + { + "label": "data.tab", + "restricted": false, + "version": 3, + "datasetVersionId": 266, + "dataFile": { + "id": 26646, + "persistentId": "", + "filename": "data.tab", + "contentType": "text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 27, + "storageIdentifier": "s3://beta-dataverse-direct:196f44cc758-4d710ffac5d8", + "originalFileFormat": "text/tsv", + "originalFormatLabel": "Tab-Separated Values", + "originalFileSize": 33, + "originalFileName": "data.tsv", + "UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "rootDataFileId": -1, + "md5": 
"3663d6a436ac00f5541a7336d6fa18c9", + "checksum": { + "type": "MD5", + "value": "3663d6a436ac00f5541a7336d6fa18c9" + }, + "tabularData": true, + "creationDate": "2025-05-21", + "publicationDate": "2025-05-21", + "fileAccessRequest": true + } + } + ], + "citation": "Durbin, Philip; IQSS, 2024, \"Max Schema.org\", https://doi.org/10.5072/FK2/VQTYHD, Root, V3, UNF:6:ngOUmEnfm08jahzBYqStQA== [fileUNF]" + } +} diff --git a/src/test/resources/croissant/max/in/datasetORE.json b/src/test/resources/croissant/max/in/datasetORE.json new file mode 100644 index 00000000000..2c3cce7ab6a --- /dev/null +++ b/src/test/resources/croissant/max/in/datasetORE.json @@ -0,0 +1,163 @@ +{ + "dcterms:modified": "2025-05-21", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.6 build develop-c4379a0", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "https://beta.dataverse.org/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/VQTYHD", + "ore:describes": { + "author": [ + { + "citation:authorName": "Durbin, Philip", + "citation:authorAffiliation": "Harvard University", + "authorIdentifierScheme": "ORCID", + "authorIdentifier": "0000-0002-9528-9470" + }, + { + "citation:authorName": "IQSS", + "citation:authorAffiliation": "Harvard University" + } + ], + "citation:keyword": [ + { + "citation:keywordValue": "foo" + }, + { + "citation:keywordValue": "bar" + } + ], + "timePeriodCovered": { + "citation:timePeriodCoveredStart": "2023-01-01", + "citation:timePeriodCoveredEnd": "2023-12-31" + }, + "contributor": { + "citation:contributorType": "Funder", + "citation:contributorName": "NSF" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "Exercising fields used by `schema.org` exporter." 
+ }, + "publication": { + "publicationCitation": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. https://doi.org/10.5281/zenodo.10843668", + "publicationIDType": "doi", + "publicationIDNumber": "10.5281/zenodo.10843668", + "publicationURL": "https://doi.org/10.5281/zenodo.10843668" + }, + "grantNumber": { + "citation:grantNumberAgency": "NIH", + "citation:grantNumberValue": "3OT2DB000004-01S3" + }, + "geospatial:geographicCoverage": { + "geospatial:country": "United States", + "geospatial:state": "MA", + "geospatial:city": "Cambridge", + "geospatial:otherGeographicCoverage": "Harvard Square" + }, + "citation:datasetContact": { + "citation:datasetContactName": "Durbin, Philip", + "citation:datasetContactEmail": "philip_durbin@harvard.edu" + }, + "dateOfDeposit": "2024-05-01", + "subject": [ + "Social Sciences", + "Other" + ], + "citation:depositor": "Durbin, Philip", + "title": "Max Schema.org", + "@id": "https://doi.org/10.5072/FK2/VQTYHD", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "3.0", + "schema:name": "Max Schema.org", + "schema:dateModified": "2025-05-21 19:25:29.653", + "schema:datePublished": "2024-05-01", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Philip Durbin Dataverse", + "@id": "https://beta.dataverse.org/dataverse/pdurbin", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "https://beta.dataverse.org/dataverse/root", + "schema:description": "The root dataverse." 
+ } + }, + "ore:aggregates": [ + { + "schema:description": "Additional documentation.", + "schema:name": "README.md", + "dvcore:restricted": false, + "dvcore:directoryLabel": "doc", + "schema:version": 1, + "dvcore:datasetVersionId": 266, + "@id": "https://beta.dataverse.org/file.xhtml?fileId=26148", + "schema:sameAs": "https://beta.dataverse.org/api/access/datafile/26148", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/markdown", + "dvcore:filesize": 34, + "dvcore:storageIdentifier": "s3://beta-dataverse-direct:18f35bee76a-f45ece0b0fcc", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "ebf050ec8cce5df0a72b100cfc9f442f" + } + }, + { + "schema:name": "data.tsv", + "dvcore:restricted": false, + "schema:version": 3, + "dvcore:datasetVersionId": 266, + "@id": "https://beta.dataverse.org/file.xhtml?fileId=26646", + "schema:sameAs": "https://beta.dataverse.org/api/access/datafile/26646?format=original", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/tsv", + "dvcore:filesize": 33, + "dvcore:storageIdentifier": "s3://beta-dataverse-direct:196f44cc758-4d710ffac5d8", + "dvcore:currentIngestedName": "data.tab", + "dvcore:UNF": "UNF:6:ngOUmEnfm08jahzBYqStQA==", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "3663d6a436ac00f5541a7336d6fa18c9" + } + } + ], + "schema:hasPart": [ + "https://beta.dataverse.org/file.xhtml?fileId=26148", + "https://beta.dataverse.org/file.xhtml?fileId=26646" + ] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "authorIdentifier": "http://purl.org/spar/datacite/AgentIdentifier", + "authorIdentifierScheme": "http://purl.org/spar/datacite/AgentIdentifierScheme", + "citation": "https://dataverse.org/schema/citation/", + "contributor": "http://purl.org/dc/terms/contributor", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", 
+ "geospatial": "https://beta.dataverse.org/schema/geospatial#", + "grantNumber": "https://schema.org/sponsor", + "ore": "http://www.openarchives.org/ore/terms/", + "publication": "http://purl.org/dc/terms/isReferencedBy", + "publicationCitation": "http://purl.org/dc/terms/bibliographicCitation", + "publicationIDNumber": "http://purl.org/spar/datacite/ResourceIdentifier", + "publicationIDType": "http://purl.org/spar/datacite/ResourceIdentifierScheme", + "publicationURL": "https://schema.org/distribution", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "timePeriodCovered": "https://schema.org/temporalCoverage", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/max/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/max/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..d3f764255e8 --- /dev/null +++ b/src/test/resources/croissant/max/in/datasetSchemaDotOrg.json @@ -0,0 +1,119 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/VQTYHD", + "identifier": "https://doi.org/10.5072/FK2/VQTYHD", + "name": "Max Schema.org", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "sameAs": "https://orcid.org/0000-0002-9528-9470", + "@id": "https://orcid.org/0000-0002-9528-9470", + "identifier": "https://orcid.org/0000-0002-9528-9470", + "name": "Durbin, Philip" + }, + { + "@type": "Person", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "name": "IQSS" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "sameAs": "https://orcid.org/0000-0002-9528-9470", + "@id": "https://orcid.org/0000-0002-9528-9470", + "identifier": 
"https://orcid.org/0000-0002-9528-9470", + "name": "Durbin, Philip" + }, + { + "@type": "Person", + "affiliation": { + "@type": "Organization", + "name": "Harvard University" + }, + "name": "IQSS" + } + ], + "datePublished": "2024-05-01", + "dateModified": "2025-05-21", + "version": "3", + "description": "Exercising fields used by `schema.org` exporter.", + "keywords": [ + "Social Sciences", + "Other", + "foo", + "bar" + ], + "citation": [ + { + "@type": "CreativeWork", + "name": "Tykhonov, V., & Durbin, P. (2024, March 20). Croissant ML standard in the context of Dataverse, EOSC and beyond. Zenodo. https://doi.org/10.5281/zenodo.10843668", + "@id": "https://doi.org/10.5281/zenodo.10843668", + "identifier": "https://doi.org/10.5281/zenodo.10843668", + "url": "https://doi.org/10.5281/zenodo.10843668" + } + ], + "temporalCoverage": [ + "2023-01-01/2023-12-31" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + }, + "funder": [ + { + "@type": "Organization", + "name": "NSF" + }, + { + "@type": "Organization", + "name": "NIH" + } + ], + "spatialCoverage": [ + "Cambridge, MA, United States, Harvard Square" + ], + "distribution": [ + { + "@type": "DataDownload", + "name": "data.tab", + "encodingFormat": "text/tab-separated-values", + "contentSize": 27, + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26646" + }, + { + "@type": "DataDownload", + "name": "README.md", + "encodingFormat": "text/markdown", + "contentSize": 34, + "description": "Additional documentation.", + "contentUrl": "https://beta.dataverse.org/api/access/datafile/26148" + } + ] +} diff --git a/src/test/resources/croissant/minimal/expected/minimal-croissant.json b/src/test/resources/croissant/minimal/expected/minimal-croissant.json 
new file mode 100644 index 00000000000..7c47afc1485 --- /dev/null +++ b/src/test/resources/croissant/minimal/expected/minimal-croissant.json @@ -0,0 +1,79 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Minimal", + "url": "https://doi.org/10.5072/FK2/4C0JYC", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "name": "Durbin, Philip" + } + ], + "description": "Minimal metadata and no files.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2024-05-01", + "dateModified": "2024-05-01", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": 
"Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/4C0JYC_2024,author = {Durbin, Philip},publisher = {Root},title = {Minimal},year = {2024},url = {https://doi.org/10.5072/FK2/4C0JYC}}" +} \ No newline at end of file diff --git a/src/test/resources/croissant/minimal/in/dataCiteXml.xml b/src/test/resources/croissant/minimal/in/dataCiteXml.xml new file mode 100644 index 00000000000..14feafba53d --- /dev/null +++ b/src/test/resources/croissant/minimal/in/dataCiteXml.xml @@ -0,0 +1,17 @@ + + + 10.5072/FK2/4C0JYC + Durbin, Philip + + Minimal + + Root + 2024 + + + Minimal metadata and no files. + + + diff --git a/src/test/resources/croissant/minimal/in/datasetFileDetails.json b/src/test/resources/croissant/minimal/in/datasetFileDetails.json new file mode 100644 index 00000000000..fe51488c706 --- /dev/null +++ b/src/test/resources/croissant/minimal/in/datasetFileDetails.json @@ -0,0 +1 @@ +[] diff --git a/src/test/resources/croissant/minimal/in/datasetJson.json b/src/test/resources/croissant/minimal/in/datasetJson.json new file mode 100644 index 00000000000..cedd4723dd5 --- /dev/null +++ b/src/test/resources/croissant/minimal/in/datasetJson.json @@ -0,0 +1,100 @@ +{ + "id": 26146, + "identifier": "FK2/4C0JYC", + "persistentUrl": "https://doi.org/10.5072/FK2/4C0JYC", + "protocol": "doi", + "authority": "10.5072", + "publisher": "Root", + "publicationDate": "2024-05-01", + "storageIdentifier": "s3://10.5072/FK2/4C0JYC", + "datasetVersion": { + "id": 108, + "datasetId": 26146, + "datasetPersistentId": "doi:10.5072/FK2/4C0JYC", + "storageIdentifier": "s3://10.5072/FK2/4C0JYC", + "versionNumber": 1, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "lastUpdateTime": "2024-05-01T14:27:17Z", + "releaseTime": "2024-05-01T14:27:17Z", + "createTime": "2024-05-01T14:26:54Z", + "publicationDate": "2024-05-01", + "citationDate": "2024-05-01", + "license": { + "name": "CC0 1.0", + "uri": 
"http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Minimal" + }, + { + "typeName": "author", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "philip_durbin@harvard.edu" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "Minimal metadata and no files." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + } + ] + } + }, + "files": [], + "citation": "Durbin, Philip, 2024, \"Minimal\", https://doi.org/10.5072/FK2/4C0JYC, Root, V1" + } +} diff --git a/src/test/resources/croissant/minimal/in/datasetORE.json b/src/test/resources/croissant/minimal/in/datasetORE.json new file mode 100644 index 00000000000..a76ec9ea0ac --- /dev/null +++ b/src/test/resources/croissant/minimal/in/datasetORE.json @@ -0,0 +1,62 @@ +{ + "dcterms:modified": "2024-05-01", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.0", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.2 build develop-e615050", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "https://beta.dataverse.org/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/4C0JYC", + "ore:describes": { + "citation:dsDescription": { + "citation:dsDescriptionValue": "Minimal metadata and no files." 
+ }, + "author": { + "citation:authorName": "Durbin, Philip" + }, + "citation:datasetContact": { + "citation:datasetContactEmail": "philip_durbin@harvard.edu" + }, + "title": "Minimal", + "subject": "Other", + "@id": "https://doi.org/10.5072/FK2/4C0JYC", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "Minimal", + "schema:dateModified": "2024-05-01 14:27:17.719", + "schema:datePublished": "2024-05-01", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Philip Durbin Dataverse", + "@id": "https://beta.dataverse.org/dataverse/pdurbin", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "https://beta.dataverse.org/dataverse/root", + "schema:description": "The root dataverse." + } + }, + "ore:aggregates": [], + "schema:hasPart": [] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/minimal/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/minimal/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..36dcab588a3 --- /dev/null +++ b/src/test/resources/croissant/minimal/in/datasetSchemaDotOrg.json @@ -0,0 +1,44 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/4C0JYC", + "identifier": "https://doi.org/10.5072/FK2/4C0JYC", + "name": "Minimal", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + 
"name": "Durbin, Philip" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "name": "Durbin, Philip" + } + ], + "datePublished": "2024-05-01", + "dateModified": "2024-05-01", + "version": "1", + "description": "Minimal metadata and no files.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "https://beta.dataverse.org" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + } +} diff --git a/src/test/resources/croissant/restricted/expected/restricted-croissant.json b/src/test/resources/croissant/restricted/expected/restricted-croissant.json new file mode 100644 index 00000000000..19d970d1bbb --- /dev/null +++ b/src/test/resources/croissant/restricted/expected/restricted-croissant.json @@ -0,0 +1,115 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "rai": "http://mlcommons.org/croissant/RAI/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "examples": { + "@id": "cr:examples", + "@type": "@json" + }, + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "samplingRate": "cr:samplingRate", + "sc": 
"https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform", + "wd": "https://www.wikidata.org/wiki/" + }, + "@type": "sc:Dataset", + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "Cars", + "url": "https://doi.org/10.5072/FK2/CY7BWA", + "creator": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "version": "1.0", + "citeAs": "@data{FK2/CY7BWA_2025,author = {Durbin, Philip},publisher = {Root},title = {Cars},year = {2025},url = {https://doi.org/10.5072/FK2/CY7BWA}}", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "code/compute.py", + "name": "compute.py", + "encodingFormat": "text/x-python", + "md5": "d84985e94dde671f318076bd7a137f15", + "contentSize": "15", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "cr:FileObject", + "@id": "data/stata13-auto.dta", + "name": "stata13-auto.dta", + "encodingFormat": "application/x-stata-13", + "md5": "7b1201ce6b469796837a835377338c5a", + "contentSize": "6443", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/9?format=original" + }, + { + "@type": "cr:FileObject", + "@id": "doc/README.md", + "name": "README.md", + "encodingFormat": "text/markdown", + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "contentSize": "28", + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ] +} \ No newline at 
end of file diff --git a/src/test/resources/croissant/restricted/in/dataCiteXml.xml b/src/test/resources/croissant/restricted/in/dataCiteXml.xml new file mode 100644 index 00000000000..7c6c89385fd --- /dev/null +++ b/src/test/resources/croissant/restricted/in/dataCiteXml.xml @@ -0,0 +1,51 @@ + + + 10.5072/FK2/CY7BWA + + + Durbin, Philip + Philip + Durbin + Harvard + + + + Cars + + Root + 2025 + + Other + + + + Durbin, Philip + Philip + Durbin + Harvard + + + + 2024-03-13 + 2025-05-16 + + + + 15 + 28 + 4026 + + + text/x-python + text/markdown + text/tab-separated-values + + 1.0 + + + Creative Commons CC0 1.0 Universal Public Domain Dedication. + + + This dataset is about cars. + + diff --git a/src/test/resources/croissant/restricted/in/datasetFileDetails.json b/src/test/resources/croissant/restricted/in/datasetFileDetails.json new file mode 100644 index 00000000000..f2cdff072da --- /dev/null +++ b/src/test/resources/croissant/restricted/in/datasetFileDetails.json @@ -0,0 +1,355 @@ +[ + { + "id": 7, + "persistentId": "", + "filename": "compute.py", + "contentType": "text/x-python", + "friendlyType": "Python Source Code", + "filesize": 15, + "description": "", + "storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "rootDataFileId": -1, + "md5": "d84985e94dde671f318076bd7a137f15", + "checksum": { + "type": "MD5", + "value": "d84985e94dde671f318076bd7a137f15" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 1, + "varGroups": [] + }, + { + "id": 9, + "persistentId": "", + "filename": "stata13-auto.tab", + "contentType": "text/tab-separated-values", + "friendlyType": "Tab-Delimited", + "filesize": 4026, + "description": "", + "storageIdentifier": "local://196d9f15719-2270bfca2b48", + "originalFileFormat": "application/x-stata-13", + "originalFormatLabel": "Stata 13 Binary", + "originalFileSize": 6443, + "originalFileName": "stata13-auto.dta", + 
"UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "rootDataFileId": -1, + "md5": "7b1201ce6b469796837a835377338c5a", + "checksum": { + "type": "MD5", + "value": "7b1201ce6b469796837a835377338c5a" + }, + "tabularData": true, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": true, + "fileMetadataId": 3, + "dataTables": [ + { + "varQuantity": 12, + "caseQuantity": 74, + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "dataVariables": [ + { + "id": 2, + "name": "make", + "label": "Make and Model", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "CHARACTER", + "isOrderedCategorical": false, + "fileOrder": 0, + "UNF": "UNF:6:Oo4vwiL8ffhSECOcjsKk2g==", + "variableMetadata": [] + }, + { + "id": 5, + "name": "price", + "label": "Price", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 1, + "UNF": "UNF:6:rvfkkdA36AaCSqCQciybfA==", + "variableMetadata": [], + "summaryStatistics": { + "min": "3291.0", + "medn": "5006.5", + "mean": "6165.256756756757", + "max": "15906.0", + "vald": "74.0", + "mode": ".", + "stdev": "2949.4958847689186", + "invd": "0.0" + } + }, + { + "id": 3, + "name": "mpg", + "label": "Mileage (mpg)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 2, + "UNF": "UNF:6:vVr3w8CgeZq1KpDfJQudOg==", + "variableMetadata": [], + "summaryStatistics": { + "max": "41.0", + "vald": "74.0", + "medn": "20.0", + "min": "12.0", + "stdev": "5.785503209735141", + "mean": "21.2972972972973", + "invd": "0.0", + "mode": "." 
+ } + }, + { + "id": 12, + "name": "rep78", + "label": "Repair Record 1978", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 3, + "UNF": "UNF:6:gbFI98swTWNhAjCRyi2cdA==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "0.989932270109041", + "mode": ".", + "min": "1.0", + "max": "5.0", + "medn": "3.0", + "mean": "3.4057971014492754", + "vald": "69.0", + "invd": "5.0" + } + }, + { + "id": 1, + "name": "headroom", + "label": "Headroom (in.)", + "weighted": false, + "variableIntervalType": "contin", + "variableFormatType": "NUMERIC", + "format": "float", + "isOrderedCategorical": false, + "fileOrder": 4, + "UNF": "UNF:6:g4Pl3T0Oz2e/OKJ64WiTnA==", + "variableMetadata": [], + "summaryStatistics": { + "mean": "2.993243243243243", + "mode": ".", + "vald": "74.0", + "invd": "0.0", + "stdev": "0.845994766828771", + "min": "1.5", + "medn": "3.0", + "max": "5.0" + } + }, + { + "id": 7, + "name": "trunk", + "label": "Trunk space (cu. 
ft.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 5, + "UNF": "UNF:6:iab0POsE3By7dQfgX/TY4g==", + "variableMetadata": [], + "summaryStatistics": { + "vald": "74.0", + "mode": ".", + "mean": "13.756756756756756", + "max": "23.0", + "min": "5.0", + "medn": "14.0", + "invd": "0.0", + "stdev": "4.277404189173201" + } + }, + { + "id": 4, + "name": "weight", + "label": "Weight (lbs.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 6, + "UNF": "UNF:6:cdoTdfUNeYWHHFEBCDxg+w==", + "variableMetadata": [], + "summaryStatistics": { + "invd": "0.0", + "min": "1760.0", + "vald": "74.0", + "max": "4840.0", + "stdev": "777.1935671373664", + "mean": "3019.459459459459", + "mode": ".", + "medn": "3190.0" + } + }, + { + "id": 8, + "name": "length", + "label": "Length (in.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 7, + "UNF": "UNF:6:8z1rjwhqBN4meYIiKI4P1A==", + "variableMetadata": [], + "summaryStatistics": { + "mode": ".", + "vald": "74.0", + "invd": "0.0", + "stdev": "22.266339902021585", + "max": "233.0", + "medn": "192.5", + "mean": "187.93243243243245", + "min": "142.0" + } + }, + { + "id": 9, + "name": "turn", + "label": "Turn Circle (ft.) ", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 8, + "UNF": "UNF:6:QxhjrrNtVz4qA8RulQ2MuQ==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "4.399353727233908", + "vald": "74.0", + "max": "51.0", + "min": "31.0", + "medn": "40.0", + "mean": "39.648648648648646", + "invd": "0.0", + "mode": "." + } + }, + { + "id": 10, + "name": "displacement", + "label": "Displacement (cu. 
in.)", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 9, + "UNF": "UNF:6:ftk+RAQpTCT1/y6G/rLWfA==", + "variableMetadata": [], + "summaryStatistics": { + "stdev": "91.83721896440396", + "invd": "0.0", + "min": "79.0", + "medn": "196.0", + "mode": ".", + "vald": "74.0", + "mean": "197.2972972972973", + "max": "425.0" + } + }, + { + "id": 6, + "name": "gear_ratio", + "label": "Gear Ratio", + "weighted": false, + "variableIntervalType": "contin", + "variableFormatType": "NUMERIC", + "format": "float", + "isOrderedCategorical": false, + "fileOrder": 10, + "UNF": "UNF:6:qjnY/qbx26FTepoPqRZ6lw==", + "variableMetadata": [], + "summaryStatistics": { + "medn": "2.9550000429153442", + "stdev": "0.45628709670763035", + "mean": "3.0148648667979883", + "min": "2.190000057220459", + "max": "3.890000104904175", + "mode": ".", + "vald": "74.0", + "invd": "0.0" + } + }, + { + "id": 11, + "name": "foreign", + "label": "Car type", + "weighted": false, + "variableIntervalType": "discrete", + "variableFormatType": "NUMERIC", + "isOrderedCategorical": false, + "fileOrder": 11, + "UNF": "UNF:6:nbjzgh3wfAFqKpaoFnHalA==", + "variableMetadata": [], + "summaryStatistics": { + "max": "1.0", + "invd": "0.0", + "mode": ".", + "medn": "0.0", + "stdev": "0.46018845840901884", + "min": "0.0", + "mean": "0.2972972972972975", + "vald": "74.0" + }, + "variableCategories": [ + { + "label": "Domestic", + "value": "0", + "isMissing": false, + "frequency": 52.0 + }, + { + "label": "Foreign", + "value": "1", + "isMissing": false, + "frequency": 22.0 + } + ] + } + ] + } + ], + "varGroups": [] + }, + { + "id": 8, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 28, + "description": "", + "storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + "rootDataFileId": -1, + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "checksum": { + 
"type": "MD5", + "value": "a2e484d07ee5590cc32182dc2c6ccc83" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true, + "restricted": false, + "fileMetadataId": 2, + "varGroups": [] + } +] diff --git a/src/test/resources/croissant/restricted/in/datasetJson.json b/src/test/resources/croissant/restricted/in/datasetJson.json new file mode 100644 index 00000000000..3234579cddd --- /dev/null +++ b/src/test/resources/croissant/restricted/in/datasetJson.json @@ -0,0 +1,228 @@ +{ + "id": 6, + "identifier": "FK2/CY7BWA", + "persistentUrl": "https://doi.org/10.5072/FK2/CY7BWA", + "protocol": "doi", + "authority": "10.5072", + "separator": "/", + "publisher": "Root", + "publicationDate": "2025-05-16", + "storageIdentifier": "local://10.5072/FK2/CY7BWA", + "datasetType": "dataset", + "datasetVersion": { + "id": 3, + "datasetId": 6, + "datasetPersistentId": "doi:10.5072/FK2/CY7BWA", + "storageIdentifier": "local://10.5072/FK2/CY7BWA", + "versionNumber": 1, + "internalVersionNumber": 10, + "versionMinorNumber": 0, + "versionState": "RELEASED", + "latestVersionPublishingState": "RELEASED", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "lastUpdateTime": "2025-05-16T16:33:18Z", + "releaseTime": "2025-05-16T16:33:18Z", + "createTime": "2025-05-16T16:33:13Z", + "publicationDate": "2025-05-16", + "citationDate": "2025-05-16", + "license": { + "name": "CC0 1.0", + "uri": "http://creativecommons.org/publicdomain/zero/1.0", + "iconUri": "https://licensebuttons.net/p/zero/1.0/88x31.png", + "rightsIdentifier": "CC0-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" + }, + "fileAccessRequest": true, + "metadataBlocks": { + "citation": { + "displayName": "Citation Metadata", + "name": "citation", + "fields": [ + { + "typeName": "title", + "multiple": false, + "typeClass": "primitive", + "value": "Cars" + }, + { + "typeName": "author", + "multiple": true, + 
"typeClass": "compound", + "value": [ + { + "authorName": { + "typeName": "authorName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "authorAffiliation": { + "typeName": "authorAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard" + } + } + ] + }, + { + "typeName": "datasetContact", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "datasetContactName": { + "typeName": "datasetContactName", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + "datasetContactAffiliation": { + "typeName": "datasetContactAffiliation", + "multiple": false, + "typeClass": "primitive", + "value": "Harvard" + }, + "datasetContactEmail": { + "typeName": "datasetContactEmail", + "multiple": false, + "typeClass": "primitive", + "value": "dataverse@mailinator.com" + } + } + ] + }, + { + "typeName": "dsDescription", + "multiple": true, + "typeClass": "compound", + "value": [ + { + "dsDescriptionValue": { + "typeName": "dsDescriptionValue", + "multiple": false, + "typeClass": "primitive", + "value": "This dataset is about cars." 
+ } + } + ] + }, + { + "typeName": "subject", + "multiple": true, + "typeClass": "controlledVocabulary", + "value": [ + "Other" + ] + }, + { + "typeName": "depositor", + "multiple": false, + "typeClass": "primitive", + "value": "Durbin, Philip" + }, + { + "typeName": "dateOfDeposit", + "multiple": false, + "typeClass": "primitive", + "value": "2024-03-13" + } + ] + } + }, + "files": [ + { + "description": "", + "label": "compute.py", + "restricted": false, + "directoryLabel": "code", + "version": 2, + "datasetVersionId": 3, + "dataFile": { + "id": 7, + "persistentId": "", + "filename": "compute.py", + "contentType": "text/x-python", + "friendlyType": "Python Source Code", + "filesize": 15, + "description": "", + "storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "rootDataFileId": -1, + "md5": "d84985e94dde671f318076bd7a137f15", + "checksum": { + "type": "MD5", + "value": "d84985e94dde671f318076bd7a137f15" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + }, + { + "description": "", + "label": "README.md", + "restricted": false, + "directoryLabel": "doc", + "version": 2, + "datasetVersionId": 3, + "dataFile": { + "id": 8, + "persistentId": "", + "filename": "README.md", + "contentType": "text/markdown", + "friendlyType": "Markdown Text", + "filesize": 28, + "description": "", + "storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + "rootDataFileId": -1, + "md5": "a2e484d07ee5590cc32182dc2c6ccc83", + "checksum": { + "type": "MD5", + "value": "a2e484d07ee5590cc32182dc2c6ccc83" + }, + "tabularData": false, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + }, + { + "description": "", + "label": "stata13-auto.tab", + "restricted": true, + "directoryLabel": "data", + "version": 4, + "datasetVersionId": 3, + "dataFile": { + "id": 9, + "persistentId": "", + "filename": "stata13-auto.tab", + "contentType": "text/tab-separated-values", 
+ "friendlyType": "Tab-Delimited", + "filesize": 4026, + "description": "", + "storageIdentifier": "local://196d9f15719-2270bfca2b48", + "originalFileFormat": "application/x-stata-13", + "originalFormatLabel": "Stata 13 Binary", + "originalFileSize": 6443, + "originalFileName": "stata13-auto.dta", + "UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "rootDataFileId": -1, + "md5": "7b1201ce6b469796837a835377338c5a", + "checksum": { + "type": "MD5", + "value": "7b1201ce6b469796837a835377338c5a" + }, + "tabularData": true, + "creationDate": "2025-05-16", + "publicationDate": "2025-05-16", + "fileAccessRequest": true + } + } + ], + "citation": "Durbin, Philip, 2025, \"Cars\", https://doi.org/10.5072/FK2/CY7BWA, Root, V1, UNF:6:RPd9EWHSZwqUvRZuKTJMqg== [fileUNF]" + } +} diff --git a/src/test/resources/croissant/restricted/in/datasetORE.json b/src/test/resources/croissant/restricted/in/datasetORE.json new file mode 100644 index 00000000000..8e6c5b93507 --- /dev/null +++ b/src/test/resources/croissant/restricted/in/datasetORE.json @@ -0,0 +1,133 @@ +{ + "dcterms:modified": "2025-05-19", + "dcterms:creator": "Root", + "@type": "ore:ResourceMap", + "schema:additionalType": "Dataverse OREMap Format v1.0.1", + "dvcore:generatedBy": { + "@type": "schema:SoftwareApplication", + "schema:name": "Dataverse", + "schema:version": "6.6", + "schema:url": "https://github.com/iqss/dataverse" + }, + "@id": "http://localhost:8080/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.5072/FK2/CY7BWA", + "ore:describes": { + "citation:datasetContact": { + "citation:datasetContactName": "Durbin, Philip", + "citation:datasetContactAffiliation": "Harvard", + "citation:datasetContactEmail": "dataverse@mailinator.com" + }, + "author": { + "citation:authorName": "Durbin, Philip", + "citation:authorAffiliation": "Harvard" + }, + "citation:dsDescription": { + "citation:dsDescriptionValue": "This dataset is about cars." 
+ }, + "dateOfDeposit": "2024-03-13", + "title": "Cars", + "citation:depositor": "Durbin, Philip", + "subject": "Other", + "@id": "https://doi.org/10.5072/FK2/CY7BWA", + "@type": [ + "ore:Aggregation", + "schema:Dataset" + ], + "schema:version": "1.0", + "schema:name": "Cars", + "schema:dateModified": "Fri May 16 16:33:18 UTC 2025", + "schema:datePublished": "2025-05-16", + "schema:creativeWorkStatus": "RELEASED", + "schema:license": "http://creativecommons.org/publicdomain/zero/1.0", + "dvcore:fileTermsOfAccess": { + "dvcore:fileRequestAccess": true + }, + "schema:includedInDataCatalog": "Root", + "schema:isPartOf": { + "schema:name": "Cars", + "@id": "http://localhost:8080/dataverse/cars", + "schema:description": "Data about cars.", + "schema:isPartOf": { + "schema:name": "Root", + "@id": "http://localhost:8080/dataverse/root", + "schema:description": "The root dataverse." + } + }, + "ore:aggregates": [ + { + "schema:description": "", + "schema:name": "compute.py", + "dvcore:restricted": false, + "dvcore:directoryLabel": "code", + "schema:version": 2, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=7", + "schema:sameAs": "http://localhost:8080/api/access/datafile/7", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/x-python", + "dvcore:filesize": 15, + "dvcore:storageIdentifier": "local://196d9f154f7-8cadf34ee905", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "d84985e94dde671f318076bd7a137f15" + } + }, + { + "schema:description": "", + "schema:name": "README.md", + "dvcore:restricted": false, + "dvcore:directoryLabel": "doc", + "schema:version": 2, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=8", + "schema:sameAs": "http://localhost:8080/api/access/datafile/8", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "text/markdown", + "dvcore:filesize": 28, + "dvcore:storageIdentifier": "local://196d9f15664-1d4bb4e96a97", + 
"dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "a2e484d07ee5590cc32182dc2c6ccc83" + } + }, + { + "schema:description": "", + "schema:name": "stata13-auto.dta", + "dvcore:restricted": true, + "dvcore:directoryLabel": "data", + "schema:version": 4, + "dvcore:datasetVersionId": 3, + "@id": "http://localhost:8080/file.xhtml?fileId=9", + "schema:sameAs": "http://localhost:8080/api/access/datafile/9?format=original", + "@type": "ore:AggregatedResource", + "schema:fileFormat": "application/x-stata-13", + "dvcore:filesize": 6443, + "dvcore:storageIdentifier": "local://196d9f15719-2270bfca2b48", + "dvcore:currentIngestedName": "stata13-auto.tab", + "dvcore:UNF": "UNF:6:RPd9EWHSZwqUvRZuKTJMqg==", + "dvcore:rootDataFileId": -1, + "dvcore:checksum": { + "@type": "MD5", + "@value": "7b1201ce6b469796837a835377338c5a" + } + } + ], + "schema:hasPart": [ + "http://localhost:8080/file.xhtml?fileId=7", + "http://localhost:8080/file.xhtml?fileId=8", + "http://localhost:8080/file.xhtml?fileId=9" + ] + }, + "@context": { + "author": "http://purl.org/dc/terms/creator", + "citation": "https://dataverse.org/schema/citation/", + "dateOfDeposit": "http://purl.org/dc/terms/dateSubmitted", + "dcterms": "http://purl.org/dc/terms/", + "dvcore": "https://dataverse.org/schema/core#", + "ore": "http://www.openarchives.org/ore/terms/", + "schema": "http://schema.org/", + "subject": "http://purl.org/dc/terms/subject", + "title": "http://purl.org/dc/terms/title" + } +} diff --git a/src/test/resources/croissant/restricted/in/datasetSchemaDotOrg.json b/src/test/resources/croissant/restricted/in/datasetSchemaDotOrg.json new file mode 100644 index 00000000000..83f587c5fd7 --- /dev/null +++ b/src/test/resources/croissant/restricted/in/datasetSchemaDotOrg.json @@ -0,0 +1,78 @@ +{ + "@context": "http://schema.org", + "@type": "Dataset", + "@id": "https://doi.org/10.5072/FK2/CY7BWA", + "identifier": "https://doi.org/10.5072/FK2/CY7BWA", + "name": "Cars", + "creator": [ + { + 
"@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "author": [ + { + "@type": "Person", + "givenName": "Philip", + "familyName": "Durbin", + "affiliation": { + "@type": "Organization", + "name": "Harvard" + }, + "name": "Durbin, Philip" + } + ], + "datePublished": "2025-05-16", + "dateModified": "2025-05-16", + "version": "1", + "description": "This dataset is about cars.", + "keywords": [ + "Other" + ], + "license": "http://creativecommons.org/publicdomain/zero/1.0", + "includedInDataCatalog": { + "@type": "DataCatalog", + "name": "Root", + "url": "http://localhost:8080" + }, + "publisher": { + "@type": "Organization", + "name": "Root" + }, + "provider": { + "@type": "Organization", + "name": "Root" + }, + "distribution": [ + { + "@type": "DataDownload", + "name": "compute.py", + "encodingFormat": "text/x-python", + "contentSize": 15, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/7" + }, + { + "@type": "DataDownload", + "name": "stata13-auto.tab", + "encodingFormat": "text/tab-separated-values", + "contentSize": 4026, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/9" + }, + { + "@type": "DataDownload", + "name": "README.md", + "encodingFormat": "text/markdown", + "contentSize": 28, + "description": "", + "contentUrl": "http://localhost:8080/api/access/datafile/8" + } + ] +} diff --git a/src/test/resources/json/export-formats.json b/src/test/resources/json/export-formats.json index 65fc746ee23..ab8f64f9076 100644 --- a/src/test/resources/json/export-formats.json +++ b/src/test/resources/json/export-formats.json @@ -49,6 +49,12 @@ "XMLSchemaLocation": "https://ddialliance.org/Specification/DDI-Codebook/2.5/XMLSchema/codebook.xsd", "XMLSchemaVersion": "2.5" }, + "croissant": { + "displayName": "Croissant", + "mediaType": "application/json", + "isHarvestable": false, + 
"isVisibleInUserInterface": true + }, "dcterms": { "displayName": "Dublin Core", "mediaType": "application/xml",