diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index effd2742b5..8dec699e16 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -84,6 +84,8 @@ public class ClinVarIndexer extends ClinicalIndexer { private int numberGermlineRecords = 0; private int numberNoDiseaseTrait = 0; private int numberMultipleInheritanceModels = 0; + private static final String RCVIDS = "rcvIds"; + private static final String SCVIDS = "scvIds"; private static final Set DOMINANT_TERM_SET = new HashSet<>(Arrays.asList(ModeOfInheritance.monoallelic, ModeOfInheritance.monoallelic_maternally_imprinted, @@ -262,8 +264,11 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy clinicalHaplotypeString = StringUtils.join(normalisedVariantStringList, HAPLOTYPE_STRING_SEPARATOR); } + // get VCV ID + String vcvId = getVcvId(publicSet); + // parse RCVs - String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); + String rcvAccession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); String clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion() .getClinicalSignificance() .getDescription(); @@ -271,12 +276,14 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy .getReviewStatus().name(); List getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn(); addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString, - clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription, - reviewStatusName, getObservedIn); + clinicalHaplotypeString, traitsToEfoTermsMap, rcvAccession, clinicalSignficanceDescription, + reviewStatusName, getObservedIn, vcvId); + + List scvAccessions = new ArrayList<>(); // parse SCVs for (MeasureTraitType measureTraitType : publicSet.getClinVarAssertion()) { - accession = measureTraitType.getClinVarAccession().getAcc(); + String scvAccession = measureTraitType.getClinVarAccession().getAcc(); clinicalSignficanceDescription = StringUtils.join(measureTraitType.getClinicalSignificance().getDescription(), CLINICAL_SIGNIFICANCE_SEPARATOR); @@ -284,9 +291,16 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy reviewStatusName = getReviewStatusIfPresent(measureTraitType); getObservedIn = measureTraitType.getObservedIn(); addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString, - clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription, - reviewStatusName, getObservedIn); + clinicalHaplotypeString, traitsToEfoTermsMap, scvAccession, clinicalSignficanceDescription, + reviewStatusName, getObservedIn, vcvId); + scvAccessions.add(scvAccession); + } + + if (StringUtils.isNotEmpty(vcvId)) { + // add SCVs and RCVs to VCV entry + addAdditionalProperties(variantAnnotation, vcvId, rcvAccession, scvAccessions); } + rdb.put(normalisedVariantString.getBytes(), jsonObjectWriter.writeValueAsBytes(variantAnnotation)); } return true; @@ -295,6 +309,47 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy return false; } + + private String getVcvId(PublicSetType publicSet) { + if (publicSet.getReferenceClinVarAssertion() == null || publicSet.getReferenceClinVarAssertion().getMeasureSet() == null + || publicSet.getReferenceClinVarAssertion().getMeasureSet().getID() == null) { + return null; + } + return publicSet.getReferenceClinVarAssertion().getMeasureSet().getID().toString(); + } + + private void addAdditionalProperties(VariantAnnotation variantAnnotation, String vcvId, String rcvAccession, + List scvAccessions) { + List properties = getTraitAssociation(variantAnnotation, vcvId).getAdditionalProperties(); + boolean hasRCVIds = false; + boolean hasSCVIds = false; + for (Property property : properties) { + if (RCVIDS.equals(property.getName())) { + hasRCVIds = true; + property.setValue(property.getValue() + "," + rcvAccession); + } + if (SCVIDS.equals(property.getName())) { + hasSCVIds = true; + property.setValue(property.getValue() + "," + String.join(",", scvAccessions)); + } + } + if (!hasRCVIds) { + properties.add(new Property(null, RCVIDS, rcvAccession)); + } + if (!hasSCVIds) { + properties.add(new Property(null, SCVIDS, String.join(",", scvAccessions))); + } + } + + private EvidenceEntry getTraitAssociation(VariantAnnotation variantAnnotation, String vcvId) { + for (EvidenceEntry evidenceEntry: variantAnnotation.getTraitAssociation()) { + if (vcvId.equals(evidenceEntry.getId())) { + return evidenceEntry; + } + } + return null; + } + private String getReviewStatusIfPresent(MeasureTraitType measureTraitType) { if (measureTraitType.getClinicalSignificance().getReviewStatus() != null) { return measureTraitType.getClinicalSignificance().getReviewStatus().name(); @@ -384,11 +439,15 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu String mateVariantString, String clinicalHaplotypeString, Map traitsToEfoTermsMap, String accession, String clinicalSignficanceDescription, String reviewStatusName, - List getObservedIn) + List getObservedIn, String vcvId) throws JsonProcessingException { - List additionalProperties = new ArrayList<>(3); - EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, "2022.02", "2022-02"); + List additionalProperties = new ArrayList<>(); + if (StringUtils.isNotEmpty(vcvId)) { + additionalProperties.add(new Property(null, "vcvIds", vcvId)); + } + // TODO this needs to come from the config + EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, "2022.10", "2022-10"); // String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); VariantClassification variantClassification = getVariantClassification( @@ -403,7 +462,6 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu additionalProperties.add(new Property(null, GENOTYPESET, mateVariantString)); } - String vcvId= publicSet.getReferenceClinVarAssertion().getMeasureSet().getAcc(); if (StringUtils.isNotEmpty(vcvId)) { additionalProperties.add(new Property("VCV_ID", "VCV ID", vcvId)); } @@ -732,7 +790,7 @@ private Map> parseVariantSummary(Map rcvSet = new HashSet<>(Arrays.asList(parts[11].split(";"))); + Set rcvSet = new HashSet<>(Arrays.asList(parts[11].split("\\|"))); // Fill in rcvToAlleleLocationData map for (String rcv : rcvSet) { List alleleLocationDataList; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java index ff3d6d7640..8bdab27ee9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilder.java @@ -125,7 +125,7 @@ public void parse() throws IOException, RocksDBException { if (this.clinvarXMLFile != null && this.clinvarSummaryFile != null && this.clinvarVariationAlleleFile != null && Files.exists(clinvarXMLFile) && Files.exists(clinvarSummaryFile) && Files.exists(clinvarVariationAlleleFile)) { - ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile, + ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent(), clinvarSummaryFile, clinvarVariationAlleleFile, clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb); clinvarIndexer.index(); } else { diff --git a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java index aa4f8c7de0..0d8a9732d6 100644 --- a/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java +++ b/cellbase-lib/src/test/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalVariantBuilderTest.java @@ -22,7 +22,7 @@ import com.fasterxml.jackson.databind.ObjectReader; import com.mongodb.util.JSON; import org.hamcrest.CoreMatchers; -import org.junit.jupiter.api.Test; +import org.junit.Test; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.*; import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer; @@ -56,6 +56,70 @@ public ClinicalVariantBuilderTest() { jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL); } + private void initGrch38() throws Exception { + Path clinicalVariantFolder = Paths.get(getClass().getResource("/variant/annotation/clinicalVariant/grch38").toURI()); + org.apache.commons.io.FileUtils.copyDirectory(clinicalVariantFolder.toFile(), Paths.get("/tmp/clinicalVariant4").toFile()); + clinicalVariantFolder = Paths.get("/tmp/clinicalVariant4"); + + org.apache.commons.io.FileUtils.copyFile(Paths.get(getClass() + .getResource("/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz").toURI()).toFile(), + clinicalVariantFolder.resolve("Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz").toFile()); + org.apache.commons.io.FileUtils.copyFile(Paths.get(getClass() + .getResource("/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.fai").toURI()).toFile(), + clinicalVariantFolder.resolve("Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz.fai").toFile()); + org.apache.commons.io.FileUtils.copyFile(Paths.get(getClass() + .getResource("/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gzi").toURI()).toFile(), + clinicalVariantFolder.resolve("Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz.gzi").toFile()); + + Path genomeSequenceFilePath = clinicalVariantFolder.resolve("Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz"); + + CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANTS_DATA, true); + (new ClinicalVariantBuilder(clinicalVariantFolder, true, genomeSequenceFilePath, "GRCh38", serializer)).parse(); + } + + @Test + public void testUnexpectedAccession() throws Exception { + cleanUp(); + + initGrch38(); + + List parsedVariantList = loadSerializedVariants("/tmp/" + EtlCommons.CLINICAL_VARIANTS_JSON_FILE); + assertEquals(6, parsedVariantList.size()); + + List variantList = getVariantByAccession(parsedVariantList, "209047"); + assertEquals(1, variantList.size()); + Variant variant = variantList.get(0); + assertEquals("7", variant.getChromosome()); + assertEquals(Integer.valueOf(117530975), variant.getStart()); + assertEquals("G", variant.getReference()); + assertEquals("A", variant.getAlternate()); + + // variant should have list of SCVs and RCVs and VCVs + EvidenceEntry evidenceEntry = getEvidenceEntryByAccession(variant, "RCV000007529"); + System.out.println(evidenceEntry); + assertEquals(5, evidenceEntry.getAdditionalProperties().size()); + assertEquals("7109", getValueByName(evidenceEntry, "vcvIds")); + + evidenceEntry = getEvidenceEntryByAccession(variant, "SCV000053488"); + assertEquals(5, evidenceEntry.getAdditionalProperties().size()); + assertEquals("7109", getValueByName(evidenceEntry, "vcvIds")); + + evidenceEntry = getEvidenceEntryByAccession(variant, "7109"); + assertEquals(4, evidenceEntry.getAdditionalProperties().size()); + assertEquals("RCV000007529", getValueByName(evidenceEntry, "rcvIds")); + assertEquals("SCV000053488", getValueByName(evidenceEntry, "scvIds")); + } + + private String getValueByName(EvidenceEntry evidenceEntry, String name) { + for (Property property : evidenceEntry.getAdditionalProperties()) { + if (property.getName().equals(name)) { + return property.getValue(); + } + } + return null; + } + + @Test public void noNormaliseTest() throws Exception { // Remove all previous clinical variant temporary test data @@ -799,10 +863,10 @@ private List loadSerializedVariants(String fileName) { // } // } - @Test - public void testVariant() { - Variant v = new Variant("1", 2000, 2100, "A", ""); - System.out.println(v.toStringSimple()); - } +// @Test +// public void testVariant() { +// Variant v = new Variant("1", 2000, 2100, "A", ""); +// System.out.println(v.toStringSimple()); +// } } diff --git a/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.fai b/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.fai new file mode 100644 index 0000000000..d49e170f56 --- /dev/null +++ b/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.fai @@ -0,0 +1 @@ +13 114364328 58 60 61 diff --git a/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz b/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz new file mode 100644 index 0000000000..e9c27b857d Binary files /dev/null and b/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz differ diff --git a/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gzi b/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gzi new file mode 100644 index 0000000000..8663f7de5d --- /dev/null +++ b/cellbase-lib/src/test/resources/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gzi @@ -0,0 +1 @@ +ÿÿÿÿÿÿÿÿ \ No newline at end of file diff --git a/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/ClinVarFullRelease_2022-02.xml.gz b/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/ClinVarFullRelease_2022-02.xml.gz new file mode 100644 index 0000000000..46b528a600 Binary files /dev/null and b/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/ClinVarFullRelease_2022-02.xml.gz differ diff --git a/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/variant_summary.txt.gz b/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/variant_summary.txt.gz new file mode 100644 index 0000000000..fb061db43c Binary files /dev/null and b/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/variant_summary.txt.gz differ diff --git a/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/variation_allele.txt.gz b/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/variation_allele.txt.gz new file mode 100644 index 0000000000..e583f81738 Binary files /dev/null and b/cellbase-lib/src/test/resources/variant/annotation/clinicalVariant/grch38/variation_allele.txt.gz differ