Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ public class ClinVarIndexer extends ClinicalIndexer {
private int numberGermlineRecords = 0;
private int numberNoDiseaseTrait = 0;
private int numberMultipleInheritanceModels = 0;
private static final String RCVIDS = "rcvIds";
private static final String SCVIDS = "scvIds";
private static final Set<ModeOfInheritance> DOMINANT_TERM_SET
= new HashSet<>(Arrays.asList(ModeOfInheritance.monoallelic,
ModeOfInheritance.monoallelic_maternally_imprinted,
Expand Down Expand Up @@ -262,31 +264,43 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy
clinicalHaplotypeString = StringUtils.join(normalisedVariantStringList, HAPLOTYPE_STRING_SEPARATOR);
}

// get VCV ID
String vcvId = getVcvId(publicSet);

// parse RCVs
String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
String rcvAccession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
String clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion()
.getClinicalSignificance()
.getDescription();
String reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance()
.getReviewStatus().name();
List<ObservationSet> getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn();
addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString,
clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription,
reviewStatusName, getObservedIn);
clinicalHaplotypeString, traitsToEfoTermsMap, rcvAccession, clinicalSignficanceDescription,
reviewStatusName, getObservedIn, vcvId);

List<String> scvAccessions = new ArrayList<>();

// parse SCVs
for (MeasureTraitType measureTraitType : publicSet.getClinVarAssertion()) {
accession = measureTraitType.getClinVarAccession().getAcc();
String scvAccession = measureTraitType.getClinVarAccession().getAcc();
clinicalSignficanceDescription
= StringUtils.join(measureTraitType.getClinicalSignificance().getDescription(),
CLINICAL_SIGNIFICANCE_SEPARATOR);

reviewStatusName = getReviewStatusIfPresent(measureTraitType);
getObservedIn = measureTraitType.getObservedIn();
addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString,
clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription,
reviewStatusName, getObservedIn);
clinicalHaplotypeString, traitsToEfoTermsMap, scvAccession, clinicalSignficanceDescription,
reviewStatusName, getObservedIn, vcvId);
scvAccessions.add(scvAccession);
}

if (StringUtils.isNotEmpty(vcvId)) {
// add SCVs and RCVs to VCV entry
addAdditionalProperties(variantAnnotation, vcvId, rcvAccession, scvAccessions);
}

rdb.put(normalisedVariantString.getBytes(), jsonObjectWriter.writeValueAsBytes(variantAnnotation));
}
return true;
Expand All @@ -295,6 +309,47 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy
return false;
}


/**
 * Reads the ID of the measure set attached to the reference ClinVar assertion of a public set.
 *
 * @param publicSet ClinVar public set to inspect; must not be {@code null}
 * @return the measure set ID converted with {@code toString()}, or {@code null} when the reference assertion,
 *         its measure set, or the measure set ID is absent
 */
private String getVcvId(PublicSetType publicSet) {
    // Every link of the chain must be present before the ID can be read.
    if (publicSet.getReferenceClinVarAssertion() != null
            && publicSet.getReferenceClinVarAssertion().getMeasureSet() != null
            && publicSet.getReferenceClinVarAssertion().getMeasureSet().getID() != null) {
        return publicSet.getReferenceClinVarAssertion().getMeasureSet().getID().toString();
    }
    return null;
}

/**
 * Attaches the RCV accession and the SCV accessions to the evidence entry whose id equals {@code vcvId}.
 * <p>
 * If the entry already has a property named {@code rcvIds} / {@code scvIds}, the new accessions are appended
 * to its comma-separated value; otherwise the property is created. When no evidence entry with that id exists,
 * or the entry has no additional-properties list, the method returns without changing anything.
 *
 * @param variantAnnotation annotation whose trait-association entries are searched
 * @param vcvId             id of the evidence entry to decorate
 * @param rcvAccession      RCV accession to record under {@code rcvIds}
 * @param scvAccessions     SCV accessions to record, comma-joined, under {@code scvIds}
 */
private void addAdditionalProperties(VariantAnnotation variantAnnotation, String vcvId, String rcvAccession,
                                     List<String> scvAccessions) {
    // getTraitAssociation returns null when no evidence entry carries vcvId as its id; dereferencing that
    // result unconditionally would throw a NullPointerException.
    if (getTraitAssociation(variantAnnotation, vcvId) == null) {
        return;
    }
    List<Property> properties = getTraitAssociation(variantAnnotation, vcvId).getAdditionalProperties();
    if (properties == null) {
        return;
    }

    String joinedScvAccessions = String.join(",", scvAccessions);
    boolean hasRCVIds = false;
    boolean hasSCVIds = false;
    for (Property property : properties) {
        if (RCVIDS.equals(property.getName())) {
            hasRCVIds = true;
            property.setValue(property.getValue() + "," + rcvAccession);
        } else if (SCVIDS.equals(property.getName())) {
            hasSCVIds = true;
            // Skip the append when there is nothing to add, so the stored value never ends in a dangling comma.
            if (!joinedScvAccessions.isEmpty()) {
                property.setValue(property.getValue() + "," + joinedScvAccessions);
            }
        }
    }
    if (!hasRCVIds) {
        properties.add(new Property(null, RCVIDS, rcvAccession));
    }
    if (!hasSCVIds) {
        properties.add(new Property(null, SCVIDS, joinedScvAccessions));
    }
}

/**
 * Looks up the trait-association evidence entry whose id equals the given value.
 *
 * @param variantAnnotation annotation whose trait-association list is scanned in order
 * @param vcvId             id to match against each entry's id
 * @return the first entry with a matching id, or {@code null} when none matches
 */
private EvidenceEntry getTraitAssociation(VariantAnnotation variantAnnotation, String vcvId) {
    return variantAnnotation.getTraitAssociation().stream()
            .filter(candidate -> vcvId.equals(candidate.getId()))
            .findFirst()
            .orElse(null);
}

private String getReviewStatusIfPresent(MeasureTraitType measureTraitType) {
if (measureTraitType.getClinicalSignificance().getReviewStatus() != null) {
return measureTraitType.getClinicalSignificance().getReviewStatus().name();
Expand Down Expand Up @@ -384,11 +439,15 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu
String mateVariantString, String clinicalHaplotypeString,
Map<String, EFO> traitsToEfoTermsMap, String accession,
String clinicalSignficanceDescription, String reviewStatusName,
List<ObservationSet> getObservedIn)
List<ObservationSet> getObservedIn, String vcvId)
throws JsonProcessingException {

List<Property> additionalProperties = new ArrayList<>(3);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, "2022.02", "2022-02");
List<Property> additionalProperties = new ArrayList<>();
if (StringUtils.isNotEmpty(vcvId)) {
additionalProperties.add(new Property(null, "vcvIds", vcvId));
}
// TODO this needs to come from the config
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, "2022.10", "2022-10");
// String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();

VariantClassification variantClassification = getVariantClassification(
Expand All @@ -403,7 +462,6 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu
additionalProperties.add(new Property(null, GENOTYPESET, mateVariantString));
}

String vcvId= publicSet.getReferenceClinVarAssertion().getMeasureSet().getAcc();
if (StringUtils.isNotEmpty(vcvId)) {
additionalProperties.add(new Property("VCV_ID", "VCV ID", vcvId));
}
Expand Down Expand Up @@ -732,7 +790,7 @@ private Map<String, List<AlleleLocationData>> parseVariantSummary(Map<String, EF

// Each line may contain more than one RCV; the ids in this column are split on '|' below,
// e.g.: RCV000000019|RCV000000020|RCV000000021|RCV000000022|...
// Also, RCV ids may be repeated in the same line!!! e.g. RCV000540418|RCV000540418|RCV000540418|RCV000000066
Set<String> rcvSet = new HashSet<>(Arrays.asList(parts[11].split(";")));
Set<String> rcvSet = new HashSet<>(Arrays.asList(parts[11].split("\\|")));
// Fill in rcvToAlleleLocationData map
for (String rcv : rcvSet) {
List<AlleleLocationData> alleleLocationDataList;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ public void parse() throws IOException, RocksDBException {
if (this.clinvarXMLFile != null && this.clinvarSummaryFile != null
&& this.clinvarVariationAlleleFile != null && Files.exists(clinvarXMLFile)
&& Files.exists(clinvarSummaryFile) && Files.exists(clinvarVariationAlleleFile)) {
ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent().resolve("clinvar_chunks"), clinvarSummaryFile,
ClinVarIndexer clinvarIndexer = new ClinVarIndexer(clinvarXMLFile.getParent(), clinvarSummaryFile,
clinvarVariationAlleleFile, clinvarEFOFile, normalize, genomeSequenceFilePath, assembly, rdb);
clinvarIndexer.index();
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import com.fasterxml.jackson.databind.ObjectReader;
import com.mongodb.util.JSON;
import org.hamcrest.CoreMatchers;
import org.junit.jupiter.api.Test;
import org.junit.Test;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.avro.*;
import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer;
Expand Down Expand Up @@ -56,6 +56,70 @@ public ClinicalVariantBuilderTest() {
jsonObjectMapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
}

/**
 * Stages the GRCh38 clinical-variant test resources under {@code /tmp/clinicalVariant4} and runs
 * {@link ClinicalVariantBuilder#parse()} over them, serializing the output under {@code /tmp/}.
 *
 * @throws Exception if a resource cannot be resolved or copied, or if parsing fails
 */
private void initGrch38() throws Exception {
    // Copy the packaged fixture directory into a scratch location.
    Path packagedFolder = Paths.get(getClass().getResource("/variant/annotation/clinicalVariant/grch38").toURI());
    Path clinicalVariantFolder = Paths.get("/tmp/clinicalVariant4");
    org.apache.commons.io.FileUtils.copyDirectory(packagedFolder.toFile(), clinicalVariantFolder.toFile());

    // {classpath resource, file name inside the scratch folder}; the index files are staged under names
    // that extend the ".fa.gz" sequence file name.
    String[][] genomeFiles = {
            {"/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz",
                    "Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz"},
            {"/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.fai",
                    "Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz.fai"},
            {"/variant/annotation/Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gzi",
                    "Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz.gzi"},
    };
    for (String[] genomeFile : genomeFiles) {
        org.apache.commons.io.FileUtils.copyFile(
                Paths.get(getClass().getResource(genomeFile[0]).toURI()).toFile(),
                clinicalVariantFolder.resolve(genomeFile[1]).toFile());
    }

    Path genomeSequenceFilePath = clinicalVariantFolder.resolve("Homo_sapiens.GRCh38.90.dna.primary_assembly.chr13.fa.gz");
    CellBaseSerializer serializer = new CellBaseJsonFileSerializer(Paths.get("/tmp/"), EtlCommons.CLINICAL_VARIANTS_DATA, true);
    ClinicalVariantBuilder builder
            = new ClinicalVariantBuilder(clinicalVariantFolder, true, genomeSequenceFilePath, "GRCh38", serializer);
    builder.parse();
}

/**
 * Builds the GRCh38 fixture and checks that the variant found by accession "209047" carries the expected
 * coordinates and that its RCV, SCV and "7109" evidence entries expose the expected additional properties.
 */
@Test
public void testUnexpectedAccession() throws Exception {
    cleanUp();
    initGrch38();

    List<Variant> allVariants = loadSerializedVariants("/tmp/" + EtlCommons.CLINICAL_VARIANTS_JSON_FILE);
    assertEquals(6, allVariants.size());

    List<Variant> matchingVariants = getVariantByAccession(allVariants, "209047");
    assertEquals(1, matchingVariants.size());

    Variant variant = matchingVariants.get(0);
    assertEquals("7", variant.getChromosome());
    assertEquals(Integer.valueOf(117530975), variant.getStart());
    assertEquals("G", variant.getReference());
    assertEquals("A", variant.getAlternate());

    // The RCV entry points to its VCV id
    EvidenceEntry rcvEntry = getEvidenceEntryByAccession(variant, "RCV000007529");
    System.out.println(rcvEntry);
    assertEquals(5, rcvEntry.getAdditionalProperties().size());
    assertEquals("7109", getValueByName(rcvEntry, "vcvIds"));

    // The SCV entry points to its VCV id
    EvidenceEntry scvEntry = getEvidenceEntryByAccession(variant, "SCV000053488");
    assertEquals(5, scvEntry.getAdditionalProperties().size());
    assertEquals("7109", getValueByName(scvEntry, "vcvIds"));

    // The entry with id 7109 lists the RCV and SCV accessions
    EvidenceEntry vcvEntry = getEvidenceEntryByAccession(variant, "7109");
    assertEquals(4, vcvEntry.getAdditionalProperties().size());
    assertEquals("RCV000007529", getValueByName(vcvEntry, "rcvIds"));
    assertEquals("SCV000053488", getValueByName(vcvEntry, "scvIds"));
}

/**
 * Returns the value of the first additional property of an evidence entry that has the given name.
 *
 * @param evidenceEntry entry whose additional properties are scanned in order
 * @param name          property name to look for
 * @return the value of the first property with that name, or {@code null} when no property matches
 */
private String getValueByName(EvidenceEntry evidenceEntry, String name) {
    return evidenceEntry.getAdditionalProperties().stream()
            .filter(candidate -> candidate.getName().equals(name))
            .findFirst()
            // Mapping after findFirst keeps a null value mapping to a null result instead of failing.
            .map(Property::getValue)
            .orElse(null);
}


@Test
public void noNormaliseTest() throws Exception {
// Remove all previous clinical variant temporary test data
Expand Down Expand Up @@ -799,10 +863,10 @@ private List<Variant> loadSerializedVariants(String fileName) {
// }
// }

@Test
public void testVariant() {
Variant v = new Variant("1", 2000, 2100, "A", "<DEL>");
System.out.println(v.toStringSimple());
}
// @Test
// public void testVariant() {
// Variant v = new Variant("1", 2000, 2100, "A", "<DEL>");
// System.out.println(v.toStringSimple());
// }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
13 114364328 58 60 61
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
��������
Binary file not shown.
Binary file not shown.
Binary file not shown.