From 79f132e43dcdff0dc689da1e30bc633c2c46b28a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 10:17:10 +0200 Subject: [PATCH 01/15] Add -f/--format to write vcf format from kbo map. --- Cargo.toml | 1 + src/cli.rs | 4 ++ src/main.rs | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 116 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 69735f3..ef52d0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,6 +20,7 @@ clap = { version = "4", features = ["derive"]} kbo = "0.4.1" log = "0.4.20" needletail = { version = "0.6.0", default-features = false, features = ["flate2"] } +noodles-vcf = "0.49" rayon = "1" sbwt = "0.3.4" stderrlog = "0.6.0" diff --git a/src/cli.rs b/src/cli.rs index c728bf6..82e6086 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -126,6 +126,10 @@ pub enum Commands { #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] ref_file: String, + // Output format + #[arg(short = 'f', long = "format", default_value = "aln", help_heading = "Output")] + out_format: String, + // Parameters // // Upper bound for random match probability #[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")] diff --git a/src/main.rs b/src/main.rs index c3362e5..a6e9f24 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,6 +20,18 @@ use needletail::parser::SequenceRecord; use rayon::iter::ParallelIterator; use rayon::iter::IntoParallelRefIterator; +// TODO clean up +use noodles_vcf::{ + self as vcf, + header::record::value::{map::Contig, Map}, + record::{ + alternate_bases::Allele, + genotypes::{keys::key, sample::Value, Keys}, + reference_bases::Base, + AlternateBases, Genotypes, Position, + }, +}; + // Command-line interface mod cli; @@ -61,6 +73,87 @@ fn init_log(log_max_level: usize) { .unwrap(); } +/// [`u8`] representation used elsewhere to [`noodles_vcf::record::reference_bases::Base`] +#[inline] +fn u8_to_base(ref_base: u8) -> Base { + match ref_base { + b'A' => Base::A, + b'C' => Base::C, + b'G' => Base::G, + b'T' => Base::T, + _ => Base::N, + } +} + +/// Write alignments as a .vcf file +/// Adapted from +/// https://github.com/bacpop/ska.rust/blob/9483f0383a8cc9b151ae0cae4c33bf62fc89cbec/src/ska_ref.rs#L427 +fn write_vcf(f: &mut W, ref_seq: &Vec, alignments: &Vec>, names: &Vec) -> Result<(), std::io::Error> { + log::info!("Converting to VCF"); + + // Write the VCF header + let mut writer = vcf::Writer::new(f); + let mut header_builder = vcf::Header::builder(); + for contig in names { + header_builder = header_builder.add_contig( + contig.parse().expect("Could not add contig to header"), + Map::::new(), + ); + } + for name in names { + header_builder = header_builder.add_sample_name(name); + } + let header = header_builder.build(); + writer.write_header(&header)?; + + // Write each record (column) + let keys = Keys::try_from(vec![key::GENOTYPE]).unwrap(); + + for (mapped_pos, ref_base) in ref_seq.iter().enumerate() { + let mut genotype_vec = Vec::with_capacity(alignments.len()); + let mut alt_bases: Vec = Vec::new(); + let mut variant = false; + for query in alignments { + let mapped_base = query[mapped_pos]; + + let gt = if mapped_base == *ref_base { + String::from("0") + } else if mapped_base == b'-' { + variant = true; + String::from(".") + } else { + variant = true; + let alt_base = u8_to_base(mapped_base); + if !alt_bases.contains(&alt_base) { + alt_bases.push(alt_base); + } + (alt_bases.iter().position(|&r| r == alt_base).unwrap() + 1).to_string() + }; + 
genotype_vec.push(vec![Some(Value::String(gt))]); + } + if variant { + let ref_allele = u8_to_base(*ref_base); + let genotypes = Genotypes::new(keys.clone(), genotype_vec); + let alt_alleles: Vec = + alt_bases.iter().map(|a| Allele::Bases(vec![*a])).collect(); + let record = vcf::Record::builder() + .set_chromosome( + "test" + .parse() + .expect("Invalid chromosome name"), + ) + .set_position(Position::from(mapped_pos + 1)) + .add_reference_base(ref_allele) + .set_alternate_bases(AlternateBases::from(alt_alleles)) + .set_genotypes(genotypes) + .build() + .expect("Could not construct record"); + writer.write_record(&header, &record)?; + } + } + Ok(()) +} + /// Use `kbo` to list the available commands or `kbo ` to run. /// /// # Input format detection @@ -242,6 +335,7 @@ fn main() { Some(cli::Commands::Map { query_files, ref_file, + out_format, max_error_prob, num_threads, kmer_size, @@ -275,8 +369,11 @@ fn main() { let ref_data = read_fastx_file(ref_file); + let mut alignments: Vec> = Vec::new(); + let mut names: Vec = Vec::new(); + let stdout = std::io::stdout(); - query_files.par_iter().for_each(|query_file| { + query_files.iter().for_each(|query_file| { let query_data = read_fastx_file(query_file); let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone())); @@ -285,9 +382,20 @@ fn main() { res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, map_opts)); }); - let _ = writeln!(&mut stdout.lock(), - ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); + if out_format == "aln" { + let _ = writeln!(&mut stdout.lock(), + ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); + } else if out_format == "vcf" { + // Store in alignments and write later + names.push(query_file.to_string()); + alignments.push(res); + } }); + + if out_format == "vcf" { + let _ = write_vcf(&mut stdout.lock(), &ref_data.into_iter().flatten().collect::>(), &alignments, &names); + } + }, None => {} } From 3c8fa74ceefc8cb601e8e850b917761fa4f100e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 14:11:17 +0200 Subject: [PATCH 02/15] Update vcf header and add contig names. 
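With this change the header lists each indexed contig with its length and carries fileDate, source, reference and phasing metadata plus a single sample column named "unknown". Against a one-contig reference the output should look roughly like the sketch below; the date, kbo-cli version, reference name, contig ID and length are placeholders and the exact line order and VCF version string are whatever noodles-vcf emits:

    ##fileformat=VCFv4.x
    ##fileDate=20250304
    ##source=kbo-cli v0.1.0
    ##reference=ref.fasta
    ##phasing=none
    ##contig=<ID=contig_1,length=4641652>
    #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  unknown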
--- Cargo.toml | 1 + src/cli.rs | 6 +- src/main.rs | 179 ++++++++++++++++++++++++++++++++-------------------- 3 files changed, 115 insertions(+), 71 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ef52d0c..e98a7ed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ path = "src/main.rs" [dependencies] ## core clap = { version = "4", features = ["derive"]} +chrono = "0.4.40" kbo = "0.4.1" log = "0.4.20" needletail = { version = "0.6.0", default-features = false, features = ["flate2"] } diff --git a/src/cli.rs b/src/cli.rs index 82e6086..da057d4 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -122,9 +122,13 @@ pub enum Commands { #[arg(group = "input", required = true)] query_files: Vec, - // Reference fasta + // Input options + // // Reference file #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] ref_file: String, + // // Do not concatenate contigs in reference + #[arg(long = "detailed", help_heading = "Input", default_value_t = false)] + detailed: bool, // Output format #[arg(short = 'f', long = "format", default_value = "aln", help_heading = "Output")] diff --git a/src/main.rs b/src/main.rs index a6e9f24..c9001c9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,6 +14,7 @@ use std::io::Write; use clap::Parser; +use chrono::offset::Local; use log::info; use needletail::Sequence; use needletail::parser::SequenceRecord; @@ -24,6 +25,7 @@ use rayon::iter::IntoParallelRefIterator; use noodles_vcf::{ self as vcf, header::record::value::{map::Contig, Map}, + header::record::value::Collection, record::{ alternate_bases::Allele, genotypes::{keys::key, sample::Value, Keys}, @@ -85,70 +87,91 @@ fn u8_to_base(ref_base: u8) -> Base { } } -/// Write alignments as a .vcf file -/// Adapted from -/// https://github.com/bacpop/ska.rust/blob/9483f0383a8cc9b151ae0cae4c33bf62fc89cbec/src/ska_ref.rs#L427 -fn write_vcf(f: &mut W, ref_seq: &Vec, alignments: &Vec>, names: &Vec) -> Result<(), std::io::Error> { - log::info!("Converting to VCF"); - - // Write the VCF header +/// Write the header of a .vcf file +fn write_vcf_header(f: &mut W, + ref_name: &str, + contig_info: &[(&String, &usize)], +) -> Result { let mut writer = vcf::Writer::new(f); let mut header_builder = vcf::Header::builder(); - for contig in names { + for (name, length) in contig_info.iter() { + let record = Map::::builder() + .set_length(**length) + .build(); + header_builder = header_builder.add_contig( - contig.parse().expect("Could not add contig to header"), - Map::::new(), + name.parse().expect("Could not add contig to header"), + record.expect("Valid contig"), ); - } - for name in names { - header_builder = header_builder.add_sample_name(name); - } - let header = header_builder.build(); + + }; + header_builder = header_builder.add_sample_name("unknown"); + let mut header = header_builder.build(); + + let current_date = Local::now().format("%Y%m%d").to_string(); + let vcf_date = Collection::Unstructured(vec![current_date]); + header.other_records_mut().insert("fileDate".parse().expect("Valid string"), vcf_date.clone()); + + let vcf_source = Collection::Unstructured(vec![format!("kbo-cli v{}", env!("CARGO_PKG_VERSION"))]); + header.other_records_mut().insert("source".parse().expect("Valid string"), vcf_source.clone()); + + let vcf_reference = Collection::Unstructured(vec![ref_name.to_string()]); + header.other_records_mut().insert("reference".parse().expect("Valid string"), vcf_reference.clone()); + + let vcf_phasing = Collection::Unstructured(vec!["none".to_string()]); + 
header.other_records_mut().insert("phasing".parse().expect("Valid string"), vcf_phasing.clone()); + writer.write_header(&header)?; + Ok(header) +} + +/// Write the contents of a .vcf file +fn write_vcf(f: &mut W, + header: &vcf::Header, + ref_seq: &[&u8], + mapped_seq: &[u8], + contig_name: &str +) -> Result<(), std::io::Error> { + let mut writer = vcf::Writer::new(f); + // Write each record (column) let keys = Keys::try_from(vec![key::GENOTYPE]).unwrap(); for (mapped_pos, ref_base) in ref_seq.iter().enumerate() { - let mut genotype_vec = Vec::with_capacity(alignments.len()); - let mut alt_bases: Vec = Vec::new(); + let alt_base: Base; let mut variant = false; - for query in alignments { - let mapped_base = query[mapped_pos]; - - let gt = if mapped_base == *ref_base { - String::from("0") - } else if mapped_base == b'-' { - variant = true; - String::from(".") - } else { - variant = true; - let alt_base = u8_to_base(mapped_base); - if !alt_bases.contains(&alt_base) { - alt_bases.push(alt_base); - } - (alt_bases.iter().position(|&r| r == alt_base).unwrap() + 1).to_string() - }; - genotype_vec.push(vec![Some(Value::String(gt))]); - } + + let mapped_base = mapped_seq[mapped_pos]; + + let (genotype, alt_base) = if mapped_base == **ref_base { + (String::from("0"), u8_to_base(mapped_base)) + } else if mapped_base == b'-' { + variant = true; + (String::from("."), u8_to_base(b'-')) + } else { + variant = true; + alt_base = u8_to_base(mapped_base); + (mapped_base.to_string(), alt_base) + }; + if variant { - let ref_allele = u8_to_base(*ref_base); - let genotypes = Genotypes::new(keys.clone(), genotype_vec); - let alt_alleles: Vec = - alt_bases.iter().map(|a| Allele::Bases(vec![*a])).collect(); + let ref_allele = u8_to_base(**ref_base); + let genotypes = Genotypes::new(keys.clone(), vec![vec![Some(Value::String(genotype))]]); + let alt_allele = vec![Allele::Bases(vec![alt_base])]; let record = vcf::Record::builder() .set_chromosome( - "test" + contig_name .parse() .expect("Invalid chromosome name"), ) .set_position(Position::from(mapped_pos + 1)) .add_reference_base(ref_allele) - .set_alternate_bases(AlternateBases::from(alt_alleles)) + .set_alternate_bases(AlternateBases::from(alt_allele)) .set_genotypes(genotypes) .build() .expect("Could not construct record"); - writer.write_record(&header, &record)?; + writer.write_record(header, &record)?; } } Ok(()) @@ -335,6 +358,7 @@ fn main() { Some(cli::Commands::Map { query_files, ref_file, + detailed, out_format, max_error_prob, num_threads, @@ -369,34 +393,49 @@ fn main() { let ref_data = read_fastx_file(ref_file); - let mut alignments: Vec> = Vec::new(); - let mut names: Vec = Vec::new(); - - let stdout = std::io::stdout(); - query_files.iter().for_each(|query_file| { - let query_data = read_fastx_file(query_file); - let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone())); - - let mut res: Vec = Vec::new(); - ref_data.iter().for_each(|ref_contig| { - res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, map_opts)); - }); - - if out_format == "aln" { - let _ = writeln!(&mut stdout.lock(), - ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); - } else if out_format == "vcf" { - // Store in alignments and write later - names.push(query_file.to_string()); - alignments.push(res); - } - }); - - if out_format == "vcf" { - let _ = write_vcf(&mut stdout.lock(), &ref_data.into_iter().flatten().collect::>(), &alignments, &names); - } + let stdout = std::io::stdout(); + + let mut indexes: 
Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String, usize)> = Vec::new(); + + query_files.iter().for_each(|query_file| { + if *detailed { + let mut reader = needletail::parse_fastx_file(query_file).expect("valid path/file"); + while let Some(contig) = read_from_fastx_parser(&mut *reader) { + let name = std::str::from_utf8(contig.id()).expect("UTF-8"); + let seq = contig.normalize(true); + indexes.push((kbo::index::build_sbwt_from_vecs(&[seq.to_vec()], &Some(sbwt_build_options.clone())), name.to_string(), seq.len())); + } + } else { + let contigs = read_fastx_file(query_file); + let n_bases = ref_data.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap(); + let name = query_file.clone(); + indexes.push((kbo::index::build_sbwt_from_vecs(&contigs, &Some(sbwt_build_options.clone())), name, n_bases)); + } - }, - None => {} + if out_format == "aln" { + ref_data.iter().for_each(|ref_contig| { + indexes.iter().for_each(|((sbwt, lcs), contig_name, _)| { + let res = kbo::map(ref_contig, &sbwt, &lcs, map_opts); + let _ = writeln!(&mut stdout.lock(), + ">{}\n{}", contig_name, std::str::from_utf8(&res).expect("UTF-8")); + }); + }); + } else if out_format == "vcf" { + let vcf_header = write_vcf_header(&mut stdout.lock(), ref_file, + &indexes.iter().map(|((_, _), contig_name, contig_len)| { + (contig_name, contig_len) + }).collect::>()) + .expect("I/O error"); + + ref_data.iter().for_each(|ref_contig| { + indexes.iter().for_each(|((sbwt, lcs), contig_name, _)| { + let res = kbo::map(ref_contig, &sbwt, &lcs, map_opts); + let _ = write_vcf(&mut stdout.lock(), &vcf_header, &ref_data.iter().flatten().collect::>(), &res, &contig_name); + }); + }); + }; + }); + }, + None => {} } } From 31593f9067246707d33254b7503490f166fffcf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 14:12:32 +0200 Subject: [PATCH 03/15] Remove needless borrows. 
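These are the kind of borrows clippy's needless_borrow lint flags: inside the for_each closure, sbwt and lcs are already bound as references by the tuple pattern, so an extra & only adds a layer of indirection for the compiler to peel off again. The call now reads:

    // before
    let res = kbo::map(ref_contig, &sbwt, &lcs, map_opts);
    // after
    let res = kbo::map(ref_contig, sbwt, lcs, map_opts);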
--- src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index c9001c9..4f76e07 100644 --- a/src/main.rs +++ b/src/main.rs @@ -415,7 +415,7 @@ fn main() { if out_format == "aln" { ref_data.iter().for_each(|ref_contig| { indexes.iter().for_each(|((sbwt, lcs), contig_name, _)| { - let res = kbo::map(ref_contig, &sbwt, &lcs, map_opts); + let res = kbo::map(ref_contig, sbwt, lcs, map_opts); let _ = writeln!(&mut stdout.lock(), ">{}\n{}", contig_name, std::str::from_utf8(&res).expect("UTF-8")); }); @@ -429,8 +429,8 @@ fn main() { ref_data.iter().for_each(|ref_contig| { indexes.iter().for_each(|((sbwt, lcs), contig_name, _)| { - let res = kbo::map(ref_contig, &sbwt, &lcs, map_opts); - let _ = write_vcf(&mut stdout.lock(), &vcf_header, &ref_data.iter().flatten().collect::>(), &res, &contig_name); + let res = kbo::map(ref_contig, sbwt, lcs, map_opts); + let _ = write_vcf(&mut stdout.lock(), &vcf_header, &ref_data.iter().flatten().collect::>(), &res, contig_name); }); }); }; From 0f5bc004661f8940d5a0e267dbf9c5689de5296d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 14:12:47 +0200 Subject: [PATCH 04/15] Untabify rest of cli::Commands::Map --- src/main.rs | 60 ++++++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/src/main.rs b/src/main.rs index 4f76e07..d7006c9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -356,42 +356,42 @@ fn main() { }, Some(cli::Commands::Map { - query_files, - ref_file, + query_files, + ref_file, detailed, out_format, - max_error_prob, - num_threads, + max_error_prob, + num_threads, kmer_size, - prefix_precalc, - dedup_batches, - mem_gb, - temp_dir, - verbose, + prefix_precalc, + dedup_batches, + mem_gb, + temp_dir, + verbose, }) => { - init_log(if *verbose { 2 } else { 1 }); + init_log(if *verbose { 2 } else { 1 }); let mut sbwt_build_options = kbo::index::BuildOpts::default(); - // These are required for the subcommand to work correctly - sbwt_build_options.add_revcomp = true; - sbwt_build_options.build_select = true; - // These can be adjusted - sbwt_build_options.k = *kmer_size; - sbwt_build_options.num_threads = *num_threads; - sbwt_build_options.prefix_precalc = *prefix_precalc; - sbwt_build_options.dedup_batches = *dedup_batches; - sbwt_build_options.mem_gb = *mem_gb; - sbwt_build_options.temp_dir = temp_dir.clone(); - - let mut map_opts = kbo::MapOpts::default(); - map_opts.max_error_prob = *max_error_prob; - - rayon::ThreadPoolBuilder::new() - .num_threads(*num_threads) - .thread_name(|i| format!("rayon-thread-{}", i)) - .build() - .unwrap(); + // These are required for the subcommand to work correctly + sbwt_build_options.add_revcomp = true; + sbwt_build_options.build_select = true; + // These can be adjusted + sbwt_build_options.k = *kmer_size; + sbwt_build_options.num_threads = *num_threads; + sbwt_build_options.prefix_precalc = *prefix_precalc; + sbwt_build_options.dedup_batches = *dedup_batches; + sbwt_build_options.mem_gb = *mem_gb; + sbwt_build_options.temp_dir = temp_dir.clone(); + + let mut map_opts = kbo::MapOpts::default(); + map_opts.max_error_prob = *max_error_prob; + + rayon::ThreadPoolBuilder::new() + .num_threads(*num_threads) + .thread_name(|i| format!("rayon-thread-{}", i)) + .build() + .unwrap(); - let ref_data = read_fastx_file(ref_file); + let ref_data = read_fastx_file(ref_file); let stdout = std::io::stdout(); From 6f1d37a921cf22e13863daf19934293d45af6b17 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 14:15:40 +0200 Subject: [PATCH 05/15] Fix query base counting with --detailed. --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index d7006c9..c9e886e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -407,7 +407,7 @@ fn main() { } } else { let contigs = read_fastx_file(query_file); - let n_bases = ref_data.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap(); + let n_bases = contigs.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap(); let name = query_file.clone(); indexes.push((kbo::index::build_sbwt_from_vecs(&contigs, &Some(sbwt_build_options.clone())), name, n_bases)); } From ddf93b2e9590ac50d3b09de05db3368dc02164e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 14:43:21 +0200 Subject: [PATCH 06/15] Fix parsing contig names with whitespace --- src/main.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index c9e886e..15ea577 100644 --- a/src/main.rs +++ b/src/main.rs @@ -94,14 +94,16 @@ fn write_vcf_header(f: &mut W, ) -> Result { let mut writer = vcf::Writer::new(f); let mut header_builder = vcf::Header::builder(); - for (name, length) in contig_info.iter() { + for (contig_header, length) in contig_info.iter() { let record = Map::::builder() .set_length(**length) .build(); + let mut header_contents = contig_header.split_whitespace(); + let contig_name = header_contents.next().expect("Contig name"); header_builder = header_builder.add_contig( - name.parse().expect("Could not add contig to header"), - record.expect("Valid contig"), + contig_name.parse().expect("Query contig name in header"), + record.expect("Record of type vcf::header::record::value::map::Contig"), ); }; @@ -131,7 +133,7 @@ fn write_vcf(f: &mut W, header: &vcf::Header, ref_seq: &[&u8], mapped_seq: &[u8], - contig_name: &str + contig_header: &str ) -> Result<(), std::io::Error> { let mut writer = vcf::Writer::new(f); @@ -147,7 +149,8 @@ fn write_vcf(f: &mut W, let (genotype, alt_base) = if mapped_base == **ref_base { (String::from("0"), u8_to_base(mapped_base)) } else if mapped_base == b'-' { - variant = true; + // Only output changes that can be resolved + variant = false; (String::from("."), u8_to_base(b'-')) } else { variant = true; @@ -159,6 +162,10 @@ fn write_vcf(f: &mut W, let ref_allele = u8_to_base(**ref_base); let genotypes = Genotypes::new(keys.clone(), vec![vec![Some(Value::String(genotype))]]); let alt_allele = vec![Allele::Bases(vec![alt_base])]; + + let mut header_contents = contig_header.split_whitespace(); + let contig_name = header_contents.next().expect("Contig name"); + let record = vcf::Record::builder() .set_chromosome( contig_name From 7b1e4e71728b2745f21a1fbf80a738243f7f9cbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 14:45:02 +0200 Subject: [PATCH 07/15] Fix writing .vcf against fragmented references. 
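Until now the vcf branch handed write_vcf the whole reference flattened into a Vec<&u8>, while the alignment in res only covers the contig currently being mapped, so positions and bases go out of sync as soon as the reference holds more than one contig. write_vcf now takes a plain &[u8] and the call site passes the contig itself:

    // before: records written against the concatenated reference
    let _ = write_vcf(&mut stdout.lock(), &vcf_header, &ref_data.iter().flatten().collect::<Vec<&u8>>(), &res, contig_name);
    // after: records written against the contig being mapped
    let _ = write_vcf(&mut stdout.lock(), &vcf_header, ref_contig, &res, contig_name);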
--- src/main.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index 15ea577..318659d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -131,7 +131,7 @@ fn write_vcf_header(f: &mut W, /// Write the contents of a .vcf file fn write_vcf(f: &mut W, header: &vcf::Header, - ref_seq: &[&u8], + ref_seq: &[u8], mapped_seq: &[u8], contig_header: &str ) -> Result<(), std::io::Error> { @@ -146,7 +146,7 @@ fn write_vcf(f: &mut W, let mapped_base = mapped_seq[mapped_pos]; - let (genotype, alt_base) = if mapped_base == **ref_base { + let (genotype, alt_base) = if mapped_base == *ref_base { (String::from("0"), u8_to_base(mapped_base)) } else if mapped_base == b'-' { // Only output changes that can be resolved @@ -159,7 +159,7 @@ fn write_vcf(f: &mut W, }; if variant { - let ref_allele = u8_to_base(**ref_base); + let ref_allele = u8_to_base(*ref_base); let genotypes = Genotypes::new(keys.clone(), vec![vec![Some(Value::String(genotype))]]); let alt_allele = vec![Allele::Bases(vec![alt_base])]; @@ -437,7 +437,7 @@ fn main() { ref_data.iter().for_each(|ref_contig| { indexes.iter().for_each(|((sbwt, lcs), contig_name, _)| { let res = kbo::map(ref_contig, sbwt, lcs, map_opts); - let _ = write_vcf(&mut stdout.lock(), &vcf_header, &ref_data.iter().flatten().collect::>(), &res, contig_name); + let _ = write_vcf(&mut stdout.lock(), &vcf_header, ref_contig, &res, contig_name); }); }); }; From 14a39567a1b53843dcd14efa9c0ffb24256698fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 15:26:33 +0200 Subject: [PATCH 08/15] Take .vcf contig names from reference, not query. --- src/main.rs | 86 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/src/main.rs b/src/main.rs index 318659d..9ac7cc9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -64,6 +64,20 @@ fn read_fastx_file( seq_data } +// Reads all sequence data from a fastX file +fn read_fastx_file2( + file: &str, +) -> Vec<(String, Vec)> { + let mut seq_data: Vec<(String, Vec)> = Vec::new(); + let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); + while let Some(rec) = read_from_fastx_parser(&mut *reader) { + let seqrec = rec.normalize(true); + let seqname = String::from_utf8(rec.id().to_vec()).expect("UTF-8"); + seq_data.push((seqname, seqrec.to_vec())); + } + seq_data +} + /// Initializes the logger with verbosity given in `log_max_level`. 
fn init_log(log_max_level: usize) { stderrlog::new() @@ -90,13 +104,13 @@ fn u8_to_base(ref_base: u8) -> Base { /// Write the header of a .vcf file fn write_vcf_header(f: &mut W, ref_name: &str, - contig_info: &[(&String, &usize)], + contig_info: &[(String, usize)], ) -> Result { let mut writer = vcf::Writer::new(f); let mut header_builder = vcf::Header::builder(); for (contig_header, length) in contig_info.iter() { let record = Map::::builder() - .set_length(**length) + .set_length(*length) .build(); let mut header_contents = contig_header.split_whitespace(); @@ -398,49 +412,49 @@ fn main() { .build() .unwrap(); - let ref_data = read_fastx_file(ref_file); + // --detailed only makes sense for .vcf output format + if *detailed && out_format == "aln" { + log::warn!("--detailed does not do anything with output format aln"); + } - let stdout = std::io::stdout(); + let ref_data: Vec<(String, Vec)> = read_fastx_file2(ref_file); - let mut indexes: Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String, usize)> = Vec::new(); + let stdout = std::io::stdout(); query_files.iter().for_each(|query_file| { - if *detailed { - let mut reader = needletail::parse_fastx_file(query_file).expect("valid path/file"); - while let Some(contig) = read_from_fastx_parser(&mut *reader) { - let name = std::str::from_utf8(contig.id()).expect("UTF-8"); - let seq = contig.normalize(true); - indexes.push((kbo::index::build_sbwt_from_vecs(&[seq.to_vec()], &Some(sbwt_build_options.clone())), name.to_string(), seq.len())); - } - } else { - let contigs = read_fastx_file(query_file); - let n_bases = contigs.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap(); - let name = query_file.clone(); - indexes.push((kbo::index::build_sbwt_from_vecs(&contigs, &Some(sbwt_build_options.clone())), name, n_bases)); - } + let contigs: Vec> = read_fastx_file2(query_file).iter().map(|(_, seq)| seq.clone()).collect(); + let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&contigs, &Some(sbwt_build_options.clone())); if out_format == "aln" { - ref_data.iter().for_each(|ref_contig| { - indexes.iter().for_each(|((sbwt, lcs), contig_name, _)| { - let res = kbo::map(ref_contig, sbwt, lcs, map_opts); - let _ = writeln!(&mut stdout.lock(), - ">{}\n{}", contig_name, std::str::from_utf8(&res).expect("UTF-8")); + // Concatenate the reference if it contains multiple contigs + let ref_concat = ref_data.iter().map(|(_, contig)| contig.clone()).collect::>>(); + let res = kbo::map(&ref_concat.into_iter().flatten().collect::>(), &sbwt, &lcs, map_opts); + let _ = writeln!(&mut stdout.lock(), + ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); + } else if out_format == "vcf" && *detailed { + // Will map separately against each contig in the reference + let vcf_header = write_vcf_header(&mut stdout.lock(), ref_file, + &ref_data.iter().map(|(contig_name, contig_seq)| { + (contig_name.clone(), contig_seq.len()) + }).collect::>()) + .expect("Write header to .vcf file"); + + ref_data.iter().for_each(|(ref_header, ref_seq)| { + let res = kbo::map(ref_seq, &sbwt, &lcs, map_opts); + write_vcf(&mut stdout.lock(), &vcf_header, ref_seq, &res, ref_header).expect("Write contents to .vcf file"); }); - }); } else if out_format == "vcf" { + // Will flatten the reference and map against it + let ref_contigs = ref_data.iter().map(|(_, contig)| contig.clone()).collect::>>(); + let ref_concat = ref_contigs.into_iter().flatten().collect::>(); + let vcf_header = write_vcf_header(&mut stdout.lock(), ref_file, - &indexes.iter().map(|((_, _), contig_name, contig_len)| { 
- (contig_name, contig_len) - }).collect::>()) - .expect("I/O error"); - - ref_data.iter().for_each(|ref_contig| { - indexes.iter().for_each(|((sbwt, lcs), contig_name, _)| { - let res = kbo::map(ref_contig, sbwt, lcs, map_opts); - let _ = write_vcf(&mut stdout.lock(), &vcf_header, ref_contig, &res, contig_name); - }); - }); - }; + &[(ref_file.clone(), ref_concat.len())]) + .expect("Write header to .vcf file"); + + let res = kbo::map(&ref_concat, &sbwt, &lcs, map_opts); + write_vcf(&mut stdout.lock(), &vcf_header, &ref_concat, &res, ref_file).expect("Write contents to .vcf file"); + } }); }, None => {} From 5831a49ed01edf4bbab90f3254beb64f73ac3348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 15:33:31 +0200 Subject: [PATCH 09/15] Revert unintended changes to .aln format. --- src/main.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 9ac7cc9..e99e3af 100644 --- a/src/main.rs +++ b/src/main.rs @@ -426,9 +426,10 @@ fn main() { let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&contigs, &Some(sbwt_build_options.clone())); if out_format == "aln" { - // Concatenate the reference if it contains multiple contigs - let ref_concat = ref_data.iter().map(|(_, contig)| contig.clone()).collect::>>(); - let res = kbo::map(&ref_concat.into_iter().flatten().collect::>(), &sbwt, &lcs, map_opts); + let mut res: Vec = Vec::new(); + ref_data.iter().for_each(|(_, ref_seq)| { + res.append(&mut kbo::map(ref_seq, &sbwt, &lcs, map_opts)); + }); let _ = writeln!(&mut stdout.lock(), ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); } else if out_format == "vcf" && *detailed { From 6266cf6fd31b7ac53c0ad1f1d3d649adf80c3d47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 15:43:28 +0200 Subject: [PATCH 10/15] Revert adding --detailed to map. 
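After the previous commits the vcf output already maps against each reference contig separately and takes the record names from the reference, so the flag no longer selects anything meaningful and the concatenated-reference branch it guarded is dropped with it. The user-facing invocation is simply (file names here are placeholders):

    kbo map -r ref.fasta query.fasta --format vcf > query.vcf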
--- src/cli.rs | 3 --- src/main.rs | 19 +------------------ 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index da057d4..cc720e7 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -126,9 +126,6 @@ pub enum Commands { // // Reference file #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] ref_file: String, - // // Do not concatenate contigs in reference - #[arg(long = "detailed", help_heading = "Input", default_value_t = false)] - detailed: bool, // Output format #[arg(short = 'f', long = "format", default_value = "aln", help_heading = "Output")] diff --git a/src/main.rs b/src/main.rs index e99e3af..0797e6d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -379,7 +379,6 @@ fn main() { Some(cli::Commands::Map { query_files, ref_file, - detailed, out_format, max_error_prob, num_threads, @@ -412,11 +411,6 @@ fn main() { .build() .unwrap(); - // --detailed only makes sense for .vcf output format - if *detailed && out_format == "aln" { - log::warn!("--detailed does not do anything with output format aln"); - } - let ref_data: Vec<(String, Vec)> = read_fastx_file2(ref_file); let stdout = std::io::stdout(); @@ -432,7 +426,7 @@ fn main() { }); let _ = writeln!(&mut stdout.lock(), ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); - } else if out_format == "vcf" && *detailed { + } else if out_format == "vcf" { // Will map separately against each contig in the reference let vcf_header = write_vcf_header(&mut stdout.lock(), ref_file, &ref_data.iter().map(|(contig_name, contig_seq)| { @@ -444,17 +438,6 @@ fn main() { let res = kbo::map(ref_seq, &sbwt, &lcs, map_opts); write_vcf(&mut stdout.lock(), &vcf_header, ref_seq, &res, ref_header).expect("Write contents to .vcf file"); }); - } else if out_format == "vcf" { - // Will flatten the reference and map against it - let ref_contigs = ref_data.iter().map(|(_, contig)| contig.clone()).collect::>>(); - let ref_concat = ref_contigs.into_iter().flatten().collect::>(); - - let vcf_header = write_vcf_header(&mut stdout.lock(), ref_file, - &[(ref_file.clone(), ref_concat.len())]) - .expect("Write header to .vcf file"); - - let res = kbo::map(&ref_concat, &sbwt, &lcs, map_opts); - write_vcf(&mut stdout.lock(), &vcf_header, &ref_concat, &res, ref_file).expect("Write contents to .vcf file"); } }); }, From 902d7dd83cc7eb1ec6eb99ae69d88fc06e55c0bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 15:47:49 +0200 Subject: [PATCH 11/15] Merge read_fastx_file and read_fastx_file2 --- src/main.rs | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/main.rs b/src/main.rs index 0797e6d..6c0d60b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -54,19 +54,6 @@ fn read_from_fastx_parser( // Reads all sequence data from a fastX file fn read_fastx_file( file: &str, -) -> Vec> { - let mut seq_data: Vec> = Vec::new(); - let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); - while let Some(rec) = read_from_fastx_parser(&mut *reader) { - let seqrec = rec.normalize(true); - seq_data.push(seqrec.to_vec()); - } - seq_data -} - -// Reads all sequence data from a fastX file -fn read_fastx_file2( - file: &str, ) -> Vec<(String, Vec)> { let mut seq_data: Vec<(String, Vec)> = Vec::new(); let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); @@ -244,7 +231,7 @@ fn main() { info!("Building SBWT index from {} 
files...", seq_files.len()); let mut seq_data: Vec> = Vec::new(); seq_files.iter().for_each(|file| { - seq_data.append(&mut read_fastx_file(file)); + seq_data.append(&mut read_fastx_file(file).into_iter().map(|(_, seq)| seq).collect::>>()); }); let (sbwt, lcs) = kbo::build(&seq_data, sbwt_build_options); @@ -298,7 +285,7 @@ fn main() { info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); if !*detailed { - let ref_data = read_fastx_file(ref_file.as_ref().unwrap()); + let ref_data = read_fastx_file(ref_file.as_ref().unwrap()).into_iter().map(|(_, seq)| seq).collect::>>(); let n_bases = ref_data.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap(); indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap(), n_bases)); } else { @@ -411,12 +398,12 @@ fn main() { .build() .unwrap(); - let ref_data: Vec<(String, Vec)> = read_fastx_file2(ref_file); + let ref_data: Vec<(String, Vec)> = read_fastx_file(ref_file); let stdout = std::io::stdout(); query_files.iter().for_each(|query_file| { - let contigs: Vec> = read_fastx_file2(query_file).iter().map(|(_, seq)| seq.clone()).collect(); + let contigs: Vec> = read_fastx_file(query_file).iter().map(|(_, seq)| seq.clone()).collect(); let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&contigs, &Some(sbwt_build_options.clone())); if out_format == "aln" { From 587aaa308ca6fd4c9aa09f0bee6fc5b39b6cf686 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 15:54:38 +0200 Subject: [PATCH 12/15] Separate vcf_writer.rs from main --- src/main.rs | 128 ++----------------------------------------- src/vcf_writer.rs | 135 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 125 deletions(-) create mode 100644 src/vcf_writer.rs diff --git a/src/main.rs b/src/main.rs index 6c0d60b..94c2a6e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -14,28 +14,15 @@ use std::io::Write; use clap::Parser; -use chrono::offset::Local; use log::info; use needletail::Sequence; use needletail::parser::SequenceRecord; use rayon::iter::ParallelIterator; use rayon::iter::IntoParallelRefIterator; -// TODO clean up -use noodles_vcf::{ - self as vcf, - header::record::value::{map::Contig, Map}, - header::record::value::Collection, - record::{ - alternate_bases::Allele, - genotypes::{keys::key, sample::Value, Keys}, - reference_bases::Base, - AlternateBases, Genotypes, Position, - }, -}; - // Command-line interface mod cli; +mod vcf_writer; // Given a needletail parser, reads the next contig sequence fn read_from_fastx_parser( @@ -76,115 +63,6 @@ fn init_log(log_max_level: usize) { .unwrap(); } -/// [`u8`] representation used elsewhere to [`noodles_vcf::record::reference_bases::Base`] -#[inline] -fn u8_to_base(ref_base: u8) -> Base { - match ref_base { - b'A' => Base::A, - b'C' => Base::C, - b'G' => Base::G, - b'T' => Base::T, - _ => Base::N, - } -} - -/// Write the header of a .vcf file -fn write_vcf_header(f: &mut W, - ref_name: &str, - contig_info: &[(String, usize)], -) -> Result { - let mut writer = vcf::Writer::new(f); - let mut header_builder = vcf::Header::builder(); - for (contig_header, length) in contig_info.iter() { - let record = Map::::builder() - .set_length(*length) - .build(); - - let mut header_contents = contig_header.split_whitespace(); - let contig_name = header_contents.next().expect("Contig name"); - header_builder = header_builder.add_contig( - contig_name.parse().expect("Query contig name in header"), - record.expect("Record of type 
vcf::header::record::value::map::Contig"), - ); - - }; - header_builder = header_builder.add_sample_name("unknown"); - let mut header = header_builder.build(); - - let current_date = Local::now().format("%Y%m%d").to_string(); - let vcf_date = Collection::Unstructured(vec![current_date]); - header.other_records_mut().insert("fileDate".parse().expect("Valid string"), vcf_date.clone()); - - let vcf_source = Collection::Unstructured(vec![format!("kbo-cli v{}", env!("CARGO_PKG_VERSION"))]); - header.other_records_mut().insert("source".parse().expect("Valid string"), vcf_source.clone()); - - let vcf_reference = Collection::Unstructured(vec![ref_name.to_string()]); - header.other_records_mut().insert("reference".parse().expect("Valid string"), vcf_reference.clone()); - - let vcf_phasing = Collection::Unstructured(vec!["none".to_string()]); - header.other_records_mut().insert("phasing".parse().expect("Valid string"), vcf_phasing.clone()); - - writer.write_header(&header)?; - - Ok(header) -} - -/// Write the contents of a .vcf file -fn write_vcf(f: &mut W, - header: &vcf::Header, - ref_seq: &[u8], - mapped_seq: &[u8], - contig_header: &str -) -> Result<(), std::io::Error> { - let mut writer = vcf::Writer::new(f); - - // Write each record (column) - let keys = Keys::try_from(vec![key::GENOTYPE]).unwrap(); - - for (mapped_pos, ref_base) in ref_seq.iter().enumerate() { - let alt_base: Base; - let mut variant = false; - - let mapped_base = mapped_seq[mapped_pos]; - - let (genotype, alt_base) = if mapped_base == *ref_base { - (String::from("0"), u8_to_base(mapped_base)) - } else if mapped_base == b'-' { - // Only output changes that can be resolved - variant = false; - (String::from("."), u8_to_base(b'-')) - } else { - variant = true; - alt_base = u8_to_base(mapped_base); - (mapped_base.to_string(), alt_base) - }; - - if variant { - let ref_allele = u8_to_base(*ref_base); - let genotypes = Genotypes::new(keys.clone(), vec![vec![Some(Value::String(genotype))]]); - let alt_allele = vec![Allele::Bases(vec![alt_base])]; - - let mut header_contents = contig_header.split_whitespace(); - let contig_name = header_contents.next().expect("Contig name"); - - let record = vcf::Record::builder() - .set_chromosome( - contig_name - .parse() - .expect("Invalid chromosome name"), - ) - .set_position(Position::from(mapped_pos + 1)) - .add_reference_base(ref_allele) - .set_alternate_bases(AlternateBases::from(alt_allele)) - .set_genotypes(genotypes) - .build() - .expect("Could not construct record"); - writer.write_record(header, &record)?; - } - } - Ok(()) -} - /// Use `kbo` to list the available commands or `kbo ` to run. 
/// /// # Input format detection @@ -415,7 +293,7 @@ fn main() { ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); } else if out_format == "vcf" { // Will map separately against each contig in the reference - let vcf_header = write_vcf_header(&mut stdout.lock(), ref_file, + let vcf_header = vcf_writer::write_vcf_header(&mut stdout.lock(), ref_file, &ref_data.iter().map(|(contig_name, contig_seq)| { (contig_name.clone(), contig_seq.len()) }).collect::>()) @@ -423,7 +301,7 @@ fn main() { ref_data.iter().for_each(|(ref_header, ref_seq)| { let res = kbo::map(ref_seq, &sbwt, &lcs, map_opts); - write_vcf(&mut stdout.lock(), &vcf_header, ref_seq, &res, ref_header).expect("Write contents to .vcf file"); + vcf_writer::write_vcf_contents(&mut stdout.lock(), &vcf_header, ref_seq, &res, ref_header).expect("Write contents to .vcf file"); }); } }); diff --git a/src/vcf_writer.rs b/src/vcf_writer.rs new file mode 100644 index 0000000..d6aff36 --- /dev/null +++ b/src/vcf_writer.rs @@ -0,0 +1,135 @@ +// kbo-cli: Command-line interface to the kbo local aligner. +// +// Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. + +// Copyrights in this project are retained by contributors. No copyright assignment +// is required to contribute to this project. + +// Except as otherwise noted (below and/or in individual files), this +// project is licensed under the Apache License, Version 2.0 +// or or +// the MIT license, or , +// at your option. +// +use std::io::Write; + +use chrono::offset::Local; +use noodles_vcf::{ + header::record::value::{map::Contig, Map}, + header::record::value::Collection, + record::{ + alternate_bases::Allele, + genotypes::{keys::key, sample::Value, Keys}, + reference_bases::Base, + AlternateBases, Genotypes, Position, + }, +}; + +/// [`u8`] representation used elsewhere to [`noodles_vcf::record::reference_bases::Base`] +#[inline] +fn u8_to_base(ref_base: u8) -> Base { + match ref_base { + b'A' => Base::A, + b'C' => Base::C, + b'G' => Base::G, + b'T' => Base::T, + _ => Base::N, + } +} + +/// Write the header of a .vcf file +pub fn write_vcf_header(f: &mut W, + ref_name: &str, + contig_info: &[(String, usize)], +) -> Result { + let mut writer = noodles_vcf::Writer::new(f); + let mut header_builder = noodles_vcf::Header::builder(); + for (contig_header, length) in contig_info.iter() { + let record = Map::::builder() + .set_length(*length) + .build(); + + let mut header_contents = contig_header.split_whitespace(); + let contig_name = header_contents.next().expect("Contig name"); + header_builder = header_builder.add_contig( + contig_name.parse().expect("Query contig name in header"), + record.expect("Record of type noodles_vcf::header::record::value::map::Contig"), + ); + + }; + header_builder = header_builder.add_sample_name("unknown"); + let mut header = header_builder.build(); + + let current_date = Local::now().format("%Y%m%d").to_string(); + let vcf_date = Collection::Unstructured(vec![current_date]); + header.other_records_mut().insert("fileDate".parse().expect("Valid string"), vcf_date.clone()); + + let vcf_source = Collection::Unstructured(vec![format!("kbo-cli v{}", env!("CARGO_PKG_VERSION"))]); + header.other_records_mut().insert("source".parse().expect("Valid string"), vcf_source.clone()); + + let vcf_reference = Collection::Unstructured(vec![ref_name.to_string()]); + header.other_records_mut().insert("reference".parse().expect("Valid string"), vcf_reference.clone()); + + let vcf_phasing = Collection::Unstructured(vec!["none".to_string()]); + 
header.other_records_mut().insert("phasing".parse().expect("Valid string"), vcf_phasing.clone()); + + writer.write_header(&header)?; + + Ok(header) +} + +/// Write the contents of a .vcf file +pub fn write_vcf_contents(f: &mut W, + header: &noodles_vcf::Header, + ref_seq: &[u8], + mapped_seq: &[u8], + contig_header: &str +) -> Result<(), std::io::Error> { + let mut writer = noodles_vcf::Writer::new(f); + + // Write each record (column) + let keys = Keys::try_from(vec![key::GENOTYPE]).unwrap(); + + for (mapped_pos, ref_base) in ref_seq.iter().enumerate() { + let alt_base: Base; + let mut variant = false; + + let mapped_base = mapped_seq[mapped_pos]; + + let (genotype, alt_base) = if mapped_base == *ref_base { + (String::from("0"), u8_to_base(mapped_base)) + } else if mapped_base == b'-' { + // Only output changes that can be resolved + variant = false; + (String::from("."), u8_to_base(b'-')) + } else { + variant = true; + alt_base = u8_to_base(mapped_base); + (mapped_base.to_string(), alt_base) + }; + + if variant { + let ref_allele = u8_to_base(*ref_base); + let genotypes = Genotypes::new(keys.clone(), vec![vec![Some(Value::String(genotype))]]); + let alt_allele = vec![Allele::Bases(vec![alt_base])]; + + let mut header_contents = contig_header.split_whitespace(); + let contig_name = header_contents.next().expect("Contig name"); + + let record = noodles_vcf::Record::builder() + .set_chromosome( + contig_name + .parse() + .expect("Invalid chromosome name"), + ) + .set_position(Position::from(mapped_pos + 1)) + .add_reference_base(ref_allele) + .set_alternate_bases(AlternateBases::from(alt_allele)) + .set_genotypes(genotypes) + .build() + .expect("Could not construct record"); + writer.write_record(header, &record)?; + } + } + Ok(()) +} From 3f05cc412977373943ebff80c408583e36dc2146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 16:01:04 +0200 Subject: [PATCH 13/15] List possible output formats for map. --- src/cli.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cli.rs b/src/cli.rs index cc720e7..579c97c 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -128,7 +128,7 @@ pub enum Commands { ref_file: String, // Output format - #[arg(short = 'f', long = "format", default_value = "aln", help_heading = "Output")] + #[arg(short = 'f', long = "format", default_value = "aln", help_heading = "Output", help = "Output format (aln or vcf)")] out_format: String, // Parameters From 6a64deaaf68b686f4521b69f6a7f4ec3a3eeb0a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 16:04:00 +0200 Subject: [PATCH 14/15] Panic if output format is not aln or vcf. 
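out_format is a free-form String with only a default value, so clap accepts any text after -f/--format; before this change a typo would fall through both branches and the command would silently print nothing. Now it aborts instead, e.g. (hypothetical invocation, file names are placeholders):

    kbo map -f vfc -r ref.fasta query.fasta
    # aborts with a panic whose message names the bad value:
    # Unrecognized output format `--format vfc`

A clap value parser restricted to the two allowed values would catch this at argument-parsing time; the panic keeps this patch minimal.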
--- src/main.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main.rs b/src/main.rs index 94c2a6e..cdc7719 100644 --- a/src/main.rs +++ b/src/main.rs @@ -303,6 +303,8 @@ fn main() { let res = kbo::map(ref_seq, &sbwt, &lcs, map_opts); vcf_writer::write_vcf_contents(&mut stdout.lock(), &vcf_header, ref_seq, &res, ref_header).expect("Write contents to .vcf file"); }); + } else { + panic!("Unrecognized output format `--format {}`", out_format); + } }); }, From 7399bbedc6cb8adda4be797adaa72655753e92db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 4 Mar 2025 17:15:09 +0200 Subject: [PATCH 15/15] Genotype column should just contain a '1' --- src/vcf_writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vcf_writer.rs b/src/vcf_writer.rs index d6aff36..5132793 100644 --- a/src/vcf_writer.rs +++ b/src/vcf_writer.rs @@ -105,7 +105,7 @@ pub fn write_vcf_contents(f: &mut W, } else { variant = true; alt_base = u8_to_base(mapped_base); - (mapped_base.to_string(), alt_base) + ("1".to_string(), alt_base) }; if variant {