diff --git a/Cargo.toml b/Cargo.toml index 69735f3..e98a7ed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,9 +17,11 @@ path = "src/main.rs" [dependencies] ## core clap = { version = "4", features = ["derive"]} +chrono = "0.4.40" kbo = "0.4.1" log = "0.4.20" needletail = { version = "0.6.0", default-features = false, features = ["flate2"] } +noodles-vcf = "0.49" rayon = "1" sbwt = "0.3.4" stderrlog = "0.6.0" diff --git a/src/cli.rs b/src/cli.rs index c728bf6..579c97c 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -122,10 +122,15 @@ pub enum Commands { #[arg(group = "input", required = true)] query_files: Vec<String>, - // Reference fasta + // Input options + // // Reference file #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] ref_file: String, + // Output format + #[arg(short = 'f', long = "format", default_value = "aln", help_heading = "Output", help = "Output format (aln or vcf)")] + out_format: String, + // Parameters + // // Upper bound for random match probability #[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")] diff --git a/src/main.rs b/src/main.rs index c3362e5..cdc7719 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,6 +22,7 @@ use rayon::iter::IntoParallelRefIterator; // Command-line interface mod cli; +mod vcf_writer; // Given a needletail parser, reads the next contig sequence fn read_from_fastx_parser( @@ -40,13 +41,14 @@ // Reads all sequence data from a fastX file fn read_fastx_file( file: &str, -) -> Vec<Vec<u8>> { - let mut seq_data: Vec<Vec<u8>> = Vec::new(); +) -> Vec<(String, Vec<u8>)> { + let mut seq_data: Vec<(String, Vec<u8>)> = Vec::new(); let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); - while let Some(rec) = read_from_fastx_parser(&mut *reader) { - let seqrec = rec.normalize(true); - seq_data.push(seqrec.to_vec()); - } + while let Some(rec) = read_from_fastx_parser(&mut *reader) { + let seqrec = 
rec.normalize(true); + let seqname = String::from_utf8(rec.id().to_vec()).expect("UTF-8"); + seq_data.push((seqname, seqrec.to_vec())); + } seq_data } @@ -107,7 +109,7 @@ fn main() { info!("Building SBWT index from {} files...", seq_files.len()); let mut seq_data: Vec<Vec<u8>> = Vec::new(); seq_files.iter().for_each(|file| { - seq_data.append(&mut read_fastx_file(file)); + seq_data.append(&mut read_fastx_file(file).into_iter().map(|(_, seq)| seq).collect::<Vec<Vec<u8>>>()); }); let (sbwt, lcs) = kbo::build(&seq_data, sbwt_build_options); @@ -161,7 +163,7 @@ info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); if !*detailed { - let ref_data = read_fastx_file(ref_file.as_ref().unwrap()); + let ref_data = read_fastx_file(ref_file.as_ref().unwrap()).into_iter().map(|(_, seq)| seq).collect::<Vec<Vec<u8>>>(); let n_bases = ref_data.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap(); indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap(), n_bases)); } else { @@ -240,55 +242,72 @@ }, Some(cli::Commands::Map { - query_files, - ref_file, - max_error_prob, - num_threads, + query_files, + ref_file, + out_format, + max_error_prob, + num_threads, kmer_size, - prefix_precalc, - dedup_batches, - mem_gb, - temp_dir, - verbose, + prefix_precalc, + dedup_batches, + mem_gb, + temp_dir, + verbose, }) => { - init_log(if *verbose { 2 } else { 1 }); + init_log(if *verbose { 2 } else { 1 }); let mut sbwt_build_options = kbo::index::BuildOpts::default(); - // These are required for the subcommand to work correctly - sbwt_build_options.add_revcomp = true; - sbwt_build_options.build_select = true; - // These can be adjusted - sbwt_build_options.k = *kmer_size; - sbwt_build_options.num_threads = *num_threads; - sbwt_build_options.prefix_precalc = *prefix_precalc; - sbwt_build_options.dedup_batches = *dedup_batches; - sbwt_build_options.mem_gb = *mem_gb; - sbwt_build_options.temp_dir = temp_dir.clone(); + // These are required 
for the subcommand to work correctly + sbwt_build_options.add_revcomp = true; + sbwt_build_options.build_select = true; + // These can be adjusted + sbwt_build_options.k = *kmer_size; + sbwt_build_options.num_threads = *num_threads; + sbwt_build_options.prefix_precalc = *prefix_precalc; + sbwt_build_options.dedup_batches = *dedup_batches; + sbwt_build_options.mem_gb = *mem_gb; + sbwt_build_options.temp_dir = temp_dir.clone(); - let mut map_opts = kbo::MapOpts::default(); - map_opts.max_error_prob = *max_error_prob; + let mut map_opts = kbo::MapOpts::default(); + map_opts.max_error_prob = *max_error_prob; - rayon::ThreadPoolBuilder::new() - .num_threads(*num_threads) - .thread_name(|i| format!("rayon-thread-{}", i)) - .build() - .unwrap(); + rayon::ThreadPoolBuilder::new() + .num_threads(*num_threads) + .thread_name(|i| format!("rayon-thread-{}", i)) + .build() + .unwrap(); - let ref_data = read_fastx_file(ref_file); + let ref_data: Vec<(String, Vec<u8>)> = read_fastx_file(ref_file); - let stdout = std::io::stdout(); - query_files.par_iter().for_each(|query_file| { - let query_data = read_fastx_file(query_file); - let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone())); + let stdout = std::io::stdout(); - let mut res: Vec<u8> = Vec::new(); - ref_data.iter().for_each(|ref_contig| { - res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, map_opts)); - }); + query_files.iter().for_each(|query_file| { + let contigs: Vec<Vec<u8>> = read_fastx_file(query_file).iter().map(|(_, seq)| seq.clone()).collect(); + let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&contigs, &Some(sbwt_build_options.clone())); - let _ = writeln!(&mut stdout.lock(), - ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); - }); - }, - None => {} + if out_format == "aln" { + let mut res: Vec<u8> = Vec::new(); + ref_data.iter().for_each(|(_, ref_seq)| { + res.append(&mut kbo::map(ref_seq, &sbwt, &lcs, map_opts)); + }); + let _ = writeln!(&mut stdout.lock(), + 
">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); + } else if out_format == "vcf" { + // Will map separately against each contig in the reference + let vcf_header = vcf_writer::write_vcf_header(&mut stdout.lock(), ref_file, + &ref_data.iter().map(|(contig_name, contig_seq)| { + (contig_name.clone(), contig_seq.len()) + }).collect::<Vec<(String, usize)>>()) + .expect("Write header to .vcf file"); + + ref_data.iter().for_each(|(ref_header, ref_seq)| { + let res = kbo::map(ref_seq, &sbwt, &lcs, map_opts); + vcf_writer::write_vcf_contents(&mut stdout.lock(), &vcf_header, ref_seq, &res, ref_header).expect("Write contents to .vcf file"); + }); + } else { + panic!("Unrecognized output format `--format {}`", out_format); + } + }); + }, + None => {} } } diff --git a/src/vcf_writer.rs b/src/vcf_writer.rs new file mode 100644 index 0000000..5132793 --- /dev/null +++ b/src/vcf_writer.rs @@ -0,0 +1,135 @@ +// kbo-cli: Command-line interface to the kbo local aligner. +// +// Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. + +// Copyrights in this project are retained by contributors. No copyright assignment +// is required to contribute to this project. + +// Except as otherwise noted (below and/or in individual files), this +// project is licensed under the Apache License, Version 2.0 +// <LICENSE-APACHE> or <https://www.apache.org/licenses/LICENSE-2.0> or +// the MIT license, <LICENSE-MIT> or <https://opensource.org/licenses/MIT>, +// at your option. 
+// +use std::io::Write; + +use chrono::offset::Local; +use noodles_vcf::{ + header::record::value::{map::Contig, Map}, + header::record::value::Collection, + record::{ + alternate_bases::Allele, + genotypes::{keys::key, sample::Value, Keys}, + reference_bases::Base, + AlternateBases, Genotypes, Position, + }, +}; + +/// [`u8`] representation used elsewhere to [`noodles_vcf::record::reference_bases::Base`] +#[inline] +fn u8_to_base(ref_base: u8) -> Base { + match ref_base { + b'A' => Base::A, + b'C' => Base::C, + b'G' => Base::G, + b'T' => Base::T, + _ => Base::N, + } +} + +/// Write the header of a .vcf file +pub fn write_vcf_header<W: Write>(f: &mut W, + ref_name: &str, + contig_info: &[(String, usize)], +) -> Result<noodles_vcf::Header, std::io::Error> { + let mut writer = noodles_vcf::Writer::new(f); + let mut header_builder = noodles_vcf::Header::builder(); + for (contig_header, length) in contig_info.iter() { + let record = Map::<Contig>::builder() + .set_length(*length) + .build(); + + let mut header_contents = contig_header.split_whitespace(); + let contig_name = header_contents.next().expect("Contig name"); + header_builder = header_builder.add_contig( + contig_name.parse().expect("Query contig name in header"), + record.expect("Record of type noodles_vcf::header::record::value::map::Contig"), + ); + + }; + header_builder = header_builder.add_sample_name("unknown"); + let mut header = header_builder.build(); + + let current_date = Local::now().format("%Y%m%d").to_string(); + let vcf_date = Collection::Unstructured(vec![current_date]); + header.other_records_mut().insert("fileDate".parse().expect("Valid string"), vcf_date.clone()); + + let vcf_source = Collection::Unstructured(vec![format!("kbo-cli v{}", env!("CARGO_PKG_VERSION"))]); + header.other_records_mut().insert("source".parse().expect("Valid string"), vcf_source.clone()); + + let vcf_reference = Collection::Unstructured(vec![ref_name.to_string()]); + header.other_records_mut().insert("reference".parse().expect("Valid string"), vcf_reference.clone()); + 
+ let vcf_phasing = Collection::Unstructured(vec!["none".to_string()]); + header.other_records_mut().insert("phasing".parse().expect("Valid string"), vcf_phasing.clone()); + + writer.write_header(&header)?; + + Ok(header) +} + +/// Write the contents of a .vcf file +pub fn write_vcf_contents<W: Write>(f: &mut W, + header: &noodles_vcf::Header, + ref_seq: &[u8], + mapped_seq: &[u8], + contig_header: &str +) -> Result<(), std::io::Error> { + let mut writer = noodles_vcf::Writer::new(f); + + // Write each record (column) + let keys = Keys::try_from(vec![key::GENOTYPE]).unwrap(); + + for (mapped_pos, ref_base) in ref_seq.iter().enumerate() { + let alt_base: Base; + let mut variant = false; + + let mapped_base = mapped_seq[mapped_pos]; + + let (genotype, alt_base) = if mapped_base == *ref_base { + (String::from("0"), u8_to_base(mapped_base)) + } else if mapped_base == b'-' { + // Only output changes that can be resolved + variant = false; + (String::from("."), u8_to_base(b'-')) + } else { + variant = true; + alt_base = u8_to_base(mapped_base); + ("1".to_string(), alt_base) + }; + + if variant { + let ref_allele = u8_to_base(*ref_base); + let genotypes = Genotypes::new(keys.clone(), vec![vec![Some(Value::String(genotype))]]); + let alt_allele = vec![Allele::Bases(vec![alt_base])]; + + let mut header_contents = contig_header.split_whitespace(); + let contig_name = header_contents.next().expect("Contig name"); + + let record = noodles_vcf::Record::builder() + .set_chromosome( + contig_name + .parse() + .expect("Invalid chromosome name"), + ) + .set_position(Position::from(mapped_pos + 1)) + .add_reference_base(ref_allele) + .set_alternate_bases(AlternateBases::from(alt_allele)) + .set_genotypes(genotypes) + .build() + .expect("Could not construct record"); + writer.write_record(header, &record)?; + } + } + Ok(()) +}