Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ path = "src/main.rs"
[dependencies]
## core
clap = { version = "4", features = ["derive"]}
chrono = "0.4.40"
kbo = "0.4.1"
log = "0.4.20"
needletail = { version = "0.6.0", default-features = false, features = ["flate2"] }
noodles-vcf = "0.49"
rayon = "1"
sbwt = "0.3.4"
stderrlog = "0.6.0"
Expand Down
7 changes: 6 additions & 1 deletion src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,15 @@ pub enum Commands {
#[arg(group = "input", required = true)]
query_files: Vec<String>,

// Reference fasta
// Input options
// // Reference file
#[arg(short = 'r', long = "reference", required = true, help_heading = "Input")]
ref_file: String,

// Output format
#[arg(short = 'f', long = "format", default_value = "aln", help_heading = "Output", help = "Output format (aln or vcf)")]
out_format: String,

// Parameters
// // Upper bound for random match probability
#[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")]
Expand Down
117 changes: 68 additions & 49 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use rayon::iter::IntoParallelRefIterator;

// Command-line interface
mod cli;
mod vcf_writer;

// Given a needletail parser, reads the next contig sequence
fn read_from_fastx_parser(
Expand All @@ -40,13 +41,14 @@ fn read_from_fastx_parser(
// Reads all sequence data from a fastX file
fn read_fastx_file(
file: &str,
) -> Vec<Vec<u8>> {
let mut seq_data: Vec<Vec<u8>> = Vec::new();
) -> Vec<(String, Vec<u8>)> {
let mut seq_data: Vec<(String, Vec<u8>)> = Vec::new();
let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file));
while let Some(rec) = read_from_fastx_parser(&mut *reader) {
let seqrec = rec.normalize(true);
seq_data.push(seqrec.to_vec());
}
while let Some(rec) = read_from_fastx_parser(&mut *reader) {
let seqrec = rec.normalize(true);
let seqname = String::from_utf8(rec.id().to_vec()).expect("UTF-8");
seq_data.push((seqname, seqrec.to_vec()));
}
seq_data
}

Expand Down Expand Up @@ -107,7 +109,7 @@ fn main() {
info!("Building SBWT index from {} files...", seq_files.len());
let mut seq_data: Vec<Vec<u8>> = Vec::new();
seq_files.iter().for_each(|file| {
seq_data.append(&mut read_fastx_file(file));
seq_data.append(&mut read_fastx_file(file).into_iter().map(|(_, seq)| seq).collect::<Vec<Vec<u8>>>());
});

let (sbwt, lcs) = kbo::build(&seq_data, sbwt_build_options);
Expand Down Expand Up @@ -161,7 +163,7 @@ fn main() {
info!("Building SBWT from file {}...", ref_file.as_ref().unwrap());

if !*detailed {
let ref_data = read_fastx_file(ref_file.as_ref().unwrap());
let ref_data = read_fastx_file(ref_file.as_ref().unwrap()).into_iter().map(|(_, seq)| seq).collect::<Vec<Vec<u8>>>();
let n_bases = ref_data.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap();
indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap(), n_bases));
} else {
Expand Down Expand Up @@ -240,55 +242,72 @@ fn main() {
},

Some(cli::Commands::Map {
query_files,
ref_file,
max_error_prob,
num_threads,
query_files,
ref_file,
out_format,
max_error_prob,
num_threads,
kmer_size,
prefix_precalc,
dedup_batches,
mem_gb,
temp_dir,
verbose,
prefix_precalc,
dedup_batches,
mem_gb,
temp_dir,
verbose,
}) => {
init_log(if *verbose { 2 } else { 1 });
init_log(if *verbose { 2 } else { 1 });
let mut sbwt_build_options = kbo::index::BuildOpts::default();
// These are required for the subcommand to work correctly
sbwt_build_options.add_revcomp = true;
sbwt_build_options.build_select = true;
// These can be adjusted
sbwt_build_options.k = *kmer_size;
sbwt_build_options.num_threads = *num_threads;
sbwt_build_options.prefix_precalc = *prefix_precalc;
sbwt_build_options.dedup_batches = *dedup_batches;
sbwt_build_options.mem_gb = *mem_gb;
sbwt_build_options.temp_dir = temp_dir.clone();
// These are required for the subcommand to work correctly
sbwt_build_options.add_revcomp = true;
sbwt_build_options.build_select = true;
// These can be adjusted
sbwt_build_options.k = *kmer_size;
sbwt_build_options.num_threads = *num_threads;
sbwt_build_options.prefix_precalc = *prefix_precalc;
sbwt_build_options.dedup_batches = *dedup_batches;
sbwt_build_options.mem_gb = *mem_gb;
sbwt_build_options.temp_dir = temp_dir.clone();

let mut map_opts = kbo::MapOpts::default();
map_opts.max_error_prob = *max_error_prob;
let mut map_opts = kbo::MapOpts::default();
map_opts.max_error_prob = *max_error_prob;

rayon::ThreadPoolBuilder::new()
.num_threads(*num_threads)
.thread_name(|i| format!("rayon-thread-{}", i))
.build()
.unwrap();
rayon::ThreadPoolBuilder::new()
.num_threads(*num_threads)
.thread_name(|i| format!("rayon-thread-{}", i))
.build()
.unwrap();

let ref_data = read_fastx_file(ref_file);
let ref_data: Vec<(String, Vec<u8>)> = read_fastx_file(ref_file);

let stdout = std::io::stdout();
query_files.par_iter().for_each(|query_file| {
let query_data = read_fastx_file(query_file);
let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone()));
let stdout = std::io::stdout();

let mut res: Vec<u8> = Vec::new();
ref_data.iter().for_each(|ref_contig| {
res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, map_opts));
});
query_files.iter().for_each(|query_file| {
let contigs: Vec<Vec<u8>> = read_fastx_file(query_file).iter().map(|(_, seq)| seq.clone()).collect();
let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&contigs, &Some(sbwt_build_options.clone()));

let _ = writeln!(&mut stdout.lock(),
">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8"));
});
},
None => {}
if out_format == "aln" {
let mut res: Vec<u8> = Vec::new();
ref_data.iter().for_each(|(_, ref_seq)| {
res.append(&mut kbo::map(ref_seq, &sbwt, &lcs, map_opts));
});
let _ = writeln!(&mut stdout.lock(),
">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8"));
} else if out_format == "vcf" {
// Will map separately against each contig in the reference
let vcf_header = vcf_writer::write_vcf_header(&mut stdout.lock(), ref_file,
&ref_data.iter().map(|(contig_name, contig_seq)| {
(contig_name.clone(), contig_seq.len())
}).collect::<Vec<(String, usize)>>())
.expect("Write header to .vcf file");

ref_data.iter().for_each(|(ref_header, ref_seq)| {
let res = kbo::map(ref_seq, &sbwt, &lcs, map_opts);
vcf_writer::write_vcf_contents(&mut stdout.lock(), &vcf_header, ref_seq, &res, ref_header).expect("Write contents to .vcf file");
});
} else {
panic!("Unrecognized output format `--format {}``", out_format);
}
});
},
None => {}
}
}
135 changes: 135 additions & 0 deletions src/vcf_writer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
// kbo-cli: Command-line interface to the kbo local aligner.
//
// Copyright 2024 Tommi Mäklin [tommi@maklin.fi].

// Copyrights in this project are retained by contributors. No copyright assignment
// is required to contribute to this project.

// Except as otherwise noted (below and/or in individual files), this
// project is licensed under the Apache License, Version 2.0
// <LICENSE-APACHE> or <http://www.apache.org/licenses/LICENSE-2.0> or
// the MIT license, <LICENSE-MIT> or <http://opensource.org/licenses/MIT>,
// at your option.
//
use std::io::Write;

use chrono::offset::Local;
use noodles_vcf::{
header::record::value::{map::Contig, Map},
header::record::value::Collection,
record::{
alternate_bases::Allele,
genotypes::{keys::key, sample::Value, Keys},
reference_bases::Base,
AlternateBases, Genotypes, Position,
},
};

/// Converts the [`u8`] base representation used elsewhere in kbo to a
/// [`noodles_vcf::record::reference_bases::Base`].
///
/// Any byte other than upper-case `A`, `C`, `G`, or `T` maps to [`Base::N`].
#[inline]
fn u8_to_base(ref_base: u8) -> Base {
    if ref_base == b'A' {
        Base::A
    } else if ref_base == b'C' {
        Base::C
    } else if ref_base == b'G' {
        Base::G
    } else if ref_base == b'T' {
        Base::T
    } else {
        Base::N
    }
}

/// Write the header of a .vcf file
///
/// Builds a VCF header with one contig record per `(name, length)` pair in
/// `contig_info` — the contig name is the first whitespace-delimited token of
/// the stored fastX header — a single sample called "unknown", and
/// fileDate/source/reference/phasing meta records, then writes the header
/// to `f`.
///
/// `ref_name` is recorded verbatim in the `##reference` meta line.
///
/// # Errors
/// Returns the I/O error from writing the header to `f`.
///
/// Returns the constructed [`noodles_vcf::Header`] so callers can pass it
/// on when writing records.
pub fn write_vcf_header<W: Write>(f: &mut W,
                       ref_name: &str,
                       contig_info: &[(String, usize)],
) -> Result<noodles_vcf::Header, std::io::Error> {
    let mut writer = noodles_vcf::Writer::new(f);
    let mut header_builder = noodles_vcf::Header::builder();
    for (contig_header, length) in contig_info.iter() {
        let record = Map::<Contig>::builder()
            .set_length(*length)
            .build();

        // Contig name is everything up to the first whitespace in the fastX header.
        let mut header_contents = contig_header.split_whitespace();
        let contig_name = header_contents.next().expect("Contig name");
        header_builder = header_builder.add_contig(
            contig_name.parse().expect("Query contig name in header"),
            record.expect("Record of type noodles_vcf::header::record::value::map::Contig"),
        );
    }
    header_builder = header_builder.add_sample_name("unknown");
    let mut header = header_builder.build();

    // Meta records the builder API does not expose directly; inserted after build().
    let current_date = Local::now().format("%Y%m%d").to_string();
    let vcf_date = Collection::Unstructured(vec![current_date]);
    header.other_records_mut().insert("fileDate".parse().expect("Valid string"), vcf_date);

    let vcf_source = Collection::Unstructured(vec![format!("kbo-cli v{}", env!("CARGO_PKG_VERSION"))]);
    header.other_records_mut().insert("source".parse().expect("Valid string"), vcf_source);

    let vcf_reference = Collection::Unstructured(vec![ref_name.to_string()]);
    header.other_records_mut().insert("reference".parse().expect("Valid string"), vcf_reference);

    let vcf_phasing = Collection::Unstructured(vec!["none".to_string()]);
    header.other_records_mut().insert("phasing".parse().expect("Valid string"), vcf_phasing);

    writer.write_header(&header)?;

    Ok(header)
}

/// Write the contents of a .vcf file
pub fn write_vcf_contents<W: Write>(f: &mut W,
header: &noodles_vcf::Header,
ref_seq: &[u8],
mapped_seq: &[u8],
contig_header: &str
) -> Result<(), std::io::Error> {
let mut writer = noodles_vcf::Writer::new(f);

// Write each record (column)
let keys = Keys::try_from(vec![key::GENOTYPE]).unwrap();

for (mapped_pos, ref_base) in ref_seq.iter().enumerate() {
let alt_base: Base;
let mut variant = false;

let mapped_base = mapped_seq[mapped_pos];

let (genotype, alt_base) = if mapped_base == *ref_base {
(String::from("0"), u8_to_base(mapped_base))
} else if mapped_base == b'-' {
// Only output changes that can be resolved
variant = false;
(String::from("."), u8_to_base(b'-'))
} else {
variant = true;
alt_base = u8_to_base(mapped_base);
("1".to_string(), alt_base)
};

if variant {
let ref_allele = u8_to_base(*ref_base);
let genotypes = Genotypes::new(keys.clone(), vec![vec![Some(Value::String(genotype))]]);
let alt_allele = vec![Allele::Bases(vec![alt_base])];

let mut header_contents = contig_header.split_whitespace();
let contig_name = header_contents.next().expect("Contig name");

let record = noodles_vcf::Record::builder()
.set_chromosome(
contig_name
.parse()
.expect("Invalid chromosome name"),
)
.set_position(Position::from(mapped_pos + 1))
.add_reference_base(ref_allele)
.set_alternate_bases(AlternateBases::from(alt_allele))
.set_genotypes(genotypes)
.build()
.expect("Could not construct record");
writer.write_record(header, &record)?;
}
}
Ok(())
}
Loading