From cd334cfc555eb8138572fcdf257aba4d748027c4 Mon Sep 17 00:00:00 2001 From: carsonhh Date: Wed, 11 Aug 2021 16:41:42 -0600 Subject: [PATCH] add support for read group and optitical duplicate only filtering --- khash.h | 627 +++++++++++++++++++++++++++++++++++++++++++++++++ samblaster.cpp | 389 ++++++++++++++++++++++++++---- 2 files changed, 968 insertions(+), 48 deletions(-) create mode 100644 khash.h diff --git a/khash.h b/khash.h new file mode 100644 index 0000000..f75f347 --- /dev/null +++ b/khash.h @@ -0,0 +1,627 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2013-05-02 (0.2.8): + + * Use quadratic probing. When the capacity is power of 2, stepping function + i*(i+1)/2 guarantees to traverse each bucket. It is better than double + hashing on cache performance and is more robust than linear probing. + + In theory, double hashing should be more robust than quadratic probing. + However, my implementation is probably not for large hash tables, because + the second hash function is closely tied to the first hash function, + which reduce the effectiveness of double hashing. + + Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php + + 2011-12-29 (0.2.7): + + * Minor code clean up; no actual effect. + + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. + */ + +#define AC_VERSION_KHASH_H "0.2.8" + +#include +#include +#include + +/* compiler specific configuration */ + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khint64_t; +#else +typedef unsigned long long khint64_t; +#endif + +#ifndef kh_inline +#ifdef _MSC_VER +#define kh_inline __inline +#else +#define kh_inline inline +#endif +#endif /* kh_inline */ + +#ifndef klib_unused +#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) +#define klib_unused __attribute__ ((__unused__)) +#else +#define klib_unused +#endif +#endif /* klib_unused */ + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#ifndef kcalloc +#define kcalloc(N,Z) calloc(N,Z) +#endif +#ifndef kmalloc +#define kmalloc(Z) malloc(Z) +#endif +#ifndef krealloc +#define krealloc(P,Z) realloc(P,Z) +#endif +#ifndef kfree +#define kfree(P) free(P) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define __KHASH_TYPE(name, khkey_t, khval_t) \ + typedef struct kh_##name##_s { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; + +#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ + extern kh_##name##_t *kh_init_##name(void); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + kfree((void *)h->keys); kfree(h->flags); \ + kfree((void *)h->vals); \ + kfree(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t k, i, last, mask, step = 0; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + (++step)) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (!new_flags) return -1; \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (!new_keys) { kfree(new_flags); return -1; } \ + h->keys = new_keys; \ + if (kh_is_map) { \ + khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + if (!new_vals) { kfree(new_flags); return -1; } \ + h->vals = new_vals; \ + } \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t k, i, step = 0; \ + k = __hash_func(key); \ + i = k & new_mask; \ + while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + kfree(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + return 0; \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) { \ + if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ + *ret = -1; return h->n_buckets; \ + } \ + } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ + *ret = -1; return h->n_buckets; \ + } \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + (++step)) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if (__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_PROTOTYPES(name, khkey_t, khval_t) + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + __KHASH_TYPE(name, khkey_t, khval_t) \ + __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static kh_inline khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = (khint_t)*s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static kh_inline khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name() + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! @function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: -1 if the operation failed; + 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) if the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/*! @function + @abstract Iterate over the entries in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param kvar Variable to which key will be assigned + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (kvar) = kh_key(h,__i); \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/*! @function + @abstract Iterate over the values in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @param vvar Variable to which value will be assigned + @param code Block of code to execute + */ +#define kh_foreach_value(h, vvar, code) { khint_t __i; \ + for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ + if (!kh_exist(h,__i)) continue; \ + (vvar) = kh_val(h,__i); \ + code; \ + } } + +/* More convenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash set containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + +#endif /* __AC_KHASH_H */ diff --git a/samblaster.cpp b/samblaster.cpp index 211663b..f4ba976 100644 --- a/samblaster.cpp +++ b/samblaster.cpp @@ -32,6 +32,7 @@ #include #include #include "sbhash.h" +#include "khash.h" // Rename common integer types. // I like having these shorter name. @@ -39,6 +40,9 @@ typedef uint64_t UINT64; typedef uint32_t UINT32; typedef int32_t INT32; +// intialize khash +KHASH_MAP_INIT_INT64(i, UINT32) + // Some helper routines. // mempcpy is a GNU extension and not available everywhere. @@ -147,6 +151,7 @@ struct splitLine // It this were a class, these would be in a subclass. int flag; pos_t pos; + int grpNum; int seqNum; pos_t binPos; int binNum; @@ -233,27 +238,33 @@ splitLine_t * getSplitLine() // Split the line into fields. void splitSplitLine(splitLine_t * line, int maxSplits) { - line->numFields = 0; - int fieldStart = 0; - // replace the newline with a tab so that it works like the rest of the fields. - line->buffer[line->bufLen-1] = '\t'; - for (int i=0; ibufLen; ++i) + //grow fields array as needed + if (maxSplits > line->maxFields){ + line->maxFields = maxSplits; + line->fields = (char **)realloc(line->fields, line->maxFields * sizeof(char *)); + } + + //first field + line->fields[0] = line->buffer; + line->numFields = 1; + + //remaining fields + for (int i=0; i < line->bufLen-1; ++i) { if (line->buffer[i] == '\t') { - line->fields[line->numFields] = line->buffer + fieldStart; - line->numFields += 1; + line->buffer[i] = 0; //terminate previous field with null + line->fields[line->numFields++] = line->buffer + i + 1; //start next field if (line->numFields == maxSplits) break; - line->buffer[i] = 0; - // Get ready for the next iteration. - fieldStart = i+1; } } - // replace the tab at the end of the line with a null char to terminate the final string. + + // replace the newline with a null char so that it works like the rest of the fields. line->buffer[line->bufLen-1] = 0; line->split = true; } + // Unsplit the fields back into a single string. // This will mung the strings, so only call this when all processing on the line is done. void unsplitSplitLine(splitLine_t * line) @@ -556,6 +567,14 @@ struct less_str } }; +// This stores the map between RG IDs and RG numbers. +typedef std::map rgMap_t; + +inline void addRG(rgMap_t * rgs, char * item, int val) +{ + (*rgs)[item] = val; +} + // This stores the map between sequence names and sequence numbers. typedef std::map seqMap_t; @@ -580,9 +599,11 @@ struct state_struct FILE * unmappedClippedFile; char * unmappedClippedFileName; sigSet_t * sigs; + rgMap_t rgs; seqMap_t seqs; - UINT32 * seqLens; UINT64 * seqOffs; + khash_t(i) * grpNumHash; + UINT32 grpCount; splitLine_t ** splitterArray; int splitterArrayMaxSize; UINT32 sigArraySize; @@ -600,6 +621,8 @@ struct state_struct bool addMateTags; bool compatMode; bool ignoreUnmated; + bool opticalOnly; + bool optPlusExAmp; bool quiet; }; typedef struct state_struct state_t; @@ -618,6 +641,8 @@ state_t * makeState () s->unmappedClippedFile = NULL; s->unmappedClippedFileName = (char *)""; s->sigs = NULL; + s->grpNumHash = kh_init(i); + s->grpCount = 0; s->minNonOverlap = 20; s->maxSplitCount = 2; s->minIndelSize = 50; @@ -630,6 +655,8 @@ state_t * makeState () s->addMateTags = false; s->compatMode = false; s->ignoreUnmated = false; + s->opticalOnly = false; + s->optPlusExAmp = false; s->quiet = false; // Start this as -1 to indicate we don't know yet. // Once we are outputting our first line, we will decide. @@ -637,6 +664,9 @@ state_t * makeState () // Used as a temporary location for ptrs to splitter for sort routine. s->splitterArrayMaxSize = 1000; s->splitterArray = (splitLine_t **)(malloc(s->splitterArrayMaxSize * sizeof(splitLine_t *))); + // initialze seqence info + s->seqOffs = (UINT64*)calloc(1, sizeof(UINT64)); + return s; } @@ -649,12 +679,16 @@ void deleteState(state_t * s) for (UINT32 i=0; isigArraySize; i++) deleteHashTable(&(s->sigs[i])); free (s->sigs); } + for (seqMap_t::iterator iter = s->rgs.begin(); iter != s->rgs.end(); ++iter) + { + free((char *)(iter->first)); + } for (seqMap_t::iterator iter = s->seqs.begin(); iter != s->seqs.end(); ++iter) { free((char *)(iter->first)); } - if (s->seqLens != NULL) free(s->seqLens); if (s->seqOffs != NULL) free(s->seqOffs); + kh_destroy(i, s->grpNumHash); delete s; } @@ -668,21 +702,62 @@ void deleteState(state_t * s) // without performance degradation on more standard genomes. // Thanks to https://github.com/carsonhh for the suggestion. ///////////////////////////////////////////////////////////////////////////// + +// largest known genome is 150GB. 22 bits only gets us to 136GB with synthetic +// super contigs and an int type array index. That's close enough. This leaves us +// 20 bits in the signature that can be used to integrate RG, UMI, and tile info. +#define BIN_SIZE ((UINT64)22) //bin window is 22 bits wide +#define BIN_MASK ((UINT64)((1UL << BIN_SIZE)-1)) //bin window is 22 bits wide +#define BIN_SHIFT ((UINT64)BIN_SIZE) //bin window is 22 bits wide + +#define G_SIZE ((UINT64)(64-2*BIN_SIZE)) //group window is 20 bits wide +#define G_MASK ((UINT64)((1UL << RG_SIZE)-1)) //group window is 20 bits wide +#define G_SHIFT ((UINT64)(2*BIN_SIZE)) //group window is 20 bits wide + + + +#define RG_SIZE ((UINT64)20) //RG window is 20 bits wide +#define RG_MASK ((UINT64)((1UL << RG_SIZE)-1)) //RG window is 20 bits wide +#define RG_SHIFT ((UINT64)0) //RG window is 20 bits wide + + + +#define CELL_SIZE ((UINT64)(32-RG_SIZE)) //flowcell window is 12 bits wide +#define CELL_MASK ((UINT64)((1UL << CELL_SIZE)-1)) //flowcell window is 12 bits wide +#define CELL_SHIFT ((UINT64)(RG_SHIFT + RG_SIZE)) //flowcell window is 12 bits wide + +#define TILE_SIZE ((UINT64)(64-CELL_SIZE-RG_SIZE)) //tile window is 32 bits wide +#define TILE_MASK ((UINT64)((1UL << TILE_SIZE)-1)) //tile window is 32 bits wide +#define TILE_SHIFT ((UINT64)(CELL_SHIFT + CELL_SIZE)) //tile window is 32 bits wide + +#define UMI_SIZE ((UINT64)(64-RG_SIZE)) //UMI window is 44 bits wide +#define UMI_MASK ((UINT64)((1UL << UMI_SIZE)-1)) //UMI window is 44 bits wide +#define UMI_SHIFT ((UINT64)(RG_SHIFT + RG_SIZE)) //UMI window is 44 bits wide + + + template inline sgn_t calcSig(splitLine_t * first, splitLine_t * second) { UINT64 final; + + // Copy required because grpNum is only 32 bit and it must be promoted + // BEFORE the bitshift operation. otherwise digits are lost + UINT64 rg1 = second->grpNum; //promotion to 64 bit via copy + UINT64 rg2 = (rg1 << G_SHIFT); + if (orphan) { // For an orphan, we only use information fron the second read. - final = second->binPos; + final = rg2 | second->binPos; } else { - // Total nonsense to get the compiler to actually work. - UINT64 t1 = first->binPos; - UINT64 t2 = t1 << 32; - final = t2 | second->binPos; + // Copy required because binPos is only 32 bit and it must be promoted + // BEFORE the bitshift operation. otherwise digits are lost + UINT64 t1 = first->binPos; //promotion to 64 bit via copy + UINT64 t2 = (t1 << BIN_SHIFT); + final = rg2 | t2 | second->binPos; } return (sgn_t)final; } @@ -709,19 +784,203 @@ inline UINT32 calcSigArrOff(splitLine_t * first, splitLine_t * second, int binCo // Sequences /////////////////////////////////////////////////////////////////////////////// -inline int getSeqNum(splitLine_t * line, int field, state_t * state) __attribute__((always_inline)); -inline int getSeqNum(splitLine_t * line, int field, state_t * state) +inline int getSeqNum(splitLine_t * line, state_t * state) __attribute__((always_inline)); +inline int getSeqNum(splitLine_t * line, state_t * state) { - seqMap_t::iterator findret = state->seqs.find(line->fields[field]); + seqMap_t::iterator findret = state->seqs.find(line->fields[RNAME]); if (findret == state->seqs.end()) { char * temp; - asprintf(&temp, "samblaster: Unable to find sequence '%s' in sequence map for readid %s\n", line->fields[field], line->fields[QNAME]); + asprintf(&temp, "samblaster: Unable to find sequence '%s' in sequence map for readid %s\n", line->fields[RNAME], line->fields[QNAME]); fatalError(temp); } return findret->second; } +/////////////////////////////////////////////////////////////////////////////// +// Read Groups +/////////////////////////////////////////////////////////////////////////////// + +inline int getRgNum(splitLine_t * line, state_t * state) __attribute__((always_inline)); +inline int getRgNum(splitLine_t * line, state_t * state) +{ + //identify RG among tags + char * rg = strstr(line->fields[TAGS], "RG:Z:"); + if(rg == NULL) return 0; //not found + rg += 5; //move to start of value + + //process as cstring + seqMap_t::iterator findret; + char * end = strchr(rg, '\t'); + if(end == NULL) + { + findret = state->rgs.find(rg); + } + else //make tab a temporary NULL for cstring + { + end[0] = 0; + findret = state->rgs.find(rg); + end[0] = '\t'; + } + + if (findret == state->rgs.end()) //not found + { + if(end != NULL) end[0] = 0; //change back to cstring for error + char * temp; + asprintf(&temp, "samblaster: Unable to find read group '%s' in RG map for readid %s\n", rg, line->fields[QNAME]); + fatalError(temp); + } + return findret->second; +} + +/////////////////////////////////////////////////////////////////////////////// +// Optical + Read Groups +/////////////////////////////////////////////////////////////////////////////// + +inline UINT32 getGrpNum(UINT64 key, state_t * state) __attribute__((always_inline)); +inline UINT32 getGrpNum(UINT64 key, state_t * state) +{ + khint_t iter; + iter = kh_get(i, state->grpNumHash, key); + + //does not exist, so add it + if(iter == kh_end(state->grpNumHash)){ + state->grpCount += 1; + + //check values + if(state->grpCount > RG_MASK) + { + char * temp; + asprintf(&temp, "samblaster: Too many ReadGroup/Tile/FlowCellLane combinations\n"); + fatalError(temp); + } + + int ret; + iter = kh_put(i, state->grpNumHash, key, &ret); + kh_value(state->grpNumHash, iter) = state->grpCount; + } + + return kh_value(state->grpNumHash, iter); +} + + +inline int getOpticalNum(splitLine_t * line, state_t * state) __attribute__((always_inline)); +inline int getOpticalNum(splitLine_t * line, state_t * state) +{ + char * qname = line->fields[QNAME]; + + //split read name on ':' + char * splits[10]; + char * fields[10] = {qname}; + int numFields = 1; //split field count + for (int i = 0; qname[i] != 0; ++i) + { + if(qname[i] == ':') + { + qname[i] = 0; //terminate previous field with null + splits[numFields-1] = qname + i; //end of this field + fields[numFields++] = qname + i + 1; //start next field + if (numFields > 8) break; //unrecognized format + } + } + + //identify format + UINT64 cell; + UINT64 tile; + UINT64 rg = getRgNum(line, state); + if (numFields == 5) //ILLUMINA + { + cell = strtol(fields[1], NULL, 0) + 1; //forces greater than 0 value + tile = strtol(fields[2], NULL, 0) + 1; //forces greater than 0 value + } + else if (numFields == 7 || numFields == 8) //#ILLUMINA 1.8+ + { + cell = strtol(fields[3], NULL, 0) + 1; //forces greater than 0 value + tile = strtol(fields[4], NULL, 0) + 1; //forces greater than 0 value + } + else { //unknown + cell = 0; + tile = 0; + } + + //restore ':' at splits + for (int i=0; i < numFields-1; ++i) + { + splits[i][0] = ':'; + } + + //check values + if(cell > CELL_MASK || tile > TILE_MASK) + { + char * temp; + asprintf(&temp, "samblaster: Flowcell lane or tile number are too high to be real for readid '%s'\n", qname); + fatalError(temp); + } + + //merge values + UINT64 final = (tile << TILE_SHIFT) | (cell << CELL_SHIFT) | rg; + + //get hash for value + //fprintf(stderr, "%li\t%li\t%li\t%li\t%i\n", rg, cell, tile, final, getGrpNum(final, state)); //temp + + return getGrpNum(final, state); +} + +inline int getExAmpNum(splitLine_t * line, state_t * state) __attribute__((always_inline)); +inline int getExAmpNum(splitLine_t * line, state_t * state) +{ + char * qname = line->fields[QNAME]; + + //split read name on ':' + char * splits[10]; + char * fields[10] = {qname}; + int numFields = 1; //split field count + for (int i = 0; qname[i] != 0; ++i) + { + if(qname[i] == ':') + { + qname[i] = 0; //terminate previous field with null + splits[numFields-1] = qname + i; //end of this field + fields[numFields++] = qname + i + 1; //start next field + if (numFields > 8) break; //unrecognized format + } + } + + //identify format + UINT64 cell; + UINT64 rg = getRgNum(line, state); + if (numFields == 5) //ILLUMINA + { + cell = strtol(fields[1], NULL, 0) + 1; //forces greater than 0 value + } + else if (numFields == 7 || numFields == 8) //#ILLUMINA 1.8+ + { + cell = strtol(fields[3], NULL, 0) + 1; //forces greater than 0 value + } + else { //unknown + cell = 0; + } + + //restore ':' at splits + for (int i=0; i < numFields-1; ++i) + { + splits[i][0] = ':'; + } + + //check values + if(cell > CELL_MASK) + { + char * temp; + asprintf(&temp, "samblaster: Flowcell lane number is are too high to be real for readid '%s'\n", qname); + fatalError(temp); + } + + //merge values + UINT64 final = (cell << CELL_SHIFT) | rg; + + return getGrpNum(final, state); +} + /////////////////////////////////////////////////////////////////////////////// // Helpers to process CIGAR strings /////////////////////////////////////////////////////////////////////////////// @@ -825,14 +1084,6 @@ inline int getEndDiag(splitLine_t * line) /////////////////////////////////////////////////////////////////////////////// // Process SAM Blocks /////////////////////////////////////////////////////////////////////////////// -// This was historically 27 bits as that was large enough to hold any human chrom 1 offset. -// Now that we are using a synthetic genome representation, 27 has no special meaning. -// There are interesting space/time trade-offs between how large we make the -// synthetic chroms and how large the various hash tables become. -// If/when we handle RGs and/or UMIs, the signature scheme will change anyway. -// Therefore, leave at 27 for now to match space/time tradeoffs of earlier releases. -#define BIN_SHIFT ((UINT64)27) //bin window is 27 bits wide -#define BIN_MASK ((UINT64)((1 << BIN_SHIFT)-1)) //bin window is 27 bits wide // This is apparently no longer called. void outputSAMBlock(splitLine_t * block, FILE * output) @@ -961,7 +1212,21 @@ void prepareSigValues(splitLine_t * line, state_t * state, bool orphan) } // Now get the sequence number - line->seqNum = getSeqNum(line, RNAME, state); + line->seqNum = getSeqNum(line, state); + + // Now get group number + if(state->optPlusExAmp) + { + line->grpNum = getExAmpNum(line, state); + } + else if(state->opticalOnly) + { + line->grpNum = getOpticalNum(line, state); + } + else + { + line->grpNum = getRgNum(line, state); + } // Calculate the genome relative position // We need to do the read pos padding to handle negative calculated sequence position for the alignment. @@ -1347,6 +1612,8 @@ void printPGsamLine(FILE * f, state_t * s) else if (s->excludeDups && (s->discordantFile != NULL || s->splitterFile != NULL || s->unmappedClippedFile != NULL)) fprintf(f, " --excludeDups"); if (s->addMateTags) fprintf(f, " --addMateTags"); if (s->ignoreUnmated) fprintf(f, " --ignoreUnmated"); + if (s->optPlusExAmp) fprintf(f, " --optPlusExAmp"); + if (s->opticalOnly) fprintf(f, " --opticalOnly"); if (s->maxReadLength != 500) fprintf(f, " --maxReadLength %d", s->maxReadLength); if (s->discordantFile != NULL) fprintf(f, " -d %s", s->discordantFileName); if (s->splitterFile != NULL) @@ -1394,6 +1661,8 @@ void printUsageString() "-e --excludeDups Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file.\n" "-r --removeDups Remove duplicates reads from all output files. (Implies --excludeDups).\n" " --addMateTags Add MC and MQ tags to all output paired-end SAM lines.\n" + " --opticalOnly Only mark reads on the same tile as duplicates (i.e. optical duplicates)\n" + " --optPlusExAmp Only mark reads on the same lane as duplicates (i.e. optical and ExAmp duplicates)\n" " --ignoreUnmated Suppress abort on unmated alignments. Use only when sure input is read-id grouped,\n" " and either paired-end alignments have been filtered or the input file contains singleton reads.\n" "-M Run in compatibility mode; both 0x100 and 0x800 are considered chimeric. Similar to BWA MEM -M option.\n" @@ -1653,6 +1922,15 @@ int main (int argc, char *argv[]) { state->ignoreUnmated = true; } + else if (streq(argv[argi],"--opticalOnly")) + { + state->opticalOnly = true; + } + else if (streq(argv[argi],"--optPlusExAmp")) + { + state->optPlusExAmp = true; + state->opticalOnly = false; + } else if (streq(argv[argi],"-M")) { state->compatMode = true; @@ -1772,13 +2050,11 @@ int main (int argc, char *argv[]) if (state->unmappedClippedFile == NULL) fsError(state->unmappedClippedFileName); } - // Read in the SAM header and create the seqs structure. - state->seqLens = (UINT32*)calloc(1, sizeof(UINT32)); //initialize to 0 - state->seqOffs = (UINT64*)calloc(1, sizeof(UINT64)); //initialize to 0 - state->seqs[strdup("*")] = 0; - state->seqLens[0] = padLength(0, state); + // Read in the SAM header and create the RG and seqs structures. + state->seqs[strdup("*")] = 0; //only unmapped to start state->seqOffs[0] = 0; - int count = 1; + int scount = 1; //0 is reserved for unknown or '*' + int rcount = 1; //0 is reserved for no read group UINT64 totalLen = 0; splitLine_t * line; // Read the first line to prime the loop, and also to allow checking for malformed input. @@ -1798,8 +2074,8 @@ int main (int argc, char *argv[]) if (strncmp(line->fields[i], "SN:", 3) == 0) { seqID = line->fields[i]+3; - seqNum = count; - count += 1; + seqNum = scount; + scount += 1; } else if (strncmp(line->fields[i], "LN:", 3) == 0) { @@ -1812,18 +2088,38 @@ int main (int argc, char *argv[]) // Unless we are marking dups, we don't need to use sequence numbers. if (!state->acceptDups) { - // grow seqLens and seqOffs arrays + // grow seqOffs arrays if(seqNum % 32768 == 1) { - state->seqLens = (UINT32*)realloc(state->seqLens, (seqNum+32768)*sizeof(UINT32)); state->seqOffs = (UINT64*)realloc(state->seqOffs, (seqNum+32768)*sizeof(UINT64)); } state->seqs[strdup(seqID)] = seqNum; - state->seqLens[seqNum] = seqLen; state->seqOffs[seqNum] = seqOff; } } + // Process input line to see if it defines a sequence. + else if (streq(line->fields[0], "@RG")) + { + char * rgID = NULL; + int rgNum = 0; + for (int i=1; inumFields; ++i) + { + if (strncmp(line->fields[i], "ID:", 3) == 0) + { + rgID = line->fields[i]+3; + rgNum = rcount; + rcount += 1; + } + } + + // Unless we are marking dups, we don't need to use read groups. + if (!state->acceptDups) + { + state->rgs[strdup(rgID)] = rgNum; + } + } + // Output the header line. writeLine(line, state->outputFile); if (state->discordantFile != NULL) @@ -1842,13 +2138,13 @@ int main (int argc, char *argv[]) printPGsamLine(state->splitterFile, state); // Make sure we have a header. - if (count == 1 && !state->acceptDups) + if (scount == 1 && !state->acceptDups) { fatalError("samblaster: Missing header on input sam file. Exiting.\n"); } // Don't count the "*" entry. - fprintf(stderr, "samblaster: Loaded %d header sequence entries.\n", count-1); + fprintf(stderr, "samblaster: Loaded %d header sequence entries.\n", scount-1); // Make sure we have any more lines to process. if (line == NULL) @@ -1864,7 +2160,7 @@ int main (int argc, char *argv[]) int binCount = (totalLen >> BIN_SHIFT); if (binCount >= (1 << 15)) { - fatalError("samblaster: Too many sequences in header of input sam file. Exiting.\n"); + fatalError("samblaster: total length of sequences found in sam file is too high. Exiting.\n"); } state->binCount = binCount; @@ -1878,7 +2174,6 @@ int main (int argc, char *argv[]) // line already has the first such line in it unless the file only has a header. // Keep a ptr to the end of the current list. splitLine_t * last = line; - count = 1; while (true) { splitLine_t * nextLine = readLine(state->inputFile); @@ -1887,7 +2182,6 @@ int main (int argc, char *argv[]) { processSAMBlock(line, state); last = line = nextLine; - count = 1; } else { @@ -1895,7 +2189,6 @@ int main (int argc, char *argv[]) // we need to be a little careful about how we make the list. last->next = nextLine; last = nextLine; - count += 1; } } // We need to process the final set.