From cd334cfc555eb8138572fcdf257aba4d748027c4 Mon Sep 17 00:00:00 2001
From: carsonhh <carsonhh@gmail.com>
Date: Wed, 11 Aug 2021 16:41:42 -0600
Subject: [PATCH] add support for read group and optitical duplicate only
 filtering

---
 khash.h        | 627 +++++++++++++++++++++++++++++++++++++++++++++++++
 samblaster.cpp | 389 ++++++++++++++++++++++++++----
 2 files changed, 968 insertions(+), 48 deletions(-)
 create mode 100644 khash.h

diff --git a/khash.h b/khash.h
new file mode 100644
index 0000000..f75f347
--- /dev/null
+++ b/khash.h
@@ -0,0 +1,627 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2009, 2011 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/*
+  An example:
+
+#include "khash.h"
+KHASH_MAP_INIT_INT(32, char)
+int main() {
+	int ret, is_missing;
+	khiter_t k;
+	khash_t(32) *h = kh_init(32);
+	k = kh_put(32, h, 5, &ret);
+	kh_value(h, k) = 10;
+	k = kh_get(32, h, 10);
+	is_missing = (k == kh_end(h));
+	k = kh_get(32, h, 5);
+	kh_del(32, h, k);
+	for (k = kh_begin(h); k != kh_end(h); ++k)
+		if (kh_exist(h, k)) kh_value(h, k) = 1;
+	kh_destroy(32, h);
+	return 0;
+}
+*/
+
+/*
+  2013-05-02 (0.2.8):
+
+	* Use quadratic probing. When the capacity is power of 2, stepping function
+	  i*(i+1)/2 guarantees to traverse each bucket. It is better than double
+	  hashing on cache performance and is more robust than linear probing.
+
+	  In theory, double hashing should be more robust than quadratic probing.
+	  However, my implementation is probably not for large hash tables, because
+	  the second hash function is closely tied to the first hash function,
+	  which reduce the effectiveness of double hashing.
+
+	Reference: http://research.cs.vt.edu/AVresearch/hashing/quadratic.php
+
+  2011-12-29 (0.2.7):
+
+    * Minor code clean up; no actual effect.
+
+  2011-09-16 (0.2.6):
+
+	* The capacity is a power of 2. This seems to dramatically improve the
+	  speed for simple keys. Thank Zilong Tan for the suggestion. Reference:
+
+	   - http://code.google.com/p/ulib/
+	   - http://nothings.org/computer/judy/
+
+	* Allow to optionally use linear probing which usually has better
+	  performance for random input. Double hashing is still the default as it
+	  is more robust to certain non-random input.
+
+	* Added Wang's integer hash function (not used by default). This hash
+	  function is more robust to certain non-random input.
+
+  2011-02-14 (0.2.5):
+
+    * Allow to declare global functions.
+
+  2009-09-26 (0.2.4):
+
+    * Improve portability
+
+  2008-09-19 (0.2.3):
+
+	* Corrected the example
+	* Improved interfaces
+
+  2008-09-11 (0.2.2):
+
+	* Improved speed a little in kh_put()
+
+  2008-09-10 (0.2.1):
+
+	* Added kh_clear()
+	* Fixed a compiling error
+
+  2008-09-02 (0.2.0):
+
+	* Changed to token concatenation which increases flexibility.
+
+  2008-08-31 (0.1.2):
+
+	* Fixed a bug in kh_get(), which has not been tested previously.
+
+  2008-08-31 (0.1.1):
+
+	* Added destructor
+*/
+
+
+#ifndef __AC_KHASH_H
+#define __AC_KHASH_H
+
+/*!
+  @header
+
+  Generic hash table library.
+ */
+
+#define AC_VERSION_KHASH_H "0.2.8"
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+/* compiler specific configuration */
+
+#if UINT_MAX == 0xffffffffu
+typedef unsigned int khint32_t;
+#elif ULONG_MAX == 0xffffffffu
+typedef unsigned long khint32_t;
+#endif
+
+#if ULONG_MAX == ULLONG_MAX
+typedef unsigned long khint64_t;
+#else
+typedef unsigned long long khint64_t;
+#endif
+
+#ifndef kh_inline
+#ifdef _MSC_VER
+#define kh_inline __inline
+#else
+#define kh_inline inline
+#endif
+#endif /* kh_inline */
+
+#ifndef klib_unused
+#if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3)
+#define klib_unused __attribute__ ((__unused__))
+#else
+#define klib_unused
+#endif
+#endif /* klib_unused */
+
+typedef khint32_t khint_t;
+typedef khint_t khiter_t;
+
+#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
+#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
+#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
+#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
+#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
+#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
+#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
+
+#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4)
+
+#ifndef kroundup32
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+
+#ifndef kcalloc
+#define kcalloc(N,Z) calloc(N,Z)
+#endif
+#ifndef kmalloc
+#define kmalloc(Z) malloc(Z)
+#endif
+#ifndef krealloc
+#define krealloc(P,Z) realloc(P,Z)
+#endif
+#ifndef kfree
+#define kfree(P) free(P)
+#endif
+
+static const double __ac_HASH_UPPER = 0.77;
+
+#define __KHASH_TYPE(name, khkey_t, khval_t) \
+	typedef struct kh_##name##_s { \
+		khint_t n_buckets, size, n_occupied, upper_bound; \
+		khint32_t *flags; \
+		khkey_t *keys; \
+		khval_t *vals; \
+	} kh_##name##_t;
+
+#define __KHASH_PROTOTYPES(name, khkey_t, khval_t)	 					\
+	extern kh_##name##_t *kh_init_##name(void);							\
+	extern void kh_destroy_##name(kh_##name##_t *h);					\
+	extern void kh_clear_##name(kh_##name##_t *h);						\
+	extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); 	\
+	extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \
+	extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \
+	extern void kh_del_##name(kh_##name##_t *h, khint_t x);
+
+#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	SCOPE kh_##name##_t *kh_init_##name(void) {							\
+		return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t));		\
+	}																	\
+	SCOPE void kh_destroy_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h) {														\
+			kfree((void *)h->keys); kfree(h->flags);					\
+			kfree((void *)h->vals);										\
+			kfree(h);													\
+		}																\
+	}																	\
+	SCOPE void kh_clear_##name(kh_##name##_t *h)						\
+	{																	\
+		if (h && h->flags) {											\
+			memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \
+			h->size = h->n_occupied = 0;								\
+		}																\
+	}																	\
+	SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) 	\
+	{																	\
+		if (h->n_buckets) {												\
+			khint_t k, i, last, mask, step = 0; \
+			mask = h->n_buckets - 1;									\
+			k = __hash_func(key); i = k & mask;							\
+			last = i; \
+			while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+				i = (i + (++step)) & mask; \
+				if (i == last) return h->n_buckets;						\
+			}															\
+			return __ac_iseither(h->flags, i)? h->n_buckets : i;		\
+		} else return 0;												\
+	}																	\
+	SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
+	{ /* This function uses 0.25*n_buckets bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \
+		khint32_t *new_flags = 0;										\
+		khint_t j = 1;													\
+		{																\
+			kroundup32(new_n_buckets); 									\
+			if (new_n_buckets < 4) new_n_buckets = 4;					\
+			if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0;	/* requested size is too small */ \
+			else { /* hash table size to be changed (shrink or expand); rehash */ \
+				new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t));	\
+				if (!new_flags) return -1;								\
+				memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \
+				if (h->n_buckets < new_n_buckets) {	/* expand */		\
+					khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+					if (!new_keys) { kfree(new_flags); return -1; }		\
+					h->keys = new_keys;									\
+					if (kh_is_map) {									\
+						khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+						if (!new_vals) { kfree(new_flags); return -1; }	\
+						h->vals = new_vals;								\
+					}													\
+				} /* otherwise shrink */								\
+			}															\
+		}																\
+		if (j) { /* rehashing is needed */								\
+			for (j = 0; j != h->n_buckets; ++j) {						\
+				if (__ac_iseither(h->flags, j) == 0) {					\
+					khkey_t key = h->keys[j];							\
+					khval_t val;										\
+					khint_t new_mask;									\
+					new_mask = new_n_buckets - 1; 						\
+					if (kh_is_map) val = h->vals[j];					\
+					__ac_set_isdel_true(h->flags, j);					\
+					while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \
+						khint_t k, i, step = 0; \
+						k = __hash_func(key);							\
+						i = k & new_mask;								\
+						while (!__ac_isempty(new_flags, i)) i = (i + (++step)) & new_mask; \
+						__ac_set_isempty_false(new_flags, i);			\
+						if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \
+							{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
+							if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
+							__ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \
+						} else { /* write the element and jump out of the loop */ \
+							h->keys[i] = key;							\
+							if (kh_is_map) h->vals[i] = val;			\
+							break;										\
+						}												\
+					}													\
+				}														\
+			}															\
+			if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \
+				h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \
+				if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \
+			}															\
+			kfree(h->flags); /* free the working space */				\
+			h->flags = new_flags;										\
+			h->n_buckets = new_n_buckets;								\
+			h->n_occupied = h->size;									\
+			h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
+		}																\
+		return 0;														\
+	}																	\
+	SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
+	{																	\
+		khint_t x;														\
+		if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \
+			if (h->n_buckets > (h->size<<1)) {							\
+				if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \
+					*ret = -1; return h->n_buckets;						\
+				}														\
+			} else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \
+				*ret = -1; return h->n_buckets;							\
+			}															\
+		} /* TODO: to implement automatically shrinking; resize() already support shrinking */ \
+		{																\
+			khint_t k, i, site, last, mask = h->n_buckets - 1, step = 0; \
+			x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \
+			if (__ac_isempty(h->flags, i)) x = i; /* for speed up */	\
+			else {														\
+				last = i; \
+				while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
+					if (__ac_isdel(h->flags, i)) site = i;				\
+					i = (i + (++step)) & mask; \
+					if (i == last) { x = site; break; }					\
+				}														\
+				if (x == h->n_buckets) {								\
+					if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
+					else x = i;											\
+				}														\
+			}															\
+		}																\
+		if (__ac_isempty(h->flags, x)) { /* not present at all */		\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size; ++h->n_occupied;									\
+			*ret = 1;													\
+		} else if (__ac_isdel(h->flags, x)) { /* deleted */				\
+			h->keys[x] = key;											\
+			__ac_set_isboth_false(h->flags, x);							\
+			++h->size;													\
+			*ret = 2;													\
+		} else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \
+		return x;														\
+	}																	\
+	SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x)				\
+	{																	\
+		if (x != h->n_buckets && !__ac_iseither(h->flags, x)) {			\
+			__ac_set_isdel_true(h->flags, x);							\
+			--h->size;													\
+		}																\
+	}
+
+#define KHASH_DECLARE(name, khkey_t, khval_t)		 					\
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_PROTOTYPES(name, khkey_t, khval_t)
+
+#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	__KHASH_TYPE(name, khkey_t, khval_t) 								\
+	__KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
+	KHASH_INIT2(name, static kh_inline klib_unused, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal)
+
+/* --- BEGIN OF HASH FUNCTIONS --- */
+
+/*! @function
+  @abstract     Integer hash function
+  @param  key   The integer [khint32_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int_hash_func(key) (khint32_t)(key)
+/*! @function
+  @abstract     Integer comparison function
+ */
+#define kh_int_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     64-bit integer hash function
+  @param  key   The integer [khint64_t]
+  @return       The hash value [khint_t]
+ */
+#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11)
+/*! @function
+  @abstract     64-bit integer comparison function
+ */
+#define kh_int64_hash_equal(a, b) ((a) == (b))
+/*! @function
+  @abstract     const char* hash function
+  @param  s     Pointer to a null terminated string
+  @return       The hash value
+ */
+static kh_inline khint_t __ac_X31_hash_string(const char *s)
+{
+	khint_t h = (khint_t)*s;
+	if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s;
+	return h;
+}
+/*! @function
+  @abstract     Another interface to const char* hash function
+  @param  key   Pointer to a null terminated string [const char*]
+  @return       The hash value [khint_t]
+ */
+#define kh_str_hash_func(key) __ac_X31_hash_string(key)
+/*! @function
+  @abstract     Const char* comparison function
+ */
+#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
+
+static kh_inline khint_t __ac_Wang_hash(khint_t key)
+{
+    key += ~(key << 15);
+    key ^=  (key >> 10);
+    key +=  (key << 3);
+    key ^=  (key >> 6);
+    key += ~(key << 11);
+    key ^=  (key >> 16);
+    return key;
+}
+#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)key)
+
+/* --- END OF HASH FUNCTIONS --- */
+
+/* Other convenient macros... */
+
+/*!
+  @abstract Type of the hash table.
+  @param  name  Name of the hash table [symbol]
+ */
+#define khash_t(name) kh_##name##_t
+
+/*! @function
+  @abstract     Initiate a hash table.
+  @param  name  Name of the hash table [symbol]
+  @return       Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_init(name) kh_init_##name()
+
+/*! @function
+  @abstract     Destroy a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_destroy(name, h) kh_destroy_##name(h)
+
+/*! @function
+  @abstract     Reset a hash table without deallocating memory.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+ */
+#define kh_clear(name, h) kh_clear_##name(h)
+
+/*! @function
+  @abstract     Resize a hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  s     New size [khint_t]
+ */
+#define kh_resize(name, h, s) kh_resize_##name(h, s)
+
+/*! @function
+  @abstract     Insert a key to the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @param  r     Extra return code: -1 if the operation failed;
+                0 if the key is present in the hash table;
+                1 if the bucket is empty (never used); 2 if the element in
+				the bucket has been deleted [int*]
+  @return       Iterator to the inserted element [khint_t]
+ */
+#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
+
+/*! @function
+  @abstract     Retrieve a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Key [type of keys]
+  @return       Iterator to the found element, or kh_end(h) if the element is absent [khint_t]
+ */
+#define kh_get(name, h, k) kh_get_##name(h, k)
+
+/*! @function
+  @abstract     Remove a key from the hash table.
+  @param  name  Name of the hash table [symbol]
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  k     Iterator to the element to be deleted [khint_t]
+ */
+#define kh_del(name, h, k) kh_del_##name(h, k)
+
+/*! @function
+  @abstract     Test whether a bucket contains data.
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       1 if containing data; 0 otherwise [int]
+ */
+#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
+
+/*! @function
+  @abstract     Get key given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Key [type of keys]
+ */
+#define kh_key(h, x) ((h)->keys[x])
+
+/*! @function
+  @abstract     Get value given an iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  x     Iterator to the bucket [khint_t]
+  @return       Value [type of values]
+  @discussion   For hash sets, calling this results in segfault.
+ */
+#define kh_val(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Alias of kh_val()
+ */
+#define kh_value(h, x) ((h)->vals[x])
+
+/*! @function
+  @abstract     Get the start iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The start iterator [khint_t]
+ */
+#define kh_begin(h) (khint_t)(0)
+
+/*! @function
+  @abstract     Get the end iterator
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       The end iterator [khint_t]
+ */
+#define kh_end(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Get the number of elements in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of elements in the hash table [khint_t]
+ */
+#define kh_size(h) ((h)->size)
+
+/*! @function
+  @abstract     Get the number of buckets in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @return       Number of buckets in the hash table [khint_t]
+ */
+#define kh_n_buckets(h) ((h)->n_buckets)
+
+/*! @function
+  @abstract     Iterate over the entries in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  kvar  Variable to which key will be assigned
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach(h, kvar, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(kvar) = kh_key(h,__i);								\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/*! @function
+  @abstract     Iterate over the values in the hash table
+  @param  h     Pointer to the hash table [khash_t(name)*]
+  @param  vvar  Variable to which value will be assigned
+  @param  code  Block of code to execute
+ */
+#define kh_foreach_value(h, vvar, code) { khint_t __i;		\
+	for (__i = kh_begin(h); __i != kh_end(h); ++__i) {		\
+		if (!kh_exist(h,__i)) continue;						\
+		(vvar) = kh_val(h,__i);								\
+		code;												\
+	} }
+
+/* More convenient interfaces */
+
+/*! @function
+  @abstract     Instantiate a hash set containing integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT(name)										\
+	KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT(name, khval_t)								\
+	KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash set containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_INT64(name)										\
+	KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing 64-bit integer keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_INT64(name, khval_t)								\
+	KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
+
+typedef const char *kh_cstr_t;
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+ */
+#define KHASH_SET_INIT_STR(name)										\
+	KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
+
+/*! @function
+  @abstract     Instantiate a hash map containing const char* keys
+  @param  name  Name of the hash table [symbol]
+  @param  khval_t  Type of values [type]
+ */
+#define KHASH_MAP_INIT_STR(name, khval_t)								\
+	KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
+
+#endif /* __AC_KHASH_H */
diff --git a/samblaster.cpp b/samblaster.cpp
index 211663b..f4ba976 100644
--- a/samblaster.cpp
+++ b/samblaster.cpp
@@ -32,6 +32,7 @@
 #include <errno.h>
 #include <map>
 #include "sbhash.h"
+#include "khash.h"
 
 // Rename common integer types.
 // I like having these shorter name.
@@ -39,6 +40,9 @@ typedef uint64_t UINT64;
 typedef uint32_t UINT32;
 typedef int32_t   INT32;
 
+// intialize khash
+KHASH_MAP_INIT_INT64(i, UINT32)
+
 // Some helper routines.
 
 // mempcpy is a GNU extension and not available everywhere.
@@ -147,6 +151,7 @@ struct splitLine
     // It this were a class, these would be in a subclass.
     int   flag;
     pos_t pos;
+    int   grpNum;
     int   seqNum;
     pos_t binPos;
     int   binNum;
@@ -233,27 +238,33 @@ splitLine_t * getSplitLine()
 // Split the line into fields.
 void splitSplitLine(splitLine_t * line, int maxSplits)
 {
-    line->numFields = 0;
-    int fieldStart = 0;
-    // replace the newline with a tab so that it works like the rest of the fields.
-    line->buffer[line->bufLen-1] = '\t';
-    for (int i=0; i<line->bufLen; ++i)
+    //grow fields array as needed
+    if (maxSplits > line->maxFields){
+        line->maxFields = maxSplits;
+        line->fields = (char **)realloc(line->fields, line->maxFields * sizeof(char *));
+    }
+
+    //first field 
+    line->fields[0] = line->buffer;
+    line->numFields = 1;
+
+    //remaining fields
+    for (int i=0; i < line->bufLen-1; ++i)
     {
         if (line->buffer[i] == '\t')
         {
-            line->fields[line->numFields] = line->buffer + fieldStart;
-            line->numFields += 1;
+            line->buffer[i] = 0; //terminate previous field with null
+            line->fields[line->numFields++] = line->buffer + i + 1; //start next field
             if (line->numFields == maxSplits) break;
-            line->buffer[i] = 0;
-            // Get ready for the next iteration.
-            fieldStart = i+1;
         }
     }
-    // replace the tab at the end of the line with a null char to terminate the final string.
+
+    // replace the newline with a null char so that it works like the rest of the fields.
     line->buffer[line->bufLen-1] = 0;
     line->split = true;
 }
 
+
 // Unsplit the fields back into a single string.
 // This will mung the strings, so only call this when all processing on the line is done.
 void unsplitSplitLine(splitLine_t * line)
@@ -556,6 +567,14 @@ struct less_str
    }
 };
 
+// This stores the map between RG IDs and RG numbers.
+typedef std::map<const char *, int, less_str> rgMap_t;
+
+inline void addRG(rgMap_t * rgs, char * item, int val)
+{
+    (*rgs)[item] = val;
+}
+
 // This stores the map between sequence names and sequence numbers.
 typedef std::map<const char *, int, less_str> seqMap_t;
 
@@ -580,9 +599,11 @@ struct state_struct
     FILE *         unmappedClippedFile;
     char *         unmappedClippedFileName;
     sigSet_t *     sigs;
+    rgMap_t        rgs;
     seqMap_t       seqs;
-    UINT32 *       seqLens;
     UINT64 *       seqOffs;
+    khash_t(i) *   grpNumHash;
+    UINT32         grpCount;
     splitLine_t ** splitterArray;
     int            splitterArrayMaxSize;
     UINT32         sigArraySize;
@@ -600,6 +621,8 @@ struct state_struct
     bool           addMateTags;
     bool           compatMode;
     bool           ignoreUnmated;
+    bool           opticalOnly;
+    bool           optPlusExAmp;
     bool           quiet;
 };
 typedef struct state_struct state_t;
@@ -618,6 +641,8 @@ state_t * makeState ()
     s->unmappedClippedFile = NULL;
     s->unmappedClippedFileName = (char *)"";
     s->sigs = NULL;
+    s->grpNumHash = kh_init(i);
+    s->grpCount = 0;
     s->minNonOverlap = 20;
     s->maxSplitCount = 2;
     s->minIndelSize = 50;
@@ -630,6 +655,8 @@ state_t * makeState ()
     s->addMateTags = false;
     s->compatMode = false;
     s->ignoreUnmated = false;
+    s->opticalOnly = false;
+    s->optPlusExAmp = false;
     s->quiet = false;
     // Start this as -1 to indicate we don't know yet.
     // Once we are outputting our first line, we will decide.
@@ -637,6 +664,9 @@ state_t * makeState ()
     // Used as a temporary location for ptrs to splitter for sort routine.
     s->splitterArrayMaxSize = 1000;
     s->splitterArray = (splitLine_t **)(malloc(s->splitterArrayMaxSize * sizeof(splitLine_t *)));
+    // initialze seqence info
+    s->seqOffs = (UINT64*)calloc(1, sizeof(UINT64));
+
     return s;
 }
 
@@ -649,12 +679,16 @@ void deleteState(state_t * s)
         for (UINT32 i=0; i<s->sigArraySize; i++) deleteHashTable(&(s->sigs[i]));
         free (s->sigs);
     }
+    for (seqMap_t::iterator iter = s->rgs.begin(); iter != s->rgs.end(); ++iter)
+    {
+        free((char *)(iter->first));
+    }
     for (seqMap_t::iterator iter = s->seqs.begin(); iter != s->seqs.end(); ++iter)
     {
         free((char *)(iter->first));
     }
-    if (s->seqLens != NULL) free(s->seqLens);
     if (s->seqOffs != NULL) free(s->seqOffs);
+    kh_destroy(i, s->grpNumHash); 
     delete s;
 }
 
@@ -668,21 +702,62 @@ void deleteState(state_t * s)
 //  without performance degradation on more standard genomes.
 // Thanks to https://github.com/carsonhh for the suggestion.
 /////////////////////////////////////////////////////////////////////////////
+
+// largest known genome is 150GB. 22 bits only gets us to 136GB with synthetic
+// super contigs and an int type array index. That's close enough. This leaves us
+// 20 bits in the signature that can be used to integrate RG, UMI, and tile info.
+#define BIN_SIZE ((UINT64)22)                      //bin window is 22 bits wide
+#define BIN_MASK ((UINT64)((1UL << BIN_SIZE)-1))   //bin window is 22 bits wide
+#define BIN_SHIFT ((UINT64)BIN_SIZE)               //bin window is 22 bits wide
+
+#define G_SIZE ((UINT64)(64-2*BIN_SIZE))           //group window is 20 bits wide
+#define G_MASK ((UINT64)((1UL << RG_SIZE)-1))      //group window is 20 bits wide
+#define G_SHIFT ((UINT64)(2*BIN_SIZE))             //group window is 20 bits wide
+
+
+
+#define RG_SIZE ((UINT64)20)                       //RG window is 20 bits wide
+#define RG_MASK ((UINT64)((1UL << RG_SIZE)-1))      //RG window is 20 bits wide
+#define RG_SHIFT ((UINT64)0)                       //RG window is 20 bits wide
+
+
+
+#define CELL_SIZE ((UINT64)(32-RG_SIZE))           //flowcell window is 12 bits wide
+#define CELL_MASK ((UINT64)((1UL << CELL_SIZE)-1)) //flowcell window is 12 bits wide
+#define CELL_SHIFT ((UINT64)(RG_SHIFT + RG_SIZE))  //flowcell window is 12 bits wide
+
+#define TILE_SIZE ((UINT64)(64-CELL_SIZE-RG_SIZE))    //tile window is 32 bits wide
+#define TILE_MASK ((UINT64)((1UL << TILE_SIZE)-1))    //tile window is 32 bits wide
+#define TILE_SHIFT ((UINT64)(CELL_SHIFT + CELL_SIZE)) //tile window is 32 bits wide
+
+#define UMI_SIZE ((UINT64)(64-RG_SIZE))               //UMI window is 44 bits wide
+#define UMI_MASK ((UINT64)((1UL << UMI_SIZE)-1))      //UMI window is 44 bits wide
+#define UMI_SHIFT ((UINT64)(RG_SHIFT + RG_SIZE))      //UMI window is 44 bits wide
+
+
+
 template <bool orphan>
 inline sgn_t calcSig(splitLine_t * first, splitLine_t * second)
 {
     UINT64 final;
+
+    // Copy required because grpNum is only 32 bit and it must be promoted
+    // BEFORE the bitshift operation. otherwise digits are lost
+    UINT64 rg1 = second->grpNum; //promotion to 64 bit via copy
+    UINT64 rg2 = (rg1 << G_SHIFT);
+
     if (orphan)
     {
         // For an orphan, we only use information fron the second read.
-        final = second->binPos;
+        final = rg2 | second->binPos;
     }
     else
     {
-        // Total nonsense to get the compiler to actually work.
-        UINT64 t1 = first->binPos;
-        UINT64 t2 = t1 << 32;
-        final = t2 | second->binPos;
+        // Copy required  because binPos is only 32 bit and it must be promoted
+        // BEFORE the bitshift operation. otherwise digits are lost
+        UINT64 t1 = first->binPos;  //promotion to 64 bit via copy
+        UINT64 t2 = (t1 << BIN_SHIFT);
+        final = rg2 | t2 | second->binPos;
     }
     return (sgn_t)final;
 }
@@ -709,19 +784,203 @@ inline UINT32 calcSigArrOff(splitLine_t * first, splitLine_t * second, int binCo
 // Sequences
 ///////////////////////////////////////////////////////////////////////////////
 
-inline int getSeqNum(splitLine_t * line, int field, state_t * state) __attribute__((always_inline));
-inline int getSeqNum(splitLine_t * line, int field, state_t * state)
+inline int getSeqNum(splitLine_t * line, state_t * state) __attribute__((always_inline));
+inline int getSeqNum(splitLine_t * line, state_t * state)
 {
-    seqMap_t::iterator findret = state->seqs.find(line->fields[field]);
+    seqMap_t::iterator findret = state->seqs.find(line->fields[RNAME]);
     if (findret == state->seqs.end())
     {
         char * temp;
-        asprintf(&temp, "samblaster: Unable to find sequence '%s' in sequence map for readid %s\n", line->fields[field], line->fields[QNAME]);
+        asprintf(&temp, "samblaster: Unable to find sequence '%s' in sequence map for readid %s\n", line->fields[RNAME], line->fields[QNAME]);
         fatalError(temp);
     }
     return findret->second;
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// Read Groups
+///////////////////////////////////////////////////////////////////////////////
+
+inline int getRgNum(splitLine_t * line, state_t * state) __attribute__((always_inline));
+inline int getRgNum(splitLine_t * line, state_t * state)
+{
+    //identify RG among tags
+    char * rg = strstr(line->fields[TAGS], "RG:Z:");
+    if(rg == NULL) return 0; //not found
+    rg += 5; //move to start of value
+
+    //process as cstring
+    seqMap_t::iterator findret;
+    char * end = strchr(rg, '\t');
+    if(end == NULL)
+    {
+        findret = state->rgs.find(rg);
+    }
+    else //make tab a temporary NULL for cstring
+    {
+        end[0] = 0;
+        findret = state->rgs.find(rg);
+        end[0] = '\t';
+    }
+
+    if (findret == state->rgs.end()) //not found
+    {
+        if(end != NULL) end[0] = 0; //change back to cstring for error
+        char * temp;
+        asprintf(&temp, "samblaster: Unable to find read group '%s' in RG map for readid %s\n", rg, line->fields[QNAME]);
+        fatalError(temp);
+    }
+    return findret->second;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Optical + Read Groups
+///////////////////////////////////////////////////////////////////////////////
+
+inline UINT32 getGrpNum(UINT64 key, state_t * state) __attribute__((always_inline));
+inline UINT32 getGrpNum(UINT64 key, state_t * state)
+{
+    khint_t iter;
+    iter = kh_get(i, state->grpNumHash, key);
+
+    //does not exist, so add it
+    if(iter == kh_end(state->grpNumHash)){
+        state->grpCount += 1;
+
+        //check values
+        if(state->grpCount > RG_MASK)
+        {
+            char * temp;
+            asprintf(&temp, "samblaster: Too many ReadGroup/Tile/FlowCellLane combinations\n");
+            fatalError(temp);
+        }
+
+        int ret;
+        iter = kh_put(i, state->grpNumHash, key, &ret);
+        kh_value(state->grpNumHash, iter) = state->grpCount;
+    }
+
+    return kh_value(state->grpNumHash, iter);
+}
+
+
+inline int getOpticalNum(splitLine_t * line, state_t * state) __attribute__((always_inline));
+inline int getOpticalNum(splitLine_t * line, state_t * state)
+{
+    char * qname = line->fields[QNAME];
+
+    //split read name on ':'
+    char * splits[10];
+    char * fields[10] = {qname};
+    int numFields = 1; //split field count
+    for (int i = 0; qname[i] != 0; ++i)
+    {
+        if(qname[i] == ':')
+        {
+            qname[i] = 0; //terminate previous field with null
+            splits[numFields-1] = qname + i; //end of this field
+            fields[numFields++] = qname + i + 1; //start next field
+            if (numFields > 8) break; //unrecognized format
+        }
+    }
+
+    //identify format
+    UINT64 cell;
+    UINT64 tile;
+    UINT64 rg = getRgNum(line, state);
+    if (numFields == 5) //ILLUMINA
+    {
+        cell = strtol(fields[1], NULL, 0) + 1; //forces greater than 0 value
+        tile = strtol(fields[2], NULL, 0) + 1; //forces greater than 0 value
+    }
+    else if (numFields == 7 || numFields == 8) //#ILLUMINA 1.8+
+    {
+        cell = strtol(fields[3], NULL, 0) + 1; //forces greater than 0 value
+        tile = strtol(fields[4], NULL, 0) + 1; //forces greater than 0 value
+    }
+    else { //unknown
+        cell = 0;
+        tile = 0;
+    }
+
+    //restore ':' at splits
+    for (int i=0; i < numFields-1; ++i)
+    {
+        splits[i][0] = ':';
+    }
+
+    //check values
+    if(cell > CELL_MASK || tile > TILE_MASK)
+    {
+        char * temp;
+        asprintf(&temp, "samblaster: Flowcell lane or tile number are too high to be real for readid '%s'\n", qname);
+        fatalError(temp);
+    }
+
+    //merge values
+    UINT64 final = (tile << TILE_SHIFT) | (cell << CELL_SHIFT) | rg;
+
+    //get hash for value
+    //fprintf(stderr, "%li\t%li\t%li\t%li\t%i\n", rg, cell, tile, final, getGrpNum(final, state)); //temp
+
+    return getGrpNum(final, state);
+}
+
+inline int getExAmpNum(splitLine_t * line, state_t * state) __attribute__((always_inline));
+inline int getExAmpNum(splitLine_t * line, state_t * state)
+{
+    char * qname = line->fields[QNAME];
+
+    //split read name on ':'
+    char * splits[10];
+    char * fields[10] = {qname};
+    int numFields = 1; //split field count
+    for (int i = 0; qname[i] != 0; ++i)
+    {
+        if(qname[i] == ':')
+        {
+            qname[i] = 0; //terminate previous field with null
+            splits[numFields-1] = qname + i; //end of this field
+            fields[numFields++] = qname + i + 1; //start next field
+            if (numFields > 8) break; //unrecognized format
+        }
+    }
+
+    //identify format
+    UINT64 cell;
+    UINT64 rg = getRgNum(line, state);
+    if (numFields == 5) //ILLUMINA
+    {
+        cell = strtol(fields[1], NULL, 0) + 1; //forces greater than 0 value
+    }
+    else if (numFields == 7 || numFields == 8) //#ILLUMINA 1.8+
+    {
+        cell = strtol(fields[3], NULL, 0) + 1; //forces greater than 0 value
+    }
+    else { //unknown
+        cell = 0;
+    }
+
+    //restore ':' at splits
+    for (int i=0; i < numFields-1; ++i)
+    {
+        splits[i][0] = ':';
+    }
+
+    //check values
+    if(cell > CELL_MASK)
+    {
+        char * temp;
+        asprintf(&temp, "samblaster: Flowcell lane number is are too high to be real for readid '%s'\n", qname);
+        fatalError(temp);
+    }
+
+    //merge values
+    UINT64 final = (cell << CELL_SHIFT) | rg;
+
+    return getGrpNum(final, state);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Helpers to process CIGAR strings
 ///////////////////////////////////////////////////////////////////////////////
@@ -825,14 +1084,6 @@ inline int getEndDiag(splitLine_t * line)
 ///////////////////////////////////////////////////////////////////////////////
 // Process SAM Blocks
 ///////////////////////////////////////////////////////////////////////////////
-// This was historically 27 bits as that was large enough to hold any human chrom 1 offset.
-// Now that we are using a synthetic genome representation, 27 has no special meaning.
-// There are interesting space/time trade-offs between how large we make the
-//   synthetic chroms and how large the various hash tables become.
-// If/when we handle RGs and/or UMIs, the signature scheme will change anyway.
-// Therefore, leave at 27 for now to match space/time tradeoffs of earlier releases.
-#define BIN_SHIFT ((UINT64)27)                  //bin window is 27 bits wide
-#define BIN_MASK ((UINT64)((1 << BIN_SHIFT)-1)) //bin window is 27 bits wide
 
 // This is apparently no longer called.
 void outputSAMBlock(splitLine_t * block, FILE * output)
@@ -961,7 +1212,21 @@ void prepareSigValues(splitLine_t * line, state_t * state, bool orphan)
     }
 
     // Now get the sequence number
-    line->seqNum = getSeqNum(line, RNAME, state);
+    line->seqNum = getSeqNum(line, state);
+
+    // Now get group number
+    if(state->optPlusExAmp)
+    {
+        line->grpNum = getExAmpNum(line, state);
+    }
+    else if(state->opticalOnly)
+    {
+        line->grpNum = getOpticalNum(line, state);
+    }
+    else
+    {
+        line->grpNum = getRgNum(line, state);
+    }
 
     // Calculate the genome relative position
     // We need to do the read pos padding to handle negative calculated sequence position for the alignment.
@@ -1347,6 +1612,8 @@ void printPGsamLine(FILE * f, state_t * s)
     else if (s->excludeDups && (s->discordantFile != NULL || s->splitterFile != NULL || s->unmappedClippedFile != NULL)) fprintf(f, " --excludeDups");
     if (s->addMateTags) fprintf(f, " --addMateTags");
     if (s->ignoreUnmated) fprintf(f, " --ignoreUnmated");
+    if (s->optPlusExAmp) fprintf(f, " --optPlusExAmp");
+    if (s->opticalOnly) fprintf(f, " --opticalOnly");
     if (s->maxReadLength != 500) fprintf(f, " --maxReadLength %d", s->maxReadLength);
     if (s->discordantFile != NULL) fprintf(f, " -d %s", s->discordantFileName);
     if (s->splitterFile != NULL)
@@ -1394,6 +1661,8 @@ void printUsageString()
         "-e --excludeDups          Exclude reads marked as duplicates from discordant, splitter, and/or unmapped file.\n"
         "-r --removeDups           Remove duplicates reads from all output files. (Implies --excludeDups).\n"
         "   --addMateTags          Add MC and MQ tags to all output paired-end SAM lines.\n"
+        "   --opticalOnly          Only mark reads on the same tile as duplicates (i.e. optical duplicates)\n"
+        "   --optPlusExAmp         Only mark reads on the same lane as duplicates (i.e. optical and ExAmp duplicates)\n"
         "   --ignoreUnmated        Suppress abort on unmated alignments. Use only when sure input is read-id grouped,\n"
         "                          and either paired-end alignments have been filtered or the input file contains singleton reads.\n"
         "-M                        Run in compatibility mode; both 0x100 and 0x800 are considered chimeric. Similar to BWA MEM -M option.\n"
@@ -1653,6 +1922,15 @@ int main (int argc, char *argv[])
         {
             state->ignoreUnmated = true;
         }
+        else if (streq(argv[argi],"--opticalOnly"))
+        {
+            state->opticalOnly = true;
+        }
+        else if (streq(argv[argi],"--optPlusExAmp"))
+        {
+            state->optPlusExAmp = true;
+            state->opticalOnly  = false;
+        }
         else if (streq(argv[argi],"-M"))
         {
             state->compatMode = true;
@@ -1772,13 +2050,11 @@ int main (int argc, char *argv[])
         if (state->unmappedClippedFile == NULL) fsError(state->unmappedClippedFileName);
     }
 
-    // Read in the SAM header and create the seqs structure.
-    state->seqLens = (UINT32*)calloc(1, sizeof(UINT32)); //initialize to 0
-    state->seqOffs = (UINT64*)calloc(1, sizeof(UINT64)); //initialize to 0
-    state->seqs[strdup("*")] = 0;
-    state->seqLens[0] = padLength(0, state);
+    // Read in the SAM header and create the RG and seqs structures.
+    state->seqs[strdup("*")] = 0; //only unmapped to start
     state->seqOffs[0] = 0;
-    int count = 1;
+    int scount = 1; //0 is reserved for unknown or '*'
+    int rcount = 1; //0 is reserved for no read group
     UINT64 totalLen = 0;
     splitLine_t * line;
     // Read the first line to prime the loop, and also to allow checking for malformed input.
@@ -1798,8 +2074,8 @@ int main (int argc, char *argv[])
                 if (strncmp(line->fields[i], "SN:", 3) == 0)
                 {
                     seqID = line->fields[i]+3;
-                    seqNum = count;
-                    count += 1;
+                    seqNum = scount;
+                    scount += 1;
                 }
                 else if (strncmp(line->fields[i], "LN:", 3) == 0)
                 {
@@ -1812,18 +2088,38 @@ int main (int argc, char *argv[])
             // Unless we are marking dups, we don't need to use sequence numbers.
             if (!state->acceptDups)
             {
-                // grow seqLens and seqOffs arrays
+                // grow seqOffs arrays
                 if(seqNum % 32768 == 1)
                 {
-                    state->seqLens = (UINT32*)realloc(state->seqLens, (seqNum+32768)*sizeof(UINT32));
                     state->seqOffs = (UINT64*)realloc(state->seqOffs, (seqNum+32768)*sizeof(UINT64));
                 }
 
                 state->seqs[strdup(seqID)] = seqNum;
-                state->seqLens[seqNum] = seqLen;
                 state->seqOffs[seqNum] = seqOff;
             }
         }
+        // Process input line to see if it defines a sequence.
+        else if (streq(line->fields[0], "@RG"))
+        {
+            char * rgID = NULL;
+            int rgNum = 0;
+            for (int i=1; i<line->numFields; ++i)
+            {
+                if (strncmp(line->fields[i], "ID:", 3) == 0)
+                {
+                    rgID = line->fields[i]+3;
+                    rgNum = rcount;
+                    rcount += 1;
+                }
+            }
+
+            // Unless we are marking dups, we don't need to use read groups.
+            if (!state->acceptDups)
+            {
+                state->rgs[strdup(rgID)] = rgNum;
+            }
+        }
+
         // Output the header line.
         writeLine(line, state->outputFile);
         if (state->discordantFile != NULL)
@@ -1842,13 +2138,13 @@ int main (int argc, char *argv[])
     printPGsamLine(state->splitterFile, state);
 
     // Make sure we have a header.
-    if (count == 1 && !state->acceptDups)
+    if (scount == 1 && !state->acceptDups)
     {
         fatalError("samblaster: Missing header on input sam file.  Exiting.\n");
     }
 
     // Don't count the "*" entry.
-    fprintf(stderr, "samblaster: Loaded %d header sequence entries.\n", count-1);
+    fprintf(stderr, "samblaster: Loaded %d header sequence entries.\n", scount-1);
 
     // Make sure we have any more lines to process.
     if (line == NULL)
@@ -1864,7 +2160,7 @@ int main (int argc, char *argv[])
         int binCount = (totalLen >> BIN_SHIFT);
         if (binCount >= (1 << 15))
         {
-            fatalError("samblaster: Too many sequences in header of input sam file.  Exiting.\n");
+            fatalError("samblaster: total length of sequences found in sam file is too high.  Exiting.\n");
         }
 
         state->binCount = binCount;
@@ -1878,7 +2174,6 @@ int main (int argc, char *argv[])
     // line already has the first such line in it unless the file only has a header.
     // Keep a ptr to the end of the current list.
     splitLine_t * last = line;
-    count = 1;
     while (true)
     {
         splitLine_t * nextLine = readLine(state->inputFile);
@@ -1887,7 +2182,6 @@ int main (int argc, char *argv[])
         {
             processSAMBlock(line, state);
             last = line = nextLine;
-            count = 1;
         }
         else
         {
@@ -1895,7 +2189,6 @@ int main (int argc, char *argv[])
             //     we need to be a little careful about how we make the list.
             last->next = nextLine;
             last = nextLine;
-            count += 1;
         }
     }
     // We need to process the final set.