diff --git a/lib/decompress/decmp.c b/lib/decompress/decmp.c
index f73cb6313ed16bff251e8e5a4af5fe5f2486deca..9d616c88d25f1301d818ffea11890d8186a34ca3 100644
--- a/lib/decompress/decmp.c
+++ b/lib/decompress/decmp.c
@@ -35,6 +35,7 @@
 #include "../common/cmp_support.h"
 #include "../common/cmp_entity.h"
 #include "../common/cmp_max_used_bits.h"
+#include "read_bitstream.h"
 #include "cmp_max_used_bits_list.h"
 
 
@@ -72,19 +73,30 @@ struct decoder_setup {
 
 
 /**
- * @brief count leading 1-bits
+ * @brief decode a unary code word
  *
- * @param value	input vale to count
+ * @param dec		a pointer to a bit decoder context (struct bit_decoder)
+ * @param unused_1	this parameter is not used
+ * @param unused_2	this parameter is not used
+ * @param decoded_cw	pointer where decoded value is written
  *
- * @returns the number of leading 1-bits in value, starting at the most
- *	significant bit position
+ * @returns the length of the decoded code word in bits (NOT the decoded value);
+ *	failure if the return value is larger than 32
  */
 
-static unsigned int count_leading_ones(uint32_t value)
+static __inline uint32_t unary_decoder(struct bit_decoder *dec, uint32_t unused_1,
+				       uint32_t unused_2, uint32_t *decoded_cw)
 {
-	if (unlikely(~value == 0))  /* __builtin_clz(0) is undefined. */
-		return 32;
-	return (unsigned int)__builtin_clz(~value);
+	uint32_t const n_ones = bit_count_leading_ones(dec); /* run of 1-bits is the value */
+
+	UNUSED(unused_1); /* not needed to decode a unary code */
+	UNUSED(unused_2); /* not needed to decode a unary code */
+
+	*decoded_cw = n_ones;
+	/* step over the 1-bits and the 0-bit that terminates them */
+	bit_consume_bits(dec, n_ones + 1);
+
+	return n_ones + 1;
 }
 
 
@@ -100,32 +112,24 @@ static unsigned int count_leading_ones(uint32_t value)
  *	failure if the return value is larger than 32
  */
 
-static unsigned int rice_decoder(uint32_t code_word, uint32_t m, uint32_t log2_m,
+static unsigned int rice_decoder(struct bit_decoder *dec, uint32_t m, uint32_t log2_m,
 				 uint32_t *decoded_cw)
 {
 	uint32_t q; /* quotient code */
+	uint32_t ql; /* length of the quotient code */
 	uint32_t r; /* remainder code */
 	uint32_t rl = log2_m; /* length of the remainder code */
-	uint32_t cw_len; /* length of the decoded code word in bits */
-
-	(void)m; /* we don't need the Golomb parameter */
-
-	assert(log2_m < 32);
-	assert(decoded_cw != NULL);
 
 	/* decode quotient unary code part */
-	q = count_leading_ones(code_word);
-
-	cw_len = q + 1 + rl; /* Number of 1's + following 0 + remainder length */
+	ql = unary_decoder(dec, m, log2_m, &q);
 
 	/* get remainder code  */
-	/* mask shift to prevented undefined behaviour in error case cw_len > 32 */
-	code_word >>= (32 - cw_len) & 0x1FU;
-	r = code_word & ((1U << rl) - 1);
+	r = rl ? (uint32_t)bit_read_bits(dec, rl) : 0; /* bit_read_bits() needs nb_bits >= 1 */
 
 	*decoded_cw = (q << rl) + r;
 
-	return cw_len;
+	/* code word length: quotient part incl. its terminating 0-bit + remainder */
+	return ql + rl;
 }
 
 
@@ -142,7 +146,7 @@ static unsigned int rice_decoder(uint32_t code_word, uint32_t m, uint32_t log2_m
  *	failure if the return value is larger than 32
  */
 
-static unsigned int golomb_decoder(uint32_t code_word, uint32_t m,
+static unsigned int golomb_decoder(struct bit_decoder *dec, uint32_t m,
 				   uint32_t log2_m, uint32_t *decoded_cw)
 {
 	uint32_t q;  /* quotient code */
@@ -156,26 +160,20 @@ static unsigned int golomb_decoder(uint32_t code_word, uint32_t m,
 	assert(log2_m == ilog_2(m) && log2_m < 32);
 	assert(decoded_cw != NULL);
 
-	q = count_leading_ones(code_word); /* decode quotient unary code part */
-
-	/* The behaviour is undefined if the right shift operand is greater than
-	 * or equal to the length in bits of the shifted left operand, so we mask
-	 * the right operand to avoid this case. (q = 32)
-	 */
-	code_word <<= (q & 0x1FU); /* shift out leading ones */
-	code_word <<= 1; /* shift out zero in the quotient unary code */
+	/* decode quotient unary code part */
+	ql = unary_decoder(dec, m, log2_m, &q);
 
 	/* get the remainder code for both cases */
-	r2 = code_word >> (32 - (log2_m + 1));
+	r2 = (uint32_t)bit_peek_bits(dec, log2_m +1);
 	r1 = r2 >> 1;
 
 	cutoff = (0x2U << log2_m) - m; /* = 2^(log2_m+1)-m */
 
 	if (r1 < cutoff) { /* remainder case 1: remainder length=log2_m */
-		cw_len = q + 1 + log2_m;
+		cw_len = ql + log2_m;
 		r = r1;
 	} else { /* remainder case 2: remainder length = log2_m+1 */
-		cw_len = q + 1 + log2_m + 1;
+		cw_len = ql + log2_m + 1;
 		r = r2 - cutoff;
 	}
 
diff --git a/lib/decompress/read_bitstream.h b/lib/decompress/read_bitstream.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9624e8c9141eafebb997bdfe0ba4bd0f38e0514
--- /dev/null
+++ b/lib/decompress/read_bitstream.h
@@ -0,0 +1,184 @@
+#ifndef READ_BITSTREAM_H
+#define READ_BITSTREAM_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <assert.h>
+#include <string.h>
+
+#include "../common/byteorder.h"
+
+
+static __inline uint64_t bit_read_unaligned_64(const void* ptr)
+{
+	typedef __attribute__((aligned(1))) uint64_t unalign64; /* GCC extension: 64-bit type with alignment 1 */
+	return *(const unalign64*)ptr; /* intended to allow a load from any byte address */
+}
+
+
+static __inline uint64_t bit_read_unalingned_be64(const void* ptr)
+{
+	return cpu_to_be64(bit_read_unaligned_64(ptr)); /* presumably yields the 8 bytes at ptr as a big-endian number */
+}
+
+
+/**
+ * @brief bitstream decoding context type
+ */
+
+struct bit_decoder
+{
+	uint64_t bit_container;     /* 64-bit window of the stream, loaded from cursor */
+	unsigned int bits_consumed; /* bits of bit_container already used, counted from the MSB */
+	const uint8_t* cursor;      /* address the current bit_container was loaded from */
+	const uint8_t* limit_ptr;   /* last address a full 8-byte load may start at */
+};
+
+
+/**
+ * @brief initialise a bit decoder to read a buffer from its first byte
+ *
+ * @param dec		bitstream decoding context to set up
+ * @param buf		start of the data to decode
+ * @param buf_size	size of buf in bytes
+ *
+ * @returns buf_size; 0 if the buffer is empty, in which case dec is zeroed
+ */
+
+static __inline size_t bit_init_decoder(struct bit_decoder *dec, const void* buf,
+				   size_t buf_size)
+{
+	const uint8_t *const bytes = (const uint8_t *)buf;
+	size_t const container_size = sizeof(dec->bit_container);
+	size_t i;
+
+	if (buf_size < 1) {
+		memset(dec, 0, sizeof(*dec));
+		return 0;
+	}
+
+	dec->cursor = bytes;
+
+	if (buf_size >= container_size) {
+		/* enough data for a full load of the container */
+		dec->bits_consumed = 0;
+		dec->bit_container = bit_read_unalingned_be64(dec->cursor);
+		dec->limit_ptr = dec->cursor + buf_size - container_size;
+		return buf_size;
+	}
+
+	/* short buffer: gather the bytes by hand, first byte most significant */
+	dec->bit_container = 0;
+	for (i = 0; i < buf_size; i++)
+		dec->bit_container |= (uint64_t)bytes[i] << (56 - 8*i);
+
+	/* the missing low-order bytes are booked as already consumed; the data
+	 * is parked in the low bits so that shifting the container left by
+	 * bits_consumed brings it back to the top
+	 */
+	dec->bits_consumed = (unsigned int)(container_size - buf_size)*8;
+	dec->bit_container >>= dec->bits_consumed;
+	dec->limit_ptr = dec->cursor;
+
+	return buf_size;
+}
+
+
+static __inline uint64_t bit_peek_bits(const struct bit_decoder *dec, unsigned int nb_bits)
+{
+	/* masking keeps the shift count below 64; a larger one is undefined behavior */
+	unsigned int const consumed = dec->bits_consumed & 0x3F;
+
+	assert(nb_bits >= 1 && nb_bits <= (64 - 7)); /* -7: presumably as up to 7 bits stay consumed after bit_refill() */
+	assert(dec->bits_consumed + nb_bits <= 64);
+
+	/* drop the consumed bits at the top, then move the next nb_bits down to bit 0 */
+	return (dec->bit_container << consumed) >> (64 - nb_bits);
+}
+
+
+/**
+ * @brief count the leading ones in the local register; local register is not modified
+ * @warning if all bits of the local register are consumed (dec->bits_consumed >= 64),
+ *	the result is not meaningful
+ * @param dec	a pointer to a bit decoder context (struct bit_decoder)
+ * @returns number of leading ones; 64 if the whole register consists of ones
+ */
+
+static __inline unsigned int bit_count_leading_ones(const struct bit_decoder* dec)
+{
+	/* mask for the shift value register to prevent undefined behavior */
+	uint32_t const reg_mask = 0x3F;
+	/* shift out the consumed bits and flip: leading ones become leading zeros */
+	uint64_t remaining_flip = ~(dec->bit_container << (dec->bits_consumed & reg_mask));
+
+	/* clzll(0) is undefined behavior; zero here means all 64 bits were ones */
+	if (!remaining_flip)
+		return (unsigned int)sizeof(dec->bit_container)*8;
+
+	return (unsigned int)__builtin_clzll(remaining_flip);
+}
+
+
+static __inline void bit_consume_bits(struct bit_decoder *dec, unsigned int nb_bits)
+{
+	dec->bits_consumed += nb_bits; /* pure bookkeeping: no range check, the container is not touched */
+}
+
+
+static __inline uint64_t bit_read_bits(struct bit_decoder *dec, unsigned int nb_bits)
+{
+	uint64_t const value = bit_peek_bits(dec, nb_bits); /* look at the next bits ... */
+
+	bit_consume_bits(dec, nb_bits); /* ... and only then step over them */
+	return value;
+}
+
+
+/**
+ * @brief Check if the end of the bitstream has been reached
+ * @param dec	a bitstream decoding context
+ * @returns 1 if the stream has _exactly_ reached its end (all bits consumed), else 0
+ */
+
+static __inline unsigned int bit_end_of_stream(const struct bit_decoder* dec)
+{
+	return dec->cursor == dec->limit_ptr &&
+	       dec->bits_consumed == sizeof(dec->bit_container)*8;
+}
+
+
+enum {BIT_OVERFLOW, BIT_END_OF_BUFFER, BIT_ALL_READ_IN, BIT_UNFINISHED}; /* return values of bit_refill() */
+
+/**
+ * @brief reload the bit container, dropping the bytes that are fully consumed
+ * @param dec	a bitstream decoding context
+ * @returns BIT_UNFINISHED, BIT_END_OF_BUFFER, BIT_ALL_READ_IN or BIT_OVERFLOW
+ */
+
+static __inline int bit_refill(struct bit_decoder *dec)
+{
+	size_t nb_bytes = dec->bits_consumed >> 3; /* fully consumed bytes */
+	int status = BIT_UNFINISHED;
+
+	if (dec->bits_consumed > (sizeof(dec->bit_container)*8))
+		return BIT_OVERFLOW; /* more bits consumed than the container holds */
+	if (dec->cursor >= dec->limit_ptr) /* the container cannot move any further */
+		return bit_end_of_stream(dec) ? BIT_ALL_READ_IN : BIT_END_OF_BUFFER;
+
+	if (nb_bytes > (size_t)(dec->limit_ptr - dec->cursor)) {
+		/* stop at limit_ptr: a load from beyond it would read past the buffer */
+		nb_bytes = (size_t)(dec->limit_ptr - dec->cursor);
+		status = BIT_END_OF_BUFFER;
+	}
+
+	dec->cursor += nb_bytes;
+	/* bits of the skipped bytes are no longer part of the container */
+	dec->bits_consumed -= (unsigned int)nb_bytes*8;
+	dec->bit_container = bit_read_unalingned_be64(dec->cursor);
+
+	return status;
+}
+
+
+#endif /* READ_BITSTREAM_H */