/*
								+----------------------------------+
								|                                  |
								|  *** Basic compression algs ***  |
								|                                  |
								|   Copyright  -tHE SWINe- 2008   |
								|                                  |
								|           Compress.cpp           |
								|                                  |
								+----------------------------------+
*/

/**
 *	@file Compress.cpp
 *	@author -tHE SWINe-
 *	@date 2008
 *	@brief Simple experimental data compression framework, focused on Burrows-Wheeler methods.
 *
 *	@date 2007-02-25
 *
 *	this is the first beta version of the file. t_odo - rewrite TBuffer so it can realloc
 *	itself the way std::vector can. todo - try to avoid any (re)allocations while (un)packing
 *
 *	@date 2008-03-13
 *
 *	TBuffer was rewritten as requested, fixed some signed / unsigned mismatches for gcc
 *
 *	@date 2008-11-20
 *
 *	TBuffer unit is no longer unsigned char, but uint8_t instead, this should avoid
 *	incompatibility with some extended character encoding in newer versions of visual studio
 *
 *	@date 2009-05-23
 *
 *	removed all instances of std::vector::reserve and replaced them by stl_ut::Reserve_*
 *
 *	@date 2009-10-08
 *
 *	slightly improved CHuffmanCodec, fixed bug in canonical huffman code generation for
 *	trees where there are no symbols of length n, but there are both shorter and longer
 *	symbols (codes got shifted too much, got too long, had to be regenerated). this was
 *	hurting compression and so it had to be fixed, but the bug was also in decompression
 *	code, so this sadly breaks backward compatibility.
 *
 *	@date 2009-10-11
 *
 *	replaced stl container ::resize() by stl_ut::Resize_*() to avoid unhandled
 *	std::bad_alloc
 *
 *	optimized CBurrowsWheelerTransform::CWrapMemCmp() wrap-around memory comparator by
 *	calculating lengths of blocks that do not wrap and comparing them in shorter loops
 *
 *	added __BWT_ENABLE_THREADED_ENCODE macro
 *
 *	@date 2009-10-20
 *
 *	fixed some warnings when compiling under VC 2005, implemented "Security
 *	Enhancements in the CRT" for VC 2008. compare against MyProjects_2009-10-19_
 *
 */

#include "NewFix.h"
#include "CallStack.h"
#include <vector>
#include <map>
#include <algorithm>
#include <stdio.h> // for debugging only
#include "MinMax.h"
#include "StlUtils.h"
#include "Buffer.h"
#include "Compress.h"

#ifdef __BWT_ENABLE_THREADED_ENCODE
#include "Thread.h"
#endif // __BWT_ENABLE_THREADED_ENCODE

// Only active when _MSC_VER <= 1200, the compiler is not Metrowerks and 'for' is not
// already a macro. Every 'for' is rewritten as the else-branch of a never-taken 'if',
// which confines a variable declared in the for-init to the loop statement itself.
#if defined(_MSC_VER) && !defined(__MWERKS__) && !defined(for) && _MSC_VER <= 1200
#define for if(0) {} else for
#endif // _MSC_VER && !__MWERKS__ && !for && _MSC_VER <= 1200
// msvc 'for' scoping hack

/*
 *								=== CBurrowsWheelerTransform ===
 */

/*
 *	CBurrowsWheelerTransform::CWrapMemCmp
 *		- function object implementing less-than ordering
 *		  for indices, pointing to round buffer
 *		- each index denotes the rotation of the buffer starting at that offset;
 *		  two rotations are compared bytewise over the whole buffer length,
 *		  wrapping from the end of the buffer back to its beginning
 *		- the object keeps raw pointers into the buffer passed to the constructor,
 *		  it does not copy the data
 */
class CBurrowsWheelerTransform::CWrapMemCmp {
protected:
	const uint8_t *m_p_org, *m_p_end; // first byte of the buffer / one past its last byte
	size_t m_n_size; // buffer size in bytes (equals m_p_end - m_p_org)

public:
	/*
	 *	inline CWrapMemCmp::CWrapMemCmp(const TBuffer &r_t_data_buffer)
	 *		- constructor (takes the buffer, so it is not a default constructor)
	 *		- we're going to sort indices pointing to r_t_data_buffer
	 *		- only pointers to and size of r_t_data_buffer are stored
	 */
	inline CWrapMemCmp(const TBuffer &r_t_data_buffer)
		:m_p_org(r_t_data_buffer.p_Data()),
		m_p_end(r_t_data_buffer.p_Data() + r_t_data_buffer.n_Size()),
		m_n_size(r_t_data_buffer.n_Size())
	{}

#ifdef _MSC_VER // MSVC assembly syntax
	/*
	 *	static int CWrapMemCmp::_fast_memcmp(const uint8_t *a, const uint8_t *b, int n_length)
	 *		- compares n_length bytes at a and b in byte order
	 *		- returns negative value if a sorts first, positive value if b sorts first,
	 *		  zero if all n_length bytes are equal (the assembly part returns -1 / 1,
	 *		  the trailing byte loop returns the difference of the first differing bytes)
	 *		- the assembly loop runs n_length / 16 times and compares four 32-bit words
	 *		  per iteration; each word is byte-swapped first so that the unsigned
	 *		  comparison of the words matches the order of the individual bytes
	 *		- when the loop finishes without a difference, the advanced pointers are
	 *		  written back to a and b, so the byte loop below continues where it stopped;
	 *		  when n_length is below 16, a and b are left untouched
	 *		- note n_length is int, while operator () passes size_t values
	 *		- NOTE(review): 'prefetch' looks like the AMD 3DNow! mnemonic - confirm
	 *		  the target CPUs support it
	 */
	static int _fast_memcmp(const uint8_t *a, const uint8_t *b, int n_length)
	{
		{
			__asm {
				mov ecx, dword ptr n_length
				shr ecx, 4
				jz _dontHaveResult
				mov esi, dword ptr a
				mov edx, dword ptr b

			_longLoop:
				prefetch [esi+16]
				prefetch [edx+16]
				prefetch [esi+32]
				prefetch [edx+32]
				prefetch [esi+48]
				prefetch [edx+48]
				prefetch [esi+64]
				prefetch [edx+64] // call for more data

				mov eax, [esi] // todo - use MMX / SSE for this
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4

				mov eax, [esi]
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4

				mov eax, [esi]
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4

				mov eax, [esi]
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4

				loop _longLoop
				mov dword ptr a, esi
				mov dword ptr b, edx // !!!
				jmp _dontHaveResult

			_isBelow:
			}
			return -1;

			__asm { _isAbove: }
			return 1;

			__asm {
			_dontHaveResult:
			}
		}
		// compare using longs (endian-dependable, uses bswap, prepared for MMX/SSE)
		// the labels above are jump targets shared between the __asm blocks;
		// execution only falls through to here if no difference was found

		/*{
			__asm {
				mov ecx, dword ptr n_length
				shr ecx, 2
				jz _dontHaveResult
				mov esi, dword ptr a
				mov edx, dword ptr b

			_longLoop:
				mov eax, [esi]
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4
				loop _longLoop
				jmp _dontHaveResult

			_isBelow:
			}
			return -1;

			__asm { _isAbove: }
			return 1;

			__asm {
			_dontHaveResult:
				mov dword ptr a, esi
				mov dword ptr b, edx // !!!
			}
		}*/
		// compare using longs (endian-dependable, uses bswap)
		// (older variant comparing a single word per iteration, disabled)

		n_length %= 4 * sizeof(uint32_t);
		while(n_length --) {
			int cmp;
			if(cmp = *a ++ - *b ++)
				return cmp;
		}
		// compare the bytes which did not fill a whole 16-byte chunk (up to 15 bytes)
		// this compiles quite well

		return 0;
	}

	/*static int _fast_memcmp_check(const uint8_t *a, const uint8_t *b, int n_length)
	{
		int r1 = _fast_memcmp(a, b, n_length);
		if(r1 > 0)
			r1 = 1;
		else if(r1 < 0)
			r1 = -1;
		int r2 = memcmp(a, b, n_length);
		if(r2 > 0)
			r2 = 1;
		else if(r2 < 0)
			r2 = -1;
		_ASSERTE(r1 == r2);
		return r1;
	}*/
	// (disabled debugging aid, cross-checks _fast_memcmp() against memcmp())
#endif // _MSC_VER

	/*
	 *	inline bool CWrapMemCmp::operator ()(uint32_t n_off_a,
	 *		uint32_t n_off_b) const
	 *		- less-than ordering operator
	 *		- n_off_a and n_off_b are offsets into the buffer; presumably both
	 *		  are less than the buffer size (not checked here)
	 *		- returns true if rotation starting at n_off_a sorts before the rotation
	 *		  starting at n_off_b, false otherwise (also when both are equal)
	 */
	inline bool operator ()(uint32_t n_off_a, uint32_t n_off_b) const
	{
		if(n_off_a == n_off_b)
			return false;
		// they are equal then

		const uint8_t *p_org = m_p_org, *p_end = m_p_end;
		// local copies of the members (anti-aliasing)

		const uint8_t *p_byte_a = p_org + n_off_a;
		const uint8_t *p_byte_b = p_org + n_off_b;
		// get pointers

#ifdef _MSC_VER // MSVC assembly syntax
		// the comparison is split to three linear stages so that
		// no wrap-around test is needed inside of the comparison loops

		size_t n_stage1_length = m_n_size - max(n_off_a, n_off_b);
		// until first one wraps arround (the one with the greater offset)

		{
			int n_result;
			if((n_result = _fast_memcmp(p_byte_a, p_byte_b, n_stage1_length * sizeof(uint8_t))) < 0)
				return true;
			else if(n_result > 0)
				return false;
		}
		// compare first stage (until one of arrays wraps arround)

		p_byte_a += n_stage1_length;
		p_byte_b += n_stage1_length;
		if(p_byte_a == p_end)
			p_byte_a = p_org;
		else /*if(p_byte_b == p_end)*/ { // n_off_a != n_off_b
			_ASSERTE(p_byte_b == p_end); // it must have
			p_byte_b = p_org;
		}
		// shift, wrap arround (exactly one of the pointers reached the end)

		size_t n_stage2_length = m_n_size - min(n_off_a, n_off_b) - n_stage1_length;
		// until second one wraps arround (equals difference of the two offsets)

		{
			int n_result;
			if((n_result = _fast_memcmp(p_byte_a, p_byte_b, n_stage2_length * sizeof(uint8_t))) < 0)
				return true;
			else if(n_result > 0)
				return false;
		}
		// compare second stage (until the other array wraps arround)

		p_byte_a += n_stage2_length;
		p_byte_b += n_stage2_length;
		if(p_byte_a == p_end)
			p_byte_a = p_org;
		else /*if(p_byte_b == p_end)*/ { // n_off_a != n_off_b
			_ASSERTE(p_byte_b == p_end); // it must have
			p_byte_b = p_org;
		}
		// shift, wrap arround

		size_t n_stage3_length = m_n_size - n_stage1_length - n_stage2_length;
		// the rest of comparison (equals the smaller of the two offsets)

		{
			int n_result;
			if((n_result = _fast_memcmp(p_byte_a, p_byte_b, n_stage3_length * sizeof(uint8_t))) < 0)
				return true;
			else if(n_result > 0)
				return false;
		}
		// compare third stage
#else
		for(size_t i = m_n_size; i; -- i) {
			if(*p_byte_a < *p_byte_b)
				return true;
			else if(*p_byte_a > *p_byte_b)
				return false;
			// compare

			++ p_byte_a, ++ p_byte_b;
			// increment

			if(p_byte_a == p_end)
				p_byte_a = p_org;
			else if(p_byte_b == p_end) // n_off_a != n_off_b
				p_byte_b = p_org;
			// wrap (the offsets differ, so at most one pointer is at the end at a time)
		}
		// simple, naive code (compares m_n_size bytes, one at a time)
#endif

		return false;
		// all the bytes were equal, neither rotation is less than the other
	}
};

/*
 *	CBurrowsWheelerTransform::CIota
 *		- generator function object; every call returns the current
 *		  counter value and then advances the counter by one
 *		- used with std::generate to fill a list with ascending integers
 */
class CBurrowsWheelerTransform::CIota {
protected:
	int m_n_counter; // value to be returned by the next call

public:
	/*
	 *	inline CIota::CIota(int n_value)
	 *		- n_value is the first value of the generated sequence
	 */
	inline CIota(int n_first)
		:m_n_counter(n_first)
	{}

	/*
	 *	inline int CIota::operator ()()
	 *		- returns the next value of the sequence
	 */
	inline int operator ()()
	{
		const int n_current = m_n_counter;
		++ m_n_counter;
		return n_current;
	}
};

/*
 *	static bool CBurrowsWheelerTransform::Decode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest)
 *		- inverse Burrows-Wheeler transform
 *		- r_t_src begins with uint32_t primary index (stored the way Encode() writes it,
 *		  i.e. in the byte order of the machine), followed by the transformed bytes
 *		- r_t_dest is resized to the number of transformed bytes and receives
 *		  the original data (can be empty on entry, must not be r_t_src)
 *		- returns true on success, false on failure (input shorter than the primary
 *		  index, primary index out of range - that includes input with no data bytes,
 *		  failure to resize r_t_dest or to reserve the index list, counter overflow)
 */
bool CBurrowsWheelerTransform::Decode(const TBuffer &r_t_src, TBuffer &r_t_dest)
{
	_ASSERTE(&r_t_src != &r_t_dest);
	_ASSERTE(r_t_src.p_Data());
	// make some assumptions about source buffer

	if(r_t_src.n_Size() < sizeof(uint32_t))
		return false;
	const uint32_t n_primary_index = *(const uint32_t*)r_t_src.p_Data();
	// get primary index (do not cast the const away, the source is only read)

	const uint8_t *p_src_data = r_t_src.p_Data() + sizeof(uint32_t);
	const size_t n_src_size = r_t_src.n_Size() - sizeof(uint32_t);
	// get pointer to and size of the real data

	if(n_primary_index >= n_src_size)
		return false;
	// see if primary index is valid (this also rejects n_src_size == 0)

	if(!r_t_dest.Resize(n_src_size, false))
		return false;
	// alloc output buffer

	const int n_max_word_value = 1 << (8 * sizeof(uint8_t));
	// number of combinations in a single word

	uint32_t p_buckets[n_max_word_value] = {0};
	// C[] - occurence counters, one for each byte value

	std::vector<uint32_t> indices_list;
	if(!stl_ut::Reserve_N(indices_list, n_src_size))
		return false;
	// allocate list for indices (P[])

	for(size_t i = 0; i < n_src_size; ++ i) {			// for i := 0 to N-1 do
		uint32_t &r_n_count = p_buckets[p_src_data[i]];
		indices_list.push_back(r_n_count);				//     P[i] := C[L[i]];
		if(!(++ r_n_count))								//     C[L[i]] := C[L[i]] + 1
			return false; // overflow (counter wrapped arround to zero)
	}
	// generate indices: P[i] is the number of earlier occurences of byte L[i]

	{
		uint32_t n_sum = 0;								// sum := 0;
		for(int i = 0; i < n_max_word_value; ++ i) {	// for ch := FIRST(alphabet) to LAST(alphabet) do
			const uint32_t n_value = p_buckets[i];		//     temp := C[ch]
			p_buckets[i] = n_sum;						//     C[ch] := sum
			if(n_sum > UINT32_MAX - n_value)
				return false; // overflow
			n_sum += n_value;							//     sum := sum + temp
		}
	}
	// integrate buckets: C[ch] is now the number of bytes with value less than ch

	{
		uint8_t *p_dest_data = r_t_dest.p_Data();
		uint32_t n_index = n_primary_index;							// i:=I;
		for(size_t n_remaining = n_src_size; n_remaining > 0; -- n_remaining) {	// for j := N-1 downto 0 do
			const uint8_t n_byte = p_src_data[n_index];
			p_dest_data[n_remaining - 1] = n_byte;					//     S[j] := L[i];
			n_index = indices_list[n_index] + p_buckets[n_byte];	//     i := P[i] + C[L[i]]
			// P[i] is less than the number of occurences of L[i], C[L[i]] counts only
			// the smaller bytes, so the sum stays below N and is a valid index
		}
	}
	// fill output buffer (backwards); counting the remaining bytes, rather than
	// comparing to a pointer before the first byte, keeps the pointers in bounds

	return true;
}

#ifdef __BWT_ENABLE_THREADED_ENCODE

// iterator to the list of 32-bit indices, as used by the sorter and merger workers below
typedef std::vector<uint32_t>::iterator TULongIter;

/*
 *	CBurrowsWheelerTransform::CSorter
 *		- runable job, sorting a range of indices by the CWrapMemCmp ordering
 *		- keeps iterators to the index list and a pointer to the source buffer;
 *		  both must stay valid until Run() has finished
 */
class CBurrowsWheelerTransform::CSorter : public CRunable {
protected:
	TULongIter m_p_begin, m_p_end; // range of indices to sort
	const TBuffer *m_p_src; // buffer the indices point to (0 in default-constructed job)

public:
	/*
	 *	CSorter::CSorter()
	 *		- default constructor; creates an empty job (needed to build arrays of jobs)
	 *		- the source pointer is set to null, so that an unassigned job
	 *		  can be recognized instead of holding an indeterminate pointer
	 */
	CSorter()
		:m_p_src(0)
	{}

	/*
	 *	CSorter::CSorter(TULongIter p_begin, TULongIter p_end, const TBuffer &r_t_src)
	 *		- creates job which sorts indices in range [p_begin, p_end),
	 *		  pointing to r_t_src
	 */
	CSorter(TULongIter p_begin, TULongIter p_end, const TBuffer &r_t_src)
		:m_p_begin(p_begin), m_p_end(p_end), m_p_src(&r_t_src)
	{}

	/*
	 *	virtual void CSorter::Run()
	 *		- sorts the range; does nothing for a default-constructed job
	 */
	virtual void Run()
	{
		_ASSERTE(m_p_src); // running a job which was never assigned is a bug
		if(!m_p_src)
			return;
		// the iterators of an empty job can not be used

		std::sort(m_p_begin, m_p_end, CWrapMemCmp(*m_p_src));
		// sort the interval
	}
};

/*
 *	CBurrowsWheelerTransform::CMerger
 *		- runable job, merging two adjacent sorted ranges of indices
 *		  (ordered by CWrapMemCmp) to a separate output range
 *		- keeps iterators to the index lists and a pointer to the source buffer;
 *		  all of them must stay valid until Run() has finished
 */
class CBurrowsWheelerTransform::CMerger : public CRunable {
protected:
	TULongIter m_p_out, m_p_begin, m_p_middle, m_p_end; // output, and [begin, middle) + [middle, end) inputs
	std::vector<uint32_t> m_vec; // not used by any code in this class
	const TBuffer *m_p_src; // buffer the indices point to (0 in default-constructed job)

public:
	/*
	 *	CMerger::CMerger()
	 *		- default constructor; creates an empty job (needed to build arrays of jobs)
	 *		- the source pointer is set to null, so that an unassigned job
	 *		  can be recognized instead of holding an indeterminate pointer
	 */
	CMerger()
		:m_p_src(0)
	{}

	/*
	 *	CMerger::CMerger(TULongIter p_out, TULongIter p_begin,
	 *		TULongIter p_middle, TULongIter p_end, const TBuffer &r_t_src)
	 *		- creates job which merges sorted ranges [p_begin, p_middle) and
	 *		  [p_middle, p_end) of indices pointing to r_t_src, writing to p_out
	 *		- the output must not overlap the inputs (std::merge requirement)
	 */
	CMerger(TULongIter p_out, TULongIter p_begin,
		TULongIter p_middle, TULongIter p_end, const TBuffer &r_t_src)
		:m_p_out(p_out), m_p_begin(p_begin), m_p_middle(p_middle), m_p_end(p_end), m_p_src(&r_t_src)
	{}

	/*
	 *	virtual void CMerger::Run()
	 *		- merges the ranges; does nothing for a default-constructed job
	 */
	virtual void Run()
	{
		_ASSERTE(m_p_src); // running a job which was never assigned is a bug
		if(!m_p_src)
			return;
		// the iterators of an empty job can not be used

		std::merge(m_p_begin, m_p_middle, m_p_middle, m_p_end, m_p_out, CWrapMemCmp(*m_p_src));
		// merge somewhere else. my implementation of inplace_merge is not thread safe

		//std::inplace_merge(m_p_begin, m_p_middle, m_p_end, CWrapMemCmp(*m_p_src));
		// sort the interval
	}
};

/*
 *	static bool CBurrowsWheelerTransform::ThreadedEncode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer, int n_thread_num)
 *		- encodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- the output is uint32_t primary index (in the byte order of the machine),
 *		  followed by the transformed data
 *		- works in parallel: the list of indices is split to n_thread_num parts which
 *		  are sorted in threads and then merged pairwise (also in threads)
 *		- n_thread_num must be power of two, in range 1 to 64; the function
 *		  fails for other values (the pairwise merging would leave the list
 *		  unsorted for other counts, zero would cause division by zero)
 *		- returns true on success, false on failure
 *		- if a thread fails to start, the threads started so far are stopped
 *		  before returning, as they work with local data of this function
 *		- note this doesn't work with empty input buffer
 *		- note this only gets compiled if __BWT_ENABLE_THREADED_ENCODE macro
 *		  is defined (not by default)
 */
bool CBurrowsWheelerTransform::ThreadedEncode(const TBuffer &r_t_src,
	TBuffer &r_t_dest, int n_thread_num)
{
	_ASSERTE(&r_t_src != &r_t_dest);
	_ASSERTE(r_t_src.p_Data());
	// make some assumptions about source buffer

	const int n_max_thread_num = 64;
	// max number of worker threads

	if(n_thread_num < 1 || n_thread_num > n_max_thread_num ||
	   (n_thread_num & (n_thread_num - 1)) != 0)
		return false;
	// the number of threads must be a power of two, and must fit the arrays below

	if(r_t_src.n_Size() > UINT32_MAX - sizeof(uint32_t))
		return false; // overflow
	if(!r_t_dest.Resize(r_t_src.n_Size() + sizeof(uint32_t), false))
		return false;
	// allocate output buffer

	std::vector<uint32_t> indices_list(r_t_src.n_Size());
	std::vector<uint32_t> sorted_list(r_t_src.n_Size());
	if(indices_list.size() < r_t_src.n_Size() ||
	   sorted_list.size() < r_t_src.n_Size())
		return false;
	// allocate list for indices, and a second list to merge to

	std::generate(indices_list.begin(), indices_list.end(), CIota(0));
	// generate sequence of indices

	{
		const size_t n_size = r_t_src.n_Size();
		const size_t n_part_size = n_size / n_thread_num;
		const size_t n_rest_size = n_size % n_thread_num;
		// size of one part for sorting thread and of the rest

		CSorter p_sorter[n_max_thread_num];
		CThread p_thread[n_max_thread_num];

		{
			size_t n_begin = 0;
			size_t n_cur_size = n_part_size + n_rest_size;
			// the first part is longer, it also contains the rest
			for(int i = 0; i < n_thread_num; ++ i,
			   n_begin += n_cur_size, n_cur_size = n_part_size) {
				p_sorter[i] = CSorter(indices_list.begin() + n_begin,
					indices_list.begin() + n_begin + n_cur_size, r_t_src);
				p_thread[i].AttachRunable(p_sorter[i]);
				if(!p_thread[i].Start()) {
					for(int j = 0; j < i; ++ j)
						p_thread[j].Stop();
					// do not leave threads running on data which is about to be deleted
					return false;
				}
				// start sorting in threads
			}

			for(int i = 0; i < n_thread_num; ++ i)
				p_thread[i].Stop();
			// wait for threads to finish
		}
		// partially sort list, in n_thread_num threads

		CMerger p_merger[n_max_thread_num / 2];

		for(int n = n_thread_num / 2, m = 1; n != 0; n /= 2, m *= 2) {
			_ASSERTE(!(n % 2) || n == 1); // must be power of two
			// n is number of merges in this pass, m is number of parts in each merged half

			size_t n_begin = 0;
			size_t n_cur_half = n_part_size * m + n_rest_size;
			size_t n_cur_size = n_cur_half + n_part_size * m;
			// the first half of the first pair contains the rest
			for(int i = 0; i < n; ++ i, n_begin += n_cur_size,
			   n_cur_half = n_part_size * m, n_cur_size = n_part_size * 2 * m) {
				p_merger[i] = CMerger(sorted_list.begin() + n_begin,
					indices_list.begin() + n_begin,
					indices_list.begin() + n_begin + n_cur_half,
					indices_list.begin() + n_begin + n_cur_size, r_t_src);
				p_thread[i].AttachRunable(p_merger[i]);
				if(!p_thread[i].Start()) {
					for(int j = 0; j < i; ++ j)
						p_thread[j].Stop();
					// do not leave threads running on data which is about to be deleted
					return false;
				}
				// start merging in threads
			}
			// merge sub-lists, all merges in one iteration can be executed in parallel

			for(int i = 0; i < n; ++ i)
				p_thread[i].Stop();
			// wait for threads to finish

			indices_list.swap(sorted_list);
			// the merged data becomes input of the next pass
		}
		// merge to create sorted list, need n_thread_num / 2 threads
	}
	// sort indices (bottleneck)

	const uint8_t *p_src = r_t_src.p_Data();
	uint8_t *p_dest = r_t_dest.p_Data() + sizeof(uint32_t);
	for(size_t i = 0, n = r_t_src.n_Size(); i < n; ++ i, ++ p_dest)
		*p_dest = p_src[(indices_list[i] + n - 1) % n];
	// fill output buffer (the byte preceding each sorted rotation, with wrap-arround)

	size_t n_primary_index = std::find(indices_list.begin(),
		indices_list.end(), 0U) - indices_list.begin();
	_ASSERTE(n_primary_index < r_t_src.n_Size());
	// find primary index (position of the unrotated input in the sorted list)

	*(uint32_t*)r_t_dest.p_Data() = uint32_t(n_primary_index);
	// write it to front of the buffer

	return true;
}

#endif // __BWT_ENABLE_THREADED_ENCODE

/*
 *	static bool CBurrowsWheelerTransform::Encode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest)
 *		- Burrows-Wheeler transform of r_t_src, single-threaded
 *		- r_t_dest (can be empty, must not be r_t_src) is resized to hold uint32_t
 *		  primary index (in the byte order of the machine), followed by as many
 *		  transformed bytes as there are bytes in r_t_src
 *		- returns true on success, false on failure
 */
bool CBurrowsWheelerTransform::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest)
{
	_ASSERTE(&r_t_src != &r_t_dest);
	_ASSERTE(r_t_src.p_Data());
	// preconditions on the source buffer

	const size_t n_length = r_t_src.n_Size();
	if(n_length > UINT32_MAX - sizeof(uint32_t))
		return false; // overflow
	if(!r_t_dest.Resize(n_length + sizeof(uint32_t), false))
		return false;
	// make room for the primary index and the transformed data

	std::vector<uint32_t> rotation_list(n_length);
	if(rotation_list.size() < n_length)
		return false;
	std::generate(rotation_list.begin(), rotation_list.end(), CIota(0));
	// one index for each rotation of the input: 0, 1, 2, ...

	std::sort(rotation_list.begin(), rotation_list.end(), CWrapMemCmp(r_t_src));
	// order the rotations

	const uint8_t *p_input = r_t_src.p_Data();
	uint8_t *p_payload = r_t_dest.p_Data() + sizeof(uint32_t);
	for(size_t i = 0; i < n_length; ++ i)
		p_payload[i] = p_input[(rotation_list[i] + n_length - 1) % n_length];
	// output the byte which precedes each of the sorted rotations (with wrap-arround)

	const size_t n_primary_index = std::find(rotation_list.begin(),
		rotation_list.end(), 0U) - rotation_list.begin();
	_ASSERTE(n_primary_index < n_length);
	// position of the unrotated input in the sorted list

	*(uint32_t*)r_t_dest.p_Data() = uint32_t(n_primary_index);
	// the primary index goes to the front of the buffer

	return true;
}

/*
 *								=== ~CBurrowsWheelerTransform ===
 */

/*
 *								=== CMoveToFrontTransform ===
 */

/*
 *	static void CMoveToFrontTransform::Decode(TBuffer &r_t_buffer, int n_algorithm)
 *		- decodes data in r_t_buffer, overwriting it with the result
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 */
void CMoveToFrontTransform::Decode(TBuffer &r_t_buffer, int n_algorithm)
{
	const TBuffer &r_t_input = r_t_buffer;
	// read-only view of the very same buffer

	_Decode(r_t_input, r_t_buffer, n_algorithm);
	// source and destination may be the same buffer
}

/*
 *	static void CMoveToFrontTransform::Encode(TBuffer &r_t_buffer, int n_algorithm)
 *		- encodes data in r_t_buffer, overwriting it with the result
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 */
void CMoveToFrontTransform::Encode(TBuffer &r_t_buffer, int n_algorithm)
{
	const TBuffer &r_t_input = r_t_buffer;
	// read-only view of the very same buffer

	_Encode(r_t_input, r_t_buffer, n_algorithm);
	// source and destination may be the same buffer
}

/*
 *	static bool CMoveToFrontTransform::Decode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest, int n_algorithm)
 *		- decodes data from r_t_src to r_t_dest (can be empty),
 *		  which is resized to the size of r_t_src
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 *		- returns true on success, false if r_t_dest could not be resized
 */
bool CMoveToFrontTransform::Decode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm)
{
	if(r_t_dest.Resize(r_t_src.n_Size(), false)) {
		_Decode(r_t_src, r_t_dest, n_algorithm);
		// the output has the right size, decode

		return true;
	}
	return false;
	// could not get the output buffer
}

/*
 *	static bool CMoveToFrontTransform::Encode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest, int n_algorithm)
 *		- encodes data from r_t_src to r_t_dest (can be empty),
 *		  which is resized to the size of r_t_src
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 *		- returns true on success, false if r_t_dest could not be resized
 */
bool CMoveToFrontTransform::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm)
{
	if(r_t_dest.Resize(r_t_src.n_Size(), false)) {
		_Encode(r_t_src, r_t_dest, n_algorithm);
		// the output has the right size, encode

		return true;
	}
	return false;
	// could not get the output buffer
}

/*
 *	static bool CMoveToFrontTransform::_Decode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest, int n_algorithm)
 *		- decodes data from r_t_src, outputs to r_t_dest which
 *		  must be allocated to the same size as r_t_src
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 */
void CMoveToFrontTransform::_Decode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm)
{
	_ASSERTE(r_t_src.n_Size() == r_t_dest.n_Size());
	_ASSERTE(r_t_src.p_Data() && r_t_dest.p_Data());
	// make some assumptions about input / output buffers

	const int n_max_word_value = 1 << (8 * sizeof(uint8_t));
	// number of combinations in a single word

	uint8_t p_data[n_max_word_value];
	for(int i = 0; i < n_max_word_value; ++ i)
		p_data[i] = (uint8_t)i;
	// start with buffer of successive numbers

	if(n_algorithm == algo_MTF) {
		const uint8_t *p_src = r_t_src.p_Data(),
			*p_end = r_t_src.p_Data() + r_t_src.n_Size();
		for(uint8_t *p_dest = r_t_dest.p_Data(); p_src != p_end; ++ p_src, ++ p_dest) {
			uint8_t n_index = *p_src;
			if(!n_index)
				*p_dest = p_data[0];
			else {
				int n_data = p_data[n_index];
				// get index in the array

				for(int i = n_index; i > 0; -- i)
					p_data[i] = p_data[i - 1];
				// shift the array (simulate encoder)

				p_data[0] = n_data;
				// move our byte to the front

				*p_dest = n_data;
				// output index where the byte has been found
			}
		}
	} else /*if(n_algorithm == algo_MTF_1)*/ {
		const uint8_t *p_src = r_t_src.p_Data(),
			*p_end = r_t_src.p_Data() + r_t_src.n_Size();
		for(uint8_t *p_dest = r_t_dest.p_Data(); p_src != p_end; ++ p_src, ++ p_dest) {
			uint8_t n_index = *p_src;
			if(!n_index)
				*p_dest = p_data[0];
			else {
				int n_data = p_data[n_index];
				// get index in the array

				if(n_index == 1) {
					for(int i = n_index; i > 0; -- i)
						p_data[i] = p_data[i - 1];
					// shift the array (simulate encoder)

					p_data[0] = n_data;
					// move our byte to the front
				} else {
					for(int i = n_index; i > 1; -- i)
						p_data[i] = p_data[i - 1];
					// shift the array (simulate encoder)

					p_data[1] = n_data;
					// move our byte to second place
				}

				*p_dest = n_data;
				// output index where the byte has been found
			}
		}
	}
	// decode
}

/*
 *	static void CMoveToFrontTransform::_Encode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest, int n_algorithm)
 *		- encodes data from r_t_src, outputs to r_t_dest which
 *		  must be allocated to the same size as r_t_src
 *		- each byte is replaced by its current position in a table of all the
 *		  byte values; the table is then reordered, depending on n_algorithm
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 *		- with algo_MTF, the encoded byte moves to the front of the table;
 *		  otherwise it moves to the second position, unless it was
 *		  at the second position already (then it moves to the front)
 */
void CMoveToFrontTransform::_Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm)
{
	_ASSERTE(r_t_src.n_Size() == r_t_dest.n_Size());
	_ASSERTE(r_t_src.p_Data() && r_t_dest.p_Data());
	// preconditions on input / output buffers

	const int n_max_word_value = 1 << (8 * sizeof(uint8_t));
	// number of different byte values

	uint8_t p_indices[n_max_word_value];
	for(int n_value = 0; n_value < n_max_word_value; ++ n_value)
		p_indices[n_value] = (uint8_t)n_value;
	// the table starts as 0, 1, 2, ...

	const bool b_plain_mtf = (n_algorithm == algo_MTF);
	_ASSERTE(b_plain_mtf || n_algorithm == algo_MTF_1);
	// anything but algo_MTF is handled as algo_MTF_1

	const uint8_t *p_src = r_t_src.p_Data(),
		*p_end = r_t_src.p_Data() + r_t_src.n_Size();
	for(uint8_t *p_dest = r_t_dest.p_Data(); p_src != p_end; ++ p_src, ++ p_dest) {
		const uint8_t n_data = *p_src;

		int n_index = 0;
		if(n_data != p_indices[0]) {
			n_index = 1;
			while(p_indices[n_index] != n_data) {
				++ n_index;
				_ASSERTE(n_index < n_max_word_value);
			}
			// look the byte up in the table (it contains every byte value exactly once)

			const int n_target = (b_plain_mtf || n_index == 1)? 0 : 1;
			// position the byte moves to

			for(int i = n_index; i > n_target; -- i)
				p_indices[i] = p_indices[i - 1];
			// make space by shifting the bytes in between

			p_indices[n_target] = n_data;
			// put the byte to its new place
		}
		// otherwise the byte is at the front already (we wish this would happen a lot)

		*p_dest = n_index;
		// output position where the byte has been found
	}
	// encode
}

/*
 *								=== ~CMoveToFrontTransform ===
 */

/*
 *								=== CRunLengthCodec ===
 */

/*
 *	static bool CRunLengthCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
 *		- decodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CRunLengthCodec::Decode(const TBuffer &r_t_src, TBuffer &r_t_dest)
{
	uint8_t *p_output = r_t_dest.p_Data(),
		*p_end = r_t_dest.p_Data() + r_t_dest.n_Size();
	// output buffer (size will change dynamically)

	for(const uint8_t *p_src = r_t_src.p_Data(),
	   *p_end2 = r_t_src.p_Data() + r_t_src.n_Size(); p_src != p_end2;) {
		uint8_t n_code = *p_src;
		bool b_compressed = n_code & 1;
		int n_run_length = (n_code >> 1) + 1;
		// determine compression and run length

		if(p_src + ((b_compressed)? 2 : 1 + n_run_length) > p_end2)
			return false;
		// buffer overrun, invalid input data

		++ p_src;
		// skip code

		if(p_output + n_run_length > p_end) {
			uint32_t n_off = uint32_t(p_output - r_t_dest.p_Data());
			if(!r_t_dest.Grow(n_run_length))
				return false;
			p_output = r_t_dest.p_Data() + n_off;
			p_end = r_t_dest.p_Data() + r_t_dest.n_Size();
		}
		// make sure there's enough space

		if(b_compressed) {
			uint8_t n_data = *p_src ++;
			for(const uint8_t *p_end2 = p_output + n_run_length;
			   p_output != p_end2; ++ p_output)
				*p_output = n_data;
			// replicate the same byte n_run_length times
		} else {
			for(const uint8_t *p_end2 = p_output + n_run_length;
			   p_output != p_end2; ++ p_output, ++ p_src)
				*p_output = *p_src;
			// copy n_run_length bytes
		}
		// read data

		_ASSERTE(p_output <= p_end);
		_ASSERTE(p_end == r_t_dest.p_Data() + r_t_dest.n_Size());
		// make sure we don't cross buffer boundaries
	}
	// IRLE loop

	r_t_dest.Resize(uint32_t(p_output - r_t_dest.p_Data()));
	// shrink the buffer to it's final length

	return true;
}

/*
 *	static bool CRunLengthCodec::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest)
 *		- encodes data from r_t_src, outputs to r_t_dest (can be empty)
 *		- the output is a sequence of packets; each starts with a code byte, where
 *		  the lowest bit says whether the packet is compressed and the upper seven
 *		  bits contain length - 1 (lengths are 1 to 128). a compressed packet
 *		  continues with a single byte to be repeated, an uncompressed packet
 *		  continues with that many literal bytes
 *		- only runs of at least min_RunLength equal bytes are stored compressed
 *		- r_t_dest is grown as needed and resized to the size of encoded data at the end
 *		- returns true on success, false on failure (failure to grow r_t_dest)
 */
bool CRunLengthCodec::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest)
{
	const int n_max_repeats = 0x80;
	// compressor config (the longest packet, 7 bits of the code store length - 1)

	r_t_dest.Resize(r_t_dest.n_Capacity());
	uint8_t *p_output = r_t_dest.p_Data(),
		*p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
	// output buffer (size will change dynamically), use all the space there is

	for(const uint8_t *p_src = r_t_src.p_Data(),
	   *p_end = r_t_src.p_Data() + r_t_src.n_Size(); p_src != p_end;) {
		int n_uncompressed_size = int(p_end - p_src), n_compressed_size = 0;
		if(p_end - p_src > min_RunLength) {
			for(const uint8_t *p_src2 = p_src, *p_end2 = p_end - min_RunLength;
			   p_src2 != p_end2; ++ p_src2) {
				if(*p_src2 == p_src2[1]) {
					const uint8_t n_byte = *p_src2;
					int n_run_length = 1;
					while(p_src2 + n_run_length != p_end && p_src2[n_run_length] == n_byte)
						++ n_run_length;
					if(n_run_length >= min_RunLength) {
						n_compressed_size = n_run_length;
						n_uncompressed_size = int(p_src2 - p_src);
						break;
					}
				}
			}
		}
		// get size of uncompressed data, preceeding compressed data,
		// get size of compressed data as well (if no run is found,
		// all the remaining data is stored uncompressed)

		int n_add = n_uncompressed_size + (n_uncompressed_size +
			n_max_repeats - 1) / n_max_repeats;
		// literal bytes, plus one code for each (up to) n_max_repeats of them
		if(n_compressed_size)
			n_add += 2 * ((n_compressed_size + n_max_repeats - 1) / n_max_repeats);
		// two bytes for each (up to) n_max_repeats bytes of the run; note the division
		// must be carried out before the multiplication, otherwise the result is too big
		if(p_out_end - p_output < n_add) {
			const size_t n_off = p_output - r_t_dest.p_Data();
			if(!r_t_dest.Grow(n_add) || !r_t_dest.Resize(r_t_dest.n_Capacity()))
				return false;
			p_output = r_t_dest.p_Data() + n_off;
			p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
			// the buffer might have been reallocated, get the pointers again

			_ASSERTE(p_out_end - p_output >= n_add);
		}
		// make sure there's enough space (compare sizes, adding n_add
		// to p_output could yield pointer far beyond the buffer)

		while(n_uncompressed_size) {
			const int n_write = min(n_uncompressed_size, n_max_repeats);
			_ASSERTE(!((n_write - 1) & 0x80));

			*p_output ++ = (n_write - 1) << 1;
			memcpy(p_output, p_src, n_write * sizeof(uint8_t));
			// code with the lowest bit cleared, then the literal bytes

			p_output += n_write;
			n_uncompressed_size -= n_write;
			p_src += n_write;

			_ASSERTE(p_output <= p_out_end);
		}
		// write uncompressed part

		while(n_compressed_size) {
			const int n_write = min(n_compressed_size, n_max_repeats);
			_ASSERTE(!((n_write - 1) & 0x80));

			*p_output ++ = ((n_write - 1) << 1) | 1;
			*p_output ++ = *p_src;
			// code with the lowest bit set, then the repeated byte

			n_compressed_size -= n_write;
			p_src += n_write;

			_ASSERTE(p_output <= p_out_end);
		}
		// write compressed part
	}
	// RLE loop

	_ASSERTE(p_output <= p_out_end);
	r_t_dest.Resize(uint32_t(p_output - r_t_dest.p_Data()));
	// shrink the buffer to it's final length

	return true;
}

/*
 *								=== ~CRunLengthCodec ===
 */

/*
 *								=== CModifiedRLECodec ===
 */

/**
 *	@def __MODIFIED_RLE_USE_RLE_EXP
 *	@brief if defined, uses the exponential RLE. otherwise uses naive RLE
 *
 *	This is a compile-time switch for how CModifiedRLECodec::Decode() interprets the
 *	stream of run lengths. If defined, a run of (n_min_run_length - 1 + k) equal bytes
 *	in the data stream takes k bytes from the run stream, forming a single run length
 *	(least significant byte first). If not defined, the length of a run in the data
 *	stream must be a multiple of n_min_run_length, and each n_min_run_length equal
 *	bytes take a single byte from the run stream.
 */
#define __MODIFIED_RLE_USE_RLE_EXP

/*
 *	static bool CModifiedRLECodec::Decode(const TBuffer &r_t_src,
 *		const TBuffer &r_t_src_runs, TBuffer &r_t_dest)
 *		- decodes data from r_t_src / r_t_src_runs, outputs to r_t_dest (can be empty)
 *		- r_t_src contains literal bytes; a sequence of at least n_min_run_length
 *		  equal bytes in it marks a run, length of which is stored in r_t_src_runs
 *		- if __MODIFIED_RLE_USE_RLE_EXP is defined, a marker of (n_min_run_length - 1 + k)
 *		  bytes takes k bytes from r_t_src_runs (least significant byte first); the decoded
 *		  run length is that value, plus n_min_run_length, plus 256^(k - 1) - 1
 *		- otherwise the marker length must be a multiple of n_min_run_length and each
 *		  n_min_run_length bytes of it take one byte from r_t_src_runs, adding
 *		  that byte plus n_min_run_length to the decoded run length
 *		- r_t_dest is grown as needed and resized to the size of decoded data at the end
 *		- returns true on success, false on failure (r_t_src_runs too short, malformed
 *		  marker, run length which does not fit in int, failure to grow r_t_dest)
 */
bool CModifiedRLECodec::Decode(const TBuffer &r_t_src, const TBuffer &r_t_src_runs, TBuffer &r_t_dest)
{
	r_t_dest.Resize(r_t_dest.n_Capacity());
	uint8_t *p_output = r_t_dest.p_Data(),
		*p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
	// output buffer (size will change dynamically), use all the space there is

	const uint8_t *p_src = r_t_src.p_Data(), *p_end = p_src + r_t_src.n_Size(),
		*p_rl_src = r_t_src_runs.p_Data(), *p_rl_end = p_rl_src + r_t_src_runs.n_Size();
	// source buffers

	const int n_max_size = 0x7fffffff;
	// the greatest size the int counters below can hold

	while(p_src != p_end) {
		int n_uncompressed_size = int(p_end - p_src), n_compressed_num = 0;
		if(p_end - p_src >= n_min_run_length) {
			for(const uint8_t *p_src2 = p_src, *p_end2 = p_end - n_min_run_length + 1;
			   p_src2 != p_end2; ++ p_src2) {
				const uint8_t n_byte = *p_src2;
				int n_run_length = 1;
				while(p_src2 + n_run_length != p_end && p_src2[n_run_length] == n_byte)
					++ n_run_length;
				// count the equal bytes; the end of the buffer is checked before every
				// read, no matter what the value of n_min_run_length is

				if(n_run_length >= n_min_run_length) {
					n_compressed_num = n_run_length;
					n_uncompressed_size = int(p_src2 - p_src);
					break;
				}
			}
		}
		// get size of uncompressed data, preceeding compressed data,
		// get length of the run marker as well (stays zero if there is none)

#ifdef __MODIFIED_RLE_USE_RLE_EXP
		int n_compressed_size = 0;
		if(n_compressed_num) {
			if(n_compressed_num < n_min_run_length)
				return false;
			// must be either 0 or more than n_min_run_length

			const int n_rle_byte_num = n_compressed_num - (n_min_run_length - 1);
			if(n_rle_byte_num > int(sizeof(uint32_t)))
				return false;
			// such run length would not fit in 32 bits (the shifts below would not be defined)

			if(p_rl_end - p_rl_src < n_rle_byte_num)
				return false;
			// not enough data in the run length buffer

			uint32_t n_stored_length = 0;
			for(int i = 0; i < n_rle_byte_num; ++ i, ++ p_rl_src)
				n_stored_length += uint32_t(*p_rl_src) << (8 * i);
			// read the stored value, least significant byte first (unsigned, can not overflow)

			const uint32_t n_bias = uint32_t(n_min_run_length) +
				((uint32_t(1) << ((n_rle_byte_num - 1) * 8)) - 1);
			if(n_bias > uint32_t(n_max_size) ||
			   n_stored_length > uint32_t(n_max_size) - n_bias)
				return false;
			// the run is too long

			n_compressed_size = int(n_stored_length + n_bias);
			// calculate length of compressed data
		}
#else // __MODIFIED_RLE_USE_RLE_EXP
		if(n_compressed_num % n_min_run_length)
			return false;
		n_compressed_num /= n_min_run_length;
		// must be multiplies of n_min_run_length

		int n_compressed_size = 0;
		if(p_rl_end - p_rl_src < n_compressed_num)
			return false;
		// not enough data in the run length buffer
		for(int i = 0; i < n_compressed_num; ++ i, ++ p_rl_src) {
			const int n_part = int(*p_rl_src) + n_min_run_length;
			if(n_compressed_size > n_max_size - n_part)
				return false;
			// the run is too long
			n_compressed_size += n_part;
		}
		// calculate length of compressed data
#endif // __MODIFIED_RLE_USE_RLE_EXP

		if(n_uncompressed_size > n_max_size - n_compressed_size)
			return false;
		const int n_add = n_uncompressed_size + n_compressed_size;
		// number of bytes this iteration outputs (checked for overflow)

		if(p_out_end - p_output < n_add) {
			const size_t n_off = p_output - r_t_dest.p_Data();
			if(!r_t_dest.Grow(n_add) || !r_t_dest.Resize(r_t_dest.n_Capacity()))
				return false;
			p_output = r_t_dest.p_Data() + n_off;
			p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
			// the buffer might have been reallocated, get the pointers again

			_ASSERTE(p_out_end - p_output >= n_add);
		}
		// make sure there's enough space (compare sizes, adding n_add
		// to p_output could yield pointer far beyond the buffer)

		if(n_uncompressed_size) {
			memcpy(p_output, p_src, n_uncompressed_size);
			p_output += n_uncompressed_size;
			p_src += n_uncompressed_size;
		}
		// copy uncompressed data

		if(n_compressed_size) {
			memset(p_output, *p_src, n_compressed_size);
			p_output += n_compressed_size;
		}
		// fill compressed data; p_src may only be read if there is a run,
		// otherwise it points to the end of the source buffer

#ifdef __MODIFIED_RLE_USE_RLE_EXP
		p_src += n_compressed_num;
#else // __MODIFIED_RLE_USE_RLE_EXP
		p_src += n_min_run_length * n_compressed_num;
#endif // __MODIFIED_RLE_USE_RLE_EXP
		// skip the run marker

		_ASSERTE(p_output <= p_out_end);
		_ASSERTE(p_src <= p_end);
	}
	// decompress modified RLE

	_ASSERTE(p_src == p_end);
	_ASSERTE(p_rl_src == p_rl_end); // t_odo - fix this! (save compressed data, compare output, etc.)
	// make sure we've read both input buffers

	_ASSERTE(p_output <= p_out_end);
	r_t_dest.Resize(uint32_t(p_output - r_t_dest.p_Data()));
	// shrink the buffer to it's final length

	return true;
}

/**
 *	@brief Encodes data using the modified run-length scheme.
 *
 *	The source is split to stretches of literal bytes and to runs of at least
 *	n_min_run_length equal bytes. Literals are copied to r_t_dest verbatim. A run
 *	leaves only a few repeats of its byte in r_t_dest, the length of the run goes
 *	to r_t_dest_runs. The exact layout depends on __MODIFIED_RLE_USE_RLE_EXP:
 *		- if not defined, the run is cut to chunks of at most 0xff + n_min_run_length
 *		  bytes; each chunk writes n_min_run_length repeats of the byte to r_t_dest
 *		  and a single byte (chunk length - n_min_run_length) to r_t_dest_runs
 *		- if defined, the run writes n_min_run_length + b - 1 repeats of the byte
 *		  to r_t_dest and b bytes of the biased run length (least significant byte
 *		  first) to r_t_dest_runs, b being the number of run length bytes
 *
 *	@param[in] r_t_src is the data to be encoded
 *	@param[out] r_t_dest receives the literals and the run markers (can be empty)
 *	@param[out] r_t_dest_runs receives the run lengths (can be empty)
 *
 *	@return Returns true on success, false on failure (one of the output
 *		buffers could not be enlarged).
 */
bool CModifiedRLECodec::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, TBuffer &r_t_dest_runs)
{
	const int n_max_repeats = 0xff + n_min_run_length;
	// the longest run chunk a single run length byte can describe
	// (only used if __MODIFIED_RLE_USE_RLE_EXP is not defined)

	r_t_dest.Resize(r_t_dest.n_Capacity());
	uint8_t *p_output = r_t_dest.p_Data(),
		*p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
	// output buffer for literals and run markers; starts at the current
	// capacity of r_t_dest and is enlarged on demand in the loop below

	r_t_dest_runs.Resize(r_t_dest_runs.n_Capacity());
	uint8_t *p_rl_out = r_t_dest_runs.p_Data(),
		*p_rl_out_end = r_t_dest_runs.p_Data() + r_t_dest_runs.n_Size();
	// output buffer for run lengths, handled the same way as the one above

	//FILE *p_fw = fopen("rle_enc.txt", "w"); // debug

	for(const uint8_t *p_src = r_t_src.p_Data(),
	   *p_end = r_t_src.p_Data() + r_t_src.n_Size(); p_src != p_end;) {
		int n_uncompressed_size = p_end - p_src, n_compressed_size = 0;
		// unless a run is found, the whole rest of the source is literal data

		if(p_src + n_min_run_length <= p_end) {
			for(const uint8_t *p_src2 = p_src, *p_end2 = p_end - n_min_run_length + 1;
			   p_src2 != p_end2; ++ p_src2) {
				// NOTE(review): p_src2[1] stays inside the source only if
				// n_min_run_length is at least 2 - confirm against its definition
				if(*p_src2 == p_src2[1]) {
					uint8_t n_byte = *p_src2;
					int n_run_length = 1;
					while(p_src2 + n_run_length != p_end && p_src2[n_run_length] == n_byte)
						++ n_run_length;
					// measure the full length of the run starting at p_src2

					if(n_run_length >= n_min_run_length) {
						n_compressed_size = n_run_length;
						n_uncompressed_size = p_src2 - p_src;
						break;
					}
					// shorter runs are not worth encoding, those stay among literals
				}
			}
		}
		// get size of uncompressed data, preceding compressed data,
		// get size of compressed data as well

		/*if(n_uncompressed_size)
		{	if(n_uncompressed_size == 2)
				fprintf(p_fw, "uncompressed %d [0x%02x, 0x%02x]\n", n_uncompressed_size, p_src[0], p_src[1]);
			else
				fprintf(p_fw, "uncompressed %d\n", n_uncompressed_size);
		}
		//_ASSERTE(n_compressed_size != 401);
		if(n_compressed_size)
			fprintf(p_fw, "compressed %d [0x%02x]\n", n_compressed_size, p_src[n_uncompressed_size]);*/
		// debug

		int n_compressed_run_byte_num;
		// number of bytes this run adds to r_t_dest_runs
		// (left uninitialized and unused if there is no run)
		{
			int n_add_rl = 0; // bytes to be written to r_t_dest_runs in this pass
			int n_add = n_uncompressed_size; // bytes to be written to r_t_dest in this pass
			if(n_compressed_size) {
#ifdef __MODIFIED_RLE_USE_RLE_EXP
				_ASSERTE(n_compressed_size >= n_min_run_length);
				int n_bit_num = n_Log2(n_Make_POT(n_compressed_size - n_min_run_length));
				n_compressed_run_byte_num = max(1, (n_bit_num + 7) / 8);
				n_add += n_min_run_length + n_compressed_run_byte_num - 1;
				n_add_rl += n_compressed_run_byte_num;
				// the number of repeats left in the output tells
				// the decoder how many run length bytes to read
#else // __MODIFIED_RLE_USE_RLE_EXP
				n_compressed_run_byte_num = ((n_compressed_size + n_max_repeats - 1) / n_max_repeats);
				n_add += n_min_run_length * n_compressed_run_byte_num;
				n_add_rl += n_compressed_run_byte_num;
				// one run length byte and n_min_run_length repeats per chunk,
				// the number of chunks is the run length divided by n_max_repeats, rounded up
#endif // __MODIFIED_RLE_USE_RLE_EXP
			}
			if(p_output + n_add > p_out_end) {
				size_t n_off = p_output - r_t_dest.p_Data();
				if(!r_t_dest.Grow(n_add) || !r_t_dest.Resize(r_t_dest.n_Capacity()))
					return false;
				p_output = r_t_dest.p_Data() + n_off;
				p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
				_ASSERTE(p_output + n_add <= p_out_end);
				// the storage might have moved, the pointers are rebuilt from the offset
			}
			if(p_rl_out + n_add_rl > p_rl_out_end) {
				size_t n_off = p_rl_out - r_t_dest_runs.p_Data();
				if(!r_t_dest_runs.Grow(n_add_rl) || !r_t_dest_runs.Resize(r_t_dest_runs.n_Capacity()))
					return false;
				p_rl_out = r_t_dest_runs.p_Data() + n_off;
				p_rl_out_end = r_t_dest_runs.p_Data() + r_t_dest_runs.n_Size();
				_ASSERTE(p_rl_out + n_add_rl <= p_rl_out_end);
				// the storage might have moved, the pointers are rebuilt from the offset
			}
		}
		// make sure there's enough space

		memcpy(p_output, p_src, n_uncompressed_size * sizeof(uint8_t));
		p_output += n_uncompressed_size;
		p_src += n_uncompressed_size;
		_ASSERTE(p_output <= p_out_end);
		// write uncompressed part

#ifdef __MODIFIED_RLE_USE_RLE_EXP
		if(n_compressed_size) {
			for(int i = 0; i < n_min_run_length + n_compressed_run_byte_num - 1; ++ i, ++ p_output)
				*p_output = *p_src;
			p_src += n_compressed_size;
			// write output data

			n_compressed_size -= n_min_run_length;
			_ASSERTE(n_compressed_size >= (1 << ((n_compressed_run_byte_num - 1) * 8)) - 1);
			n_compressed_size -= (1 << ((n_compressed_run_byte_num - 1) * 8)) - 1;
			// subtract minimal n_compressed_size for a given n_compressed_run_byte_num

			// todo - implement true RLE exp (or check if this gives the same results)
			// note probably not, the number of n_compressed_run_byte_num written to p_rl_out
			// may be too high (although the bytes are nulls)

			for(int i = 0; i < n_compressed_run_byte_num; ++ i, n_compressed_size >>= 8, ++ p_rl_out) {
				//_ASSERTE((!i && n_compressed_run_byte_num == 1) || n_compressed_size); // number of bytes calculated correctly
				// does not apply if we subtract minimal n_compressed_size for a given n_compressed_run_byte_num

				*p_rl_out = n_compressed_size & 0xff;
			}
			_ASSERTE(!n_compressed_size); // number of bytes calculated correctly
			// write run length, the least significant byte goes first

			_ASSERTE(p_output <= p_out_end);
			_ASSERTE(p_rl_out <= p_rl_out_end);
			// make sure we didn't overwrite
		}
#else // __MODIFIED_RLE_USE_RLE_EXP
		while(n_compressed_size) {
			int n_write = min(n_compressed_size, n_max_repeats);
			// length of this chunk of the run

			if(n_compressed_size > n_write &&
			   n_compressed_size - n_min_run_length < n_write)
				n_write = n_compressed_size - n_min_run_length;
			_ASSERTE(n_compressed_size == n_write ||
				n_compressed_size - n_write >= n_min_run_length);
			// in case the next run would be encoded as negative, shrink this one

			_ASSERTE(n_write >= n_min_run_length); // make sure it won't underflow
			_ASSERTE((n_write - n_min_run_length) < 256); // make sure it won't overflow

			*p_rl_out ++ = n_write - n_min_run_length;
			// write run length, less the repeats that are stored in the output

			for(int i = 0; i < n_min_run_length; ++ i)
				*p_output ++ = *p_src;
			// write run value, n_min_run_length times

			n_compressed_size -= n_write;
			p_src += n_write;

			_ASSERTE(p_output <= p_out_end);
			_ASSERTE(p_rl_out <= p_rl_out_end);
		}
#endif // __MODIFIED_RLE_USE_RLE_EXP
		// write compressed part
	}
	// RLE loop

	//fclose(p_fw); // debug

	_ASSERTE(p_output <= p_out_end);
	_ASSERTE(p_rl_out <= p_rl_out_end);
	r_t_dest.Resize(uint32_t(p_output - r_t_dest.p_Data()));
	r_t_dest_runs.Resize(uint32_t(p_rl_out - r_t_dest_runs.p_Data()));
	// shrink the buffers to their final lengths

	return true;
}

/*
 *								=== ~CModifiedRLECodec ===
 */

/*
 *								=== CHuffmanCodec ===
 */

/**
 *	@brief Utility bit stream writer, shared by the huffman codecs in this file.
 *
 *	Appends n_bit_num low bits of n_value to the bit stream, starting with the most
 *	significant one. Bits are gathered in r_n_byte; every time eight of them are
 *	collected, the byte is stored at r_p_output and the buffer is enlarged if needed.
 *
 *	@param[in] n_value is the code word to be written
 *	@param[in] n_bit_num is length of the code word, in bits (nothing is written if not positive)
 *	@param[in,out] r_n_byte is the byte being assembled
 *	@param[in,out] r_n_bit_num is number of bits gathered in r_n_byte so far (0 to 7)
 *	@param[in,out] r_p_output is the write position in r_t_out_buffer
 *	@param[in,out] r_p_out_end is the end of the writable space in r_t_out_buffer
 *	@param[in,out] r_t_out_buffer is the buffer r_p_output and r_p_out_end point to
 *
 *	@return Returns true on success, false if the buffer could not be enlarged.
 */
static inline bool Encode_Symbol(int n_value, int n_bit_num,
	uint8_t &r_n_byte, int &r_n_bit_num, uint8_t *&r_p_output,
	uint8_t *&r_p_out_end, TBuffer &r_t_out_buffer)
{
	while(n_bit_num > 0) {
		-- n_bit_num;
		r_n_byte = uint8_t((r_n_byte << 1) | ((n_value >> n_bit_num) & 1));
		// shift the next bit of the code word in, the high bits go first

		if(r_n_bit_num != 7) {
			++ r_n_bit_num;
			continue;
		}
		// the byte is not complete yet

		r_n_bit_num = 0;
		// this was the eighth bit, start over with the next byte

		_ASSERTE(r_p_output <= r_p_out_end);
		if(r_p_output == r_p_out_end) {
			const size_t n_written = r_p_output - r_t_out_buffer.p_Data();
			if(!r_t_out_buffer.Grow(1))
				return false;
			uint8_t *p_begin = r_t_out_buffer.p_Data();
			r_p_output = p_begin + n_written;
			r_p_out_end = p_begin + r_t_out_buffer.n_Size();
		}
		// out of space; grow the buffer and rebuild both pointers

		*r_p_output ++ = r_n_byte;
		// store the finished byte
	}

	return true;
}

// t_odo - separate huffman tree from huffman codec (template it?)
// t_odo - create second version of huffman codec for encoding runs of zeroes
// (builds tree for symbols and tree for encoding length of runs in case symbol is a zero)

/**
 *	@brief Decodes data produced by CHuffmanCodec::Encode().
 *
 *	The input is read as: uint32_t length of the unpacked data, max_CodeBitNum
 *	int32_t numbers of codes of lengths 1 to max_CodeBitNum, the symbol table
 *	(one byte per code) and finally the bit stream with the code words
 *	(read from the most significant bit of each byte).
 *
 *	@param[in] r_t_in_buffer is the compressed data
 *	@param[out] r_t_out_buffer is resized to the unpacked length and filled
 *		with the decoded data (can be empty)
 *
 *	@return Returns true on success, false on failure (truncated or malformed
 *		input, or the output buffer could not be allocated).
 */
bool CHuffmanCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	const size_t n_fixed_header_size = sizeof(uint32_t) + sizeof(int32_t) * max_CodeBitNum;
	if(r_t_in_buffer.n_Size() < n_fixed_header_size)
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_input_end = p_input + r_t_in_buffer.n_Size();
	// input buffer pointers

	uint32_t n_unpack_length = *(const uint32_t*)p_input;
	p_input += sizeof(uint32_t);
	// get size of uncompressed data

	const int32_t *p_code_num = (const int32_t*)p_input; // numbers of codes of different lengths
	p_input += sizeof(int32_t) * max_CodeBitNum;
	// get numbers of codes

	uint32_t p_min_code[max_CodeBitNum];
	uint32_t p_max_code[max_CodeBitNum];
	int32_t p_table_off[max_CodeBitNum];
	// minimal / maximal codes for given lengths and offsets to symbol table

	int n_symbol_num = 0;
	for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
		if(p_code_num[i] < 0)
			return false;
		// a negative number of codes can only come from a damaged header

		p_min_code[i] = n_cw;
		p_max_code[i] = (n_cw += p_code_num[i]);
		p_table_off[i] = n_symbol_num - p_min_code[i];
		n_symbol_num += p_code_num[i];
	}
	// calculate number of symbols, table indices and min / max code values
	// (p_table_off already has the minimal code subtracted, so a code
	// word can be added to it directly to get index of the symbol)

	if(size_t(n_symbol_num) > size_t(p_input_end - p_input))
		return false;
	// check size of input; the whole symbol table must be there (this used to
	// be calculated with sizeof(p_code_num), which is size of a pointer and not
	// of the table of code numbers, and let truncated inputs through)

	const uint8_t *p_symbol = p_input;
	p_input += n_symbol_num * sizeof(uint8_t);
	// symbols are read directly from the input buffer

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	// bit reader; n_bit_num is number of bits still not consumed from n_byte

	for(uint8_t *p_dest = r_t_out_buffer.p_Data(),
	   *p_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size(); p_dest != p_end; ++ p_dest) {
		for(uint32_t i = 0, n_code = 0;;) {
			if(!n_bit_num) {
				if(p_input == p_input_end)
					return false;
				// not enough data to decompress

				n_byte = *p_input ++;
				n_bit_num = 7;
			} else
				-- n_bit_num;
			n_code = (n_code << 1) | (n_byte >> 7);
			n_byte <<= 1;
			// get a single bit from input stream, add bit to code

			if(n_code >= p_min_code[i] && n_code < p_max_code[i]) {
				*p_dest = p_symbol[n_code + p_table_off[i]];
				break;
			}
			// see if it's valid code for this bit length

			if(++ i == max_CodeBitNum)
				return false; // invalid code
		}

		// todo - implement lookahead table to guess the length of the symbol based on the first byte
	}
	// decode data

	_ASSERTE(p_input == p_input_end);
	// make sure we've read the whole input buffer

	return true;
}

/*
 *	static bool CHuffmanCodec::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
 *		- encodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CHuffmanCodec::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	CHuff8 huff_tree;

	{
		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		if(!huff_tree.CalculateSymbolFrequencies(p_src, p_end))
			return false;
	}
	// calculate symbol frequencies

	if(!huff_tree.Assign_CodeWords())
		return false;
	// create huffman tree

	size_t n_code_table_size = huff_tree.n_CodeTable_Size() * sizeof(int32_t);
	size_t n_symbol_table_size = huff_tree.n_SymbolTable_Size() * sizeof(uint8_t);
	size_t n_header_size = sizeof(uint32_t) + n_code_table_size + n_symbol_table_size;
	// we're going to need to store the huffman tree
	// (numbers of codes with distinct lengths and symbols)

	if(!r_t_out_buffer.Resize(n_header_size, false))
		return false;
	// make sure there's enough space for header

	uint8_t *p_output = r_t_out_buffer.p_Data(),
		*p_end = r_t_out_buffer.p_Data() + n_header_size;
	// output buffer (size will change dynamically)

	*(uint32_t*)p_output = uint32_t(r_t_in_buffer.n_Size());
	p_output += sizeof(uint32_t);
	// write size of decompressed stream so decompressor can work in a single pass

	memcpy(p_output, huff_tree.p_CodeTable(), n_code_table_size);
	p_output += n_code_table_size;
	huff_tree.Get_SymbolTable(p_output, n_symbol_table_size);
	p_output += n_symbol_table_size;
	_ASSERTE(p_output == p_end);
	// write numbers of codes of different lengths and associated symbols

	huff_tree.SortFrequencies_BySymbol();
	// sort by symbols so we can search using lower_bound (binary search)
	// note that freq_list has symbols with zero frequency removed
	// in CreateHuffmanTree(), searching is therefore inevitable (could
	// build lookup table though)

	{
		uint8_t m_n_byte = 0;
		int m_n_bit_num = 0;
		// bit writer

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end2 = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(; p_src != p_end2; ++ p_src) {
			const CHuff8::TFrequency &r_freq =
				huff_tree.r_LookupSymbol(*p_src);
			_ASSERTE(r_freq.n_symbol == *p_src);
			// find symbol in huffman tree

			if(!Encode_Symbol(r_freq.n_code_word, r_freq.n_code_length,
			   m_n_byte, m_n_bit_num, p_output, p_end, r_t_out_buffer))
				return false;
			// write symbol as series of bits
		}

		if(m_n_bit_num) {
			m_n_byte <<= 8 - m_n_bit_num;
			// padd with 0-s

			if(p_output == p_end) {
				uint32_t n_off = uint32_t(p_output - r_t_out_buffer.p_Data());
				if(!r_t_out_buffer.Grow(1))
					return false;
				p_output = r_t_out_buffer.p_Data() + n_off;
				p_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
			}
			*p_output ++ = m_n_byte;
		}
		// output any leftover bits
	}
	// compress

	if(!r_t_out_buffer.Resize(uint32_t(p_output - r_t_out_buffer.p_Data()))) {
		_ASSERTE(0);
		return false;
	}
	// shrink the buffer to it's final length

	return true;
}

/*
 *								=== ~CHuffmanCodec ===
 */

/*
 *								=== CRLE0_HuffmanCodec ===
 */

/**
 *	@brief Decodes data produced by CRLE0_HuffmanCodec::Encode().
 *
 *	The input is read as: uint32_t length of the unpacked data, huffman table for
 *	symbols (max_CodeBitNum int32_t numbers of codes, followed by one byte per
 *	symbol), huffman table for zero run lengths (max_CodeBitNum int32_t numbers of
 *	codes, followed by one uint16_t per run length) and the bit stream. In the bit
 *	stream, every zero symbol is followed by a code of the length of the zero run.
 *
 *	@param[in] r_t_in_buffer is the compressed data
 *	@param[out] r_t_out_buffer is resized to the unpacked length and filled
 *		with the decoded data (can be empty)
 *
 *	@return Returns true on success, false on failure (truncated or malformed
 *		input, or the output buffer could not be allocated).
 */
bool CRLE0_HuffmanCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// input buffer pointer

	uint32_t n_unpack_length = *(const uint32_t*)p_input;
	p_input += sizeof(uint32_t);
	// get size of uncompressed data

	const int32_t *p_code_num;
	int n_symbol_num = 0;
	uint32_t p_min_code[max_CodeBitNum];
	uint32_t p_max_code[max_CodeBitNum];
	int32_t p_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint8_t *p_symbol;
	{
		if(p_input + sizeof(int32_t) * max_CodeBitNum > p_src_end)
			return false;
		p_code_num = (const int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_code_num[i] < 0)
				return false;
			// a negative number of codes can only come from a damaged header

			p_min_code[i] = n_cw;
			p_max_code[i] = (n_cw += p_code_num[i]);
			p_table_off[i] = n_symbol_num - p_min_code[i];
			n_symbol_num += p_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		if(p_input + n_symbol_num * sizeof(uint8_t) > p_src_end)
			return false;
		p_symbol = p_input;
		p_input += n_symbol_num * sizeof(uint8_t);
		// symbols are read directly from the input buffer
	}
	// huffman tree for symbols

	const int32_t *p_rl_code_num;
	int n_rl_symbol_num = 0;
	uint32_t p_rl_min_code[max_CodeBitNum];
	uint32_t p_rl_max_code[max_CodeBitNum];
	int32_t p_rl_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint16_t *p_rl_symbol;
	{
		if(p_input + sizeof(int32_t) * max_CodeBitNum > p_src_end)
			return false;
		p_rl_code_num = (const int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_rl_code_num[i] < 0)
				return false;
			// a negative number of codes can only come from a damaged header

			p_rl_min_code[i] = n_cw;
			p_rl_max_code[i] = (n_cw += p_rl_code_num[i]);
			p_rl_table_off[i] = n_rl_symbol_num - p_rl_min_code[i];
			n_rl_symbol_num += p_rl_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		if(p_input + n_rl_symbol_num * sizeof(uint16_t) > p_src_end)
			return false;
		p_rl_symbol = (const uint16_t*)p_input;
		p_input += n_rl_symbol_num * sizeof(uint16_t);
		// run lengths are read directly from the input buffer
	}
	// huffman tree for run lengths

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	// bit reader; n_bit_num is number of bits still not consumed from n_byte

	const uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	for(uint8_t *p_dest = r_t_out_buffer.p_Data(); p_dest != p_dest_end;) {
		for(uint32_t i = 0, n_code = 0;;) {
			if(!n_bit_num) {
				if(p_input == p_src_end)
					return false;
				// not enough data to decompress

				n_byte = *p_input ++;
				n_bit_num = 7;
			} else
				-- n_bit_num;
			n_code = (n_code << 1) | (n_byte >> 7);
			n_byte <<= 1;
			// get a single bit from input stream, add bit to code

			if(n_code >= p_min_code[i] && n_code < p_max_code[i]) {
				*p_dest = p_symbol[n_code + p_table_off[i]];
				break;
			}
			// see if it's valid code for this bit length

			if(++ i == max_CodeBitNum)
				return false; // invalid code
		}
		// decode symbol value

		if(!*p_dest) {
			for(uint32_t i = 0, n_code = 0;;) {
				if(!n_bit_num) {
					if(p_input == p_src_end)
						return false;
					// not enough data to decompress

					n_byte = *p_input ++;
					n_bit_num = 7;
				} else
					-- n_bit_num;
				n_code = (n_code << 1) | (n_byte >> 7);
				n_byte <<= 1;
				// get a single bit from input stream, add bit to code

				if(n_code >= p_rl_min_code[i] && n_code < p_rl_max_code[i]) {
					uint16_t n_run_length = p_rl_symbol[n_code + p_rl_table_off[i]];
					if(!n_run_length || n_run_length > size_t(p_dest_end - p_dest))
						return false;
					// the run must not be empty and must fit in the rest of the output,
					// otherwise the data is damaged (and the memset below would write
					// past the end of the buffer and p_dest would skip over p_dest_end)

					memset(p_dest, 0, n_run_length * sizeof(uint8_t)); // decompress zero run
					p_dest += n_run_length;
					break;
				}
				// see if it's valid code for this bit length

				if(++ i == max_CodeBitNum)
					return false; // invalid code
			}
		} else
			++ p_dest;
		// decode run length from the second huffman tree

		// todo - implement lookahead table to guess the length of the symbol based on the first byte
	}
	// decode data

	_ASSERTE(p_input == p_src_end);
	// make sure we've read the whole input buffer

	return true;
}

bool CRLE0_HuffmanCodec::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	std::vector<CHuff8::TFrequency> symbol_freq;
	std::vector<CHuff16::TFrequency> run_length_freq;

	{
		if(!stl_ut::Resize_To_N(symbol_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)))
			return false;
		for(size_t i = 0, n = symbol_freq.size(); i < n; ++ i)
			symbol_freq[i].n_symbol = i;
		// alloc symbol frequencies

		std::map<uint16_t, size_t> run_length_set;

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(; p_src != p_end; ++ p_src) {
			++ symbol_freq[*p_src].n_frequency;
			_ASSERTE(symbol_freq[*p_src].n_frequency > 0);
			// increment symbol frequency

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				try {
					while(n_zero_run_length) {
						uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 65536 repeats

						std::map<uint16_t, size_t>::iterator p_freq_it = run_length_set.find(n_run_length);
						if(p_freq_it != run_length_set.end())
							++ (*p_freq_it).second;
						else
							run_length_set[n_run_length] = 1;
						// increment run length frequency

						if(n_zero_run_length) {
							++ symbol_freq[0].n_frequency;
							_ASSERTE(symbol_freq[0].n_frequency > 0);
						}
						// in case the run of zeroes is saved as multiple chunks,
						// increase frequency of the zero symbol as well
					}
				} catch(std::bad_alloc&) {
					return false;
				}
				// accumulate zero run length frequencies

				p_src = p_last_zero - 1;
				// shift behind the zero run
			}
		}
		// calculate frequencies

		if(!stl_ut::Resize_To_N(run_length_freq, run_length_set.size(),
		   CHuff16::TFrequency(0)))
			return false;
		std::vector<CHuff16::TFrequency>::iterator p_freq_it =
			run_length_freq.begin();
		for(std::map<uint16_t, size_t>::const_iterator p_rlf_it = run_length_set.begin(),
		   p_end_it = run_length_set.end(); p_rlf_it != p_end_it; ++ p_rlf_it, ++ p_freq_it) {
			uint16_t n_run_length = (*p_rlf_it).first;
			size_t n_frequency = (*p_rlf_it).second;
			*p_freq_it = CHuff16::TFrequency(n_run_length, n_frequency);
		}
		// copy set of frequencies to the list of frequecies
	}
	// calculate symbol frequencies

	int32_t p_sym_code_table[max_CodeBitNum];
	int32_t p_rl_code_table[max_CodeBitNum];
	if(!CHuff8::Assign_CodeWords(symbol_freq, p_sym_code_table) ||
	   !CHuff16::Assign_CodeWords(run_length_freq, p_rl_code_table))
		return false;
	// create huffman tree

	size_t n_code_table_size = max_CodeBitNum * sizeof(int32_t);
	size_t n_sym_symbol_table_size = symbol_freq.size();
	size_t n_rl_symbol_table_size = run_length_freq.size();
	size_t n_header_size = sizeof(uint32_t) + 2 * n_code_table_size +
		n_sym_symbol_table_size * sizeof(uint8_t) + n_rl_symbol_table_size * sizeof(uint16_t);
	// we're going to need to store the huffman tree
	// (numbers of codes with distinct lengths and symbols)

	if(!r_t_out_buffer.Resize(n_header_size, false))
		return false;
	// make sure there's enough space for header

	uint8_t *p_output = r_t_out_buffer.p_Data(),
		*p_out_end = r_t_out_buffer.p_Data() + n_header_size;
	// output buffer (size will change dynamically)

	*(uint32_t*)p_output = uint32_t(r_t_in_buffer.n_Size());
	p_output += sizeof(uint32_t);
	// write size of decompressed stream so decompressor can work in a single pass

	memcpy(p_output, p_sym_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_sym_symbol_table_size; ++ i, ++ p_output)
		*p_output = symbol_freq[i].n_symbol;
	memcpy(p_output, p_rl_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_rl_symbol_table_size; ++ i, p_output += sizeof(uint16_t))
		*(uint16_t*)p_output = run_length_freq[i].n_symbol;
	_ASSERTE(p_output == p_out_end);
	// write numbers of codes of different lengths and associated symbols

	CHuff8::SortFrequencies_BySymbol(symbol_freq);
	CHuff16::SortFrequencies_BySymbol(run_length_freq);
	// sort by symbols so we can search using lower_bound (binary search)
	// note that freq_list has symbols with zero frequency removed
	// in CreateHuffmanTree(), searching is therefore inevitable (could
	// build lookup table though)

	{
		uint8_t m_n_byte = 0;
		int m_n_bit_num = 0;
		// bit writer

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(int n_pass = 0; p_src != p_src_end; ++ p_src, ++ n_pass) {
			_ASSERTE(p_src < p_src_end);
			const CHuff8::TFrequency &r_freq =
				CHuff8::r_LookupSymbol(*p_src, symbol_freq);
			_ASSERTE(r_freq.n_symbol == *p_src);
			// find symbol in huffman tree

			_ASSERTE(run_length_freq[0].n_symbol == 1);

			if(!Encode_Symbol(r_freq.n_code_word, r_freq.n_code_length,
			   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
				return false;
			// write symbol as series of bits

			_ASSERTE(p_src < p_src_end);
			_ASSERTE(p_output <= p_out_end);
			_ASSERTE(run_length_freq[0].n_symbol == 1);

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_src_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_src_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				while(n_zero_run_length) {
					uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
					n_zero_run_length -= n_run_length;
					// decompose the zero run to up to 65536 repeats

					const CHuff16::TFrequency &r_freq0 =
						CHuff16::r_LookupSymbol(n_run_length, run_length_freq);
					_ASSERTE(r_freq0.n_symbol == n_run_length);
					// find run length in the second huffman tree

					if(!Encode_Symbol(r_freq0.n_code_word, r_freq0.n_code_length,
					   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
						return false;
					// write symbol as series of bits

					if(n_zero_run_length) {
						if(!Encode_Symbol(r_freq.n_code_word, r_freq.n_code_length,
						   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
							return false;
					}
					// in case the zero run length is decomposed,
					// we need to write another zero symbol first
				}

				_ASSERTE(p_output <= p_out_end);

				p_src = p_last_zero - 1;
			}
			_ASSERTE(p_src < p_src_end);
			// in case the symbol was zero, encode run length and skip the occurences of the symbol
		}

		if(m_n_bit_num) {
			m_n_byte <<= 8 - m_n_bit_num;
			// padd with 0-s

			if(p_output == p_out_end) {
				uint32_t n_off = uint32_t(p_output - r_t_out_buffer.p_Data());
				if(!r_t_out_buffer.Grow(1))
					return false;
				p_output = r_t_out_buffer.p_Data() + n_off;
				p_out_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
			}
			*p_output ++ = m_n_byte;
		}
		// output any leftover bits
	}
	// compress

	if(!r_t_out_buffer.Resize(uint32_t(p_output - r_t_out_buffer.p_Data()))) {
		_ASSERTE(0);
		return false;
	}
	// shrink the buffer to it's final length

	// t_odo - this probably does something else than BW94 did. they model symbols as
	//		1) 255 symbols + all the different runs of nulls and
	//		2) 256 possible symbols that occur after a null

	return true;
}

/*
 *								=== ~CRLE0_HuffmanCodec ===
 */

/*
 *								=== CRLE0_HuffmanCodec_1 ===
 */

/**
 *	@brief Decodes data stored with three huffman tables (symbols, zero run
 *		lengths and symbols following a zero run).
 *
 *	The input is read as: uint32_t length of the unpacked data, huffman table for
 *	symbols (max_CodeBitNum int32_t numbers of codes, followed by one byte per
 *	symbol), huffman table for zero run lengths (numbers of codes, followed by one
 *	uint16_t per run length), huffman table for symbols that follow a zero run
 *	(numbers of codes, followed by one byte per symbol) and the bit stream. In the
 *	bit stream, every zero symbol is followed by a code of the length of the zero
 *	run, and the symbol right after the run is decoded using the third table.
 *
 *	@param[in] r_t_in_buffer is the compressed data
 *	@param[out] r_t_out_buffer is resized to the unpacked length and filled
 *		with the decoded data (can be empty)
 *
 *	@return Returns true on success, false on failure (truncated or malformed
 *		input, or the output buffer could not be allocated).
 */
bool CRLE0_HuffmanCodec_1::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// input buffer pointer

	uint32_t n_unpack_length = *(const uint32_t*)p_input;
	p_input += sizeof(uint32_t);
	// get size of uncompressed data

	const int32_t *p_code_num;
	int n_symbol_num = 0;
	uint32_t p_min_code[max_CodeBitNum];
	uint32_t p_max_code[max_CodeBitNum];
	int32_t p_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint8_t *p_symbol;
	{
		if(p_input + sizeof(int32_t) * max_CodeBitNum > p_src_end)
			return false;
		p_code_num = (const int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_code_num[i] < 0)
				return false;
			// a negative number of codes can only come from a damaged header

			p_min_code[i] = n_cw;
			p_max_code[i] = (n_cw += p_code_num[i]);
			p_table_off[i] = n_symbol_num - p_min_code[i];
			n_symbol_num += p_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		if(p_input + n_symbol_num * sizeof(uint8_t) > p_src_end)
			return false;
		p_symbol = p_input;
		p_input += n_symbol_num * sizeof(uint8_t);
		// symbols are read directly from the input buffer
	}
	// huffman tree for symbols

	const int32_t *p_rl_code_num;
	int n_rl_symbol_num = 0;
	uint32_t p_rl_min_code[max_CodeBitNum];
	uint32_t p_rl_max_code[max_CodeBitNum];
	int32_t p_rl_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint16_t *p_rl_symbol;
	{
		if(p_input + sizeof(int32_t) * max_CodeBitNum > p_src_end)
			return false;
		p_rl_code_num = (const int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_rl_code_num[i] < 0)
				return false;
			// a negative number of codes can only come from a damaged header

			p_rl_min_code[i] = n_cw;
			p_rl_max_code[i] = (n_cw += p_rl_code_num[i]);
			p_rl_table_off[i] = n_rl_symbol_num - p_rl_min_code[i];
			n_rl_symbol_num += p_rl_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		if(p_input + n_rl_symbol_num * sizeof(uint16_t) > p_src_end)
			return false;
		p_rl_symbol = (const uint16_t*)p_input;
		p_input += n_rl_symbol_num * sizeof(uint16_t);
		// run lengths are read directly from the input buffer
	}
	// huffman tree for run lengths

	const int32_t *p_a0_code_num;
	int n_a0_symbol_num = 0;
	uint32_t p_a0_min_code[max_CodeBitNum];
	uint32_t p_a0_max_code[max_CodeBitNum];
	int32_t p_a0_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint8_t *p_a0_symbol;
	{
		if(p_input + sizeof(int32_t) * max_CodeBitNum > p_src_end)
			return false;
		p_a0_code_num = (const int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_a0_code_num[i] < 0)
				return false;
			// a negative number of codes can only come from a damaged header

			p_a0_min_code[i] = n_cw;
			p_a0_max_code[i] = (n_cw += p_a0_code_num[i]);
			p_a0_table_off[i] = n_a0_symbol_num - p_a0_min_code[i];
			n_a0_symbol_num += p_a0_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		if(p_input + n_a0_symbol_num * sizeof(uint8_t) > p_src_end)
			return false;
		p_a0_symbol = p_input;
		p_input += n_a0_symbol_num * sizeof(uint8_t);
		// symbols are read directly from the input buffer
	}
	// huffman tree for symbols after zero runs

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	// bit reader; n_bit_num is number of bits still not consumed from n_byte

	bool b_had_zero_run = false;
	// set if the previous thing decoded was a zero run (selects the table for the next symbol)

	const uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	for(uint8_t *p_dest = r_t_out_buffer.p_Data(); p_dest != p_dest_end;) {
		for(uint32_t i = 0, n_code = 0;;) {
			if(!n_bit_num) {
				if(p_input == p_src_end)
					return false;
				// not enough data to decompress

				n_byte = *p_input ++;
				n_bit_num = 7;
			} else
				-- n_bit_num;
			n_code = (n_code << 1) | (n_byte >> 7);
			n_byte <<= 1;
			// get a single bit from input stream, add bit to code

			if(!b_had_zero_run) {
				if(n_code >= p_min_code[i] && n_code < p_max_code[i]) {
					*p_dest = p_symbol[n_code + p_table_off[i]];
					break;
				}
			} else {
				if(n_code >= p_a0_min_code[i] && n_code < p_a0_max_code[i]) {
					*p_dest = p_a0_symbol[n_code + p_a0_table_off[i]];
					b_had_zero_run = false;
					break;
				}
			}
			// see if it's valid code for this bit length

			if(++ i == max_CodeBitNum)
				return false; // invalid code
		}
		// decode symbol value

		if(!*p_dest) {
			for(uint32_t i = 0, n_code = 0;;) {
				if(!n_bit_num) {
					if(p_input == p_src_end)
						return false;
					// not enough data to decompress

					n_byte = *p_input ++;
					n_bit_num = 7;
				} else
					-- n_bit_num;
				n_code = (n_code << 1) | (n_byte >> 7);
				n_byte <<= 1;
				// get a single bit from input stream, add bit to code

				if(n_code >= p_rl_min_code[i] && n_code < p_rl_max_code[i]) {
					uint16_t n_run_length = p_rl_symbol[n_code + p_rl_table_off[i]];
					if(!n_run_length || n_run_length > size_t(p_dest_end - p_dest))
						return false;
					// the run must not be empty and must fit in the rest of the output,
					// otherwise the data is damaged (and the memset below would write
					// past the end of the buffer and p_dest would skip over p_dest_end)

					memset(p_dest, 0, n_run_length * sizeof(uint8_t)); // decompress zero run
					p_dest += n_run_length;
					break;
				}
				// see if it's valid code for this bit length

				if(++ i == max_CodeBitNum)
					return false; // invalid code
			}

			b_had_zero_run = true;
		} else
			++ p_dest;
		// decode run length from the second huffman tree

		// todo - implement lookahead table to guess the length of the symbol based on the first byte
	}
	// decode data

	_ASSERTE(p_input == p_src_end);
	// make sure we've read the whole input buffer

	return true;
}

bool CRLE0_HuffmanCodec_1::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	std::vector<CHuff8::TFrequency> symbol_freq;
	std::vector<CHuff16::TFrequency> run_length_freq;
	std::vector<CHuff8::TFrequency> symbol_after_zero_run_freq;

	{
		if(!stl_ut::Resize_To_N(symbol_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)) ||
		   !stl_ut::Resize_To_N(symbol_after_zero_run_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)))
			return false;
		for(size_t i = 0, n = symbol_freq.size(); i < n; ++ i) {
			symbol_freq[i].n_symbol = i;
			symbol_after_zero_run_freq[i].n_symbol = i;
		}
		// alloc symbol frequencies

		std::map<uint16_t, size_t> run_length_set;

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_end; ++ p_src) {
			if(!b_had_zero_run) {
				++ symbol_freq[*p_src].n_frequency;
				_ASSERTE(symbol_freq[*p_src].n_frequency > 0);
				// encodes all symbols, including the nulls
			} else {
				_ASSERTE(*p_src != 0); // there shouldn't be a 0 after a zero run
				b_had_zero_run = false; // clear the flag
				++ symbol_after_zero_run_freq[*p_src].n_frequency;
				_ASSERTE(symbol_after_zero_run_freq[*p_src].n_frequency > 0);
				// encodes symbols occuring after zero runs
			}
			// increment symbol frequency

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				try {
					while(n_zero_run_length) {
						uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 65536 repeats

						std::map<uint16_t, size_t>::iterator p_freq_it = run_length_set.find(n_run_length);
						if(p_freq_it != run_length_set.end())
							++ (*p_freq_it).second;
						else
							run_length_set[n_run_length] = 1;
						// increment run length frequency

						if(n_zero_run_length) {
							++ symbol_after_zero_run_freq[0].n_frequency;
							_ASSERTE(symbol_after_zero_run_freq[0].n_frequency > 0);
						}
						// in case the run of zeroes is saved as multiple chunks,
						// increase frequency of the zero symbol as well
					}
				} catch(std::bad_alloc&) {
					return false;
				}
				// accumulate zero run length frequencies

				p_src = p_last_zero - 1;
				// shift behind the zero run

				b_had_zero_run = true;
				// set the context flag
			}
		}
		// calculate frequencies

		if(!stl_ut::Resize_To_N(run_length_freq, run_length_set.size(),
		   CHuff16::TFrequency(0)))
			return false;
		std::vector<CHuff16::TFrequency>::iterator p_freq_it =
			run_length_freq.begin();
		for(std::map<uint16_t, size_t>::const_iterator p_rlf_it = run_length_set.begin(),
		   p_end_it = run_length_set.end(); p_rlf_it != p_end_it; ++ p_rlf_it, ++ p_freq_it) {
			uint16_t n_run_length = (*p_rlf_it).first;
			size_t n_frequency = (*p_rlf_it).second;
			*p_freq_it = CHuff16::TFrequency(n_run_length, n_frequency);
		}
		// copy set of frequencies to the list of frequecies
	}
	// calculate symbol frequencies

	int32_t p_sym_code_table[max_CodeBitNum];
	int32_t p_rl_code_table[max_CodeBitNum];
	int32_t p_sym0_code_table[max_CodeBitNum];
	if(!CHuff8::Assign_CodeWords(symbol_freq, p_sym_code_table) ||
	   !CHuff16::Assign_CodeWords(run_length_freq, p_rl_code_table) ||
	   !CHuff8::Assign_CodeWords(symbol_after_zero_run_freq, p_sym0_code_table))
		return false;
	// create huffman tree

	size_t n_code_table_size = max_CodeBitNum * sizeof(int32_t);
	size_t n_sym_symbol_table_size = symbol_freq.size();
	size_t n_sym0_symbol_table_size = symbol_after_zero_run_freq.size();
	size_t n_rl_symbol_table_size = run_length_freq.size();
	size_t n_header_size = sizeof(uint32_t) + 3 * n_code_table_size +
		(n_sym0_symbol_table_size + n_sym_symbol_table_size) * sizeof(uint8_t) +
		n_rl_symbol_table_size * sizeof(uint16_t);
	// we're going to need to store the huffman tree
	// (numbers of codes with distinct lengths and symbols)

	if(!r_t_out_buffer.Resize(n_header_size, false))
		return false;
	// make sure there's enough space for header

	uint8_t *p_output = r_t_out_buffer.p_Data(),
		*p_out_end = r_t_out_buffer.p_Data() + n_header_size;
	// output buffer (size will change dynamically)

	*(uint32_t*)p_output = uint32_t(r_t_in_buffer.n_Size());
	p_output += sizeof(uint32_t);
	// write size of decompressed stream so decompressor can work in a single pass

	memcpy(p_output, p_sym_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_sym_symbol_table_size; ++ i, ++ p_output)
		*p_output = symbol_freq[i].n_symbol;
	//
	memcpy(p_output, p_rl_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_rl_symbol_table_size; ++ i, p_output += sizeof(uint16_t))
		*(uint16_t*)p_output = run_length_freq[i].n_symbol;
	//
	memcpy(p_output, p_sym0_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_sym0_symbol_table_size; ++ i, ++ p_output)
		*p_output = symbol_after_zero_run_freq[i].n_symbol;
	_ASSERTE(p_output == p_out_end);
	// write numbers of codes of different lengths and associated symbols

	CHuff8::SortFrequencies_BySymbol(symbol_freq);
	CHuff16::SortFrequencies_BySymbol(run_length_freq);
	CHuff8::SortFrequencies_BySymbol(symbol_after_zero_run_freq);
	// sort by symbols so we can search using lower_bound (binary search)
	// note that freq_list has symbols with zero frequency removed
	// in CreateHuffmanTree(), searching is therefore inevitable (could
	// build lookup table though)

	{
		uint8_t m_n_byte = 0;
		int m_n_bit_num = 0;
		// bit writer

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_src_end; ++ p_src) {
			_ASSERTE(p_src < p_src_end);
			const CHuff8::TFrequency &r_freq = (b_had_zero_run)?
				CHuff8::r_LookupSymbol(*p_src, symbol_after_zero_run_freq) :
				CHuff8::r_LookupSymbol(*p_src, symbol_freq);
			_ASSERTE(r_freq.n_symbol == *p_src);
			// find symbol in huffman tree

			b_had_zero_run = false;
			// clear the flag

			if(!Encode_Symbol(r_freq.n_code_word, r_freq.n_code_length,
			   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
				return false;
			// write symbol as series of bits

			_ASSERTE(p_src < p_src_end);
			_ASSERTE(p_output <= p_out_end);

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_src_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_src_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				while(n_zero_run_length) {
					uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
					n_zero_run_length -= n_run_length;
					// decompose the zero run to up to 65536 repeats

					const CHuff16::TFrequency &r_freq0 =
						CHuff16::r_LookupSymbol(n_run_length, run_length_freq);
					_ASSERTE(r_freq0.n_symbol == n_run_length);
					// find run length in the second huffman tree

					if(!Encode_Symbol(r_freq0.n_code_word, r_freq0.n_code_length,
					   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
						return false;
					// write symbol as series of bits

					if(n_zero_run_length) {
						const CHuff8::TFrequency &r_freq_aft = 
							CHuff8::r_LookupSymbol(0, symbol_after_zero_run_freq);
						_ASSERTE(r_freq_aft.n_symbol == 0);

						if(!Encode_Symbol(r_freq_aft.n_code_word, r_freq_aft.n_code_length,
						   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
							return false;
					}
					// in case the zero run length is decomposed,
					// we need to write another zero symbol first (from the after-zero-run tree!)
				}

				_ASSERTE(p_output <= p_out_end);

				p_src = p_last_zero - 1;

				b_had_zero_run = true;
			}
			_ASSERTE(p_src < p_src_end);
			// in case the symbol was zero, encode run length and skip the occurences of the symbol
		}

		if(m_n_bit_num) {
			m_n_byte <<= 8 - m_n_bit_num;
			// padd with 0-s

			if(p_output == p_out_end) {
				uint32_t n_off = uint32_t(p_output - r_t_out_buffer.p_Data());
				if(!r_t_out_buffer.Grow(1))
					return false;
				p_output = r_t_out_buffer.p_Data() + n_off;
				p_out_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
			}
			*p_output ++ = m_n_byte;
		}
		// output any leftover bits
	}
	// compress

	if(!r_t_out_buffer.Resize(uint32_t(p_output - r_t_out_buffer.p_Data()))) {
		_ASSERTE(0);
		return false;
	}
	// shrink the buffer to it's final length

	return true;
}

/*
 *								=== ~CRLE0_HuffmanCodec_1 ===
 */

/*
 *								=== CRLE0_HuffmanCodec_2 ===
 */

/**
 *	@brief decompresses data produced by CRLE0_HuffmanCodec_2::Encode()
 *
 *	The input starts with uint32_t size of the unpacked data, followed by two
 *	(code counts table, symbol table) pairs: one for plain symbols and one for symbols
 *	following a zero run. The rest is a bit stream (most significant bit first).
 *	Each decoded zero symbol is followed by a raw 16-bit length of the zero run.
 *
 *	The input is treated as untrusted: malformed code count tables and zero runs
 *	exceeding the declared unpacked size are rejected instead of accessing
 *	memory outside of the buffers.
 *
 *	@param[in] r_t_in_buffer is the compressed data
 *	@param[out] r_t_out_buffer is the buffer to be filled with the unpacked data
 *
 *	@return Returns true on success, false on failure (not enough memory,
 *		truncated or corrupt input).
 */
bool CRLE0_HuffmanCodec_2::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// input buffer pointer

	uint32_t n_unpack_length = *(uint32_t*)p_input;
	p_input += sizeof(uint32_t);
	// get size of uncompressed data

	int32_t *p_code_num;
	int n_symbol_num = 0;
	uint32_t p_min_code[max_CodeBitNum];
	uint32_t p_max_code[max_CodeBitNum];
	int32_t p_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint8_t *p_symbol;
	{
		if(size_t(p_src_end - p_input) < sizeof(int32_t) * max_CodeBitNum)
			return false;
		// compare sizes rather than pointers (pointer arithmetic past the end could wrap)

		p_code_num = (int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		size_t n_symbol_limit = size_t(p_src_end - p_input) / sizeof(uint8_t);
		if(n_symbol_limit > 256)
			n_symbol_limit = 256;
		// the symbol table must fit in the input and there
		// can't be more symbols than there are byte values

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_code_num[i] < 0 ||
			   size_t(p_code_num[i]) > n_symbol_limit - size_t(n_symbol_num))
				return false;
			// a negative count would wrap the unsigned code range below
			// and allow reading outside of the symbol table

			p_min_code[i] = n_cw;
			p_max_code[i] = (n_cw += p_code_num[i]);
			p_table_off[i] = n_symbol_num - p_min_code[i];
			n_symbol_num += p_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		p_symbol = p_input;
		p_input += n_symbol_num * sizeof(uint8_t);
		// symbols are used in place (the table was checked to fit above)
	}
	// huffman tree for symbols

	int32_t *p_a0_code_num;
	int n_a0_symbol_num = 0;
	uint32_t p_a0_min_code[max_CodeBitNum];
	uint32_t p_a0_max_code[max_CodeBitNum];
	int32_t p_a0_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint8_t *p_a0_symbol;
	{
		if(size_t(p_src_end - p_input) < sizeof(int32_t) * max_CodeBitNum)
			return false;
		p_a0_code_num = (int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		size_t n_symbol_limit = size_t(p_src_end - p_input) / sizeof(uint8_t);
		if(n_symbol_limit > 256)
			n_symbol_limit = 256;
		// the symbol table must fit in the input and there
		// can't be more symbols than there are byte values

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_a0_code_num[i] < 0 ||
			   size_t(p_a0_code_num[i]) > n_symbol_limit - size_t(n_a0_symbol_num))
				return false;
			// reject malformed counts (see above)

			p_a0_min_code[i] = n_cw;
			p_a0_max_code[i] = (n_cw += p_a0_code_num[i]);
			p_a0_table_off[i] = n_a0_symbol_num - p_a0_min_code[i];
			n_a0_symbol_num += p_a0_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		p_a0_symbol = p_input;
		p_input += n_a0_symbol_num * sizeof(uint8_t);
		// symbols are used in place (the table was checked to fit above)
	}
	// huffman tree for symbols after zero runs

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	bool b_had_zero_run = false;
	const uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	for(uint8_t *p_dest = r_t_out_buffer.p_Data(); p_dest != p_dest_end;) {
		for(uint32_t i = 0, n_code = 0;;) {
			if(!n_bit_num) {
				if(p_input == p_src_end)
					return false;
				// not enough data to decompress

				n_byte = *p_input ++;
				n_bit_num = 7;
			} else
				-- n_bit_num;
			n_code = (n_code << 1) | n_byte >> 7;
			n_byte <<= 1;
			// get a single bit from input stream, add bit to code

			if(!b_had_zero_run) {
				if(n_code >= p_min_code[i] && n_code < p_max_code[i]) {
					*p_dest = p_symbol[n_code + p_table_off[i]];
					break;
				}
			} else {
				if(n_code >= p_a0_min_code[i] && n_code < p_a0_max_code[i]) {
					*p_dest = p_a0_symbol[n_code + p_a0_table_off[i]];
					b_had_zero_run = false;
					break;
				}
			}
			// see if it's valid code for this bit length

			if(++ i == max_CodeBitNum)
				return false; // invalid code
		}
		// decode symbol value

		if(!*p_dest) {
			uint16_t n_run_length = 0;
			for(int i = 0; i < 16; ++ i) {
				if(!n_bit_num) {
					if(p_input == p_src_end)
						return false;
					// not enough data to decompress

					n_byte = *p_input ++;
					n_bit_num = 7;
				} else
					-- n_bit_num;
				n_run_length = (n_run_length << 1) | n_byte >> 7;
				n_byte <<= 1;
				// get a single bit from input stream, add bit to code
			}
			// read the run length as raw number

			if(!n_run_length || n_run_length > size_t(p_dest_end - p_dest))
				return false;
			// the encoder never writes an empty run and a run can't exceed the declared
			// unpacked size (writing it would overflow the output buffer)

			memset(p_dest, 0, n_run_length * sizeof(uint8_t)); // decompress zero run
			p_dest += n_run_length;
			// fill run length with zeroes

			b_had_zero_run = true;
		} else
			++ p_dest;
		// decode the (raw 16-bit) run length following a zero symbol

		// todo - implement lookahead table to guess the length of the symbol based on the first byte
	}
	// decode data

	_ASSERTE(p_input == p_src_end);
	// make sure we've read the whole input buffer

	return true;
}

bool CRLE0_HuffmanCodec_2::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	std::vector<CHuff8::TFrequency> symbol_freq;
	std::vector<CHuff8::TFrequency> symbol_after_zero_run_freq;

	{
		if(!stl_ut::Resize_To_N(symbol_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)) ||
		   !stl_ut::Resize_To_N(symbol_after_zero_run_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)))
			return false;
		for(size_t i = 0, n = symbol_freq.size(); i < n; ++ i) {
			symbol_freq[i].n_symbol = i;
			symbol_after_zero_run_freq[i].n_symbol = i;
		}
		// alloc symbol frequencies

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_end; ++ p_src) {
			if(!b_had_zero_run) {
				++ symbol_freq[*p_src].n_frequency;
				_ASSERTE(symbol_freq[*p_src].n_frequency > 0);
				// encodes all symbols, including the nulls
			} else {
				_ASSERTE(*p_src != 0); // there shouldn't be a 0 after a zero run
				b_had_zero_run = false; // clear the flag
				++ symbol_after_zero_run_freq[*p_src].n_frequency;
				_ASSERTE(symbol_after_zero_run_freq[*p_src].n_frequency > 0);
				// encodes symbols occuring after zero runs
			}
			// increment symbol frequency

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				while(n_zero_run_length) {
					uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
					n_zero_run_length -= n_run_length;
					// decompose the zero run to up to 65536 repeats

					if(n_zero_run_length) {
						++ symbol_after_zero_run_freq[0].n_frequency;
						_ASSERTE(symbol_after_zero_run_freq[0].n_frequency > 0);
					}
					// in case the run of zeroes is saved as multiple chunks,
					// increase frequency of the zero symbol as well
				}
				// accumulate zero run length frequencies

				p_src = p_last_zero - 1;
				// shift behind the zero run

				b_had_zero_run = true;
				// set the context flag
			}
		}
		// calculate frequencies
	}
	// calculate symbol frequencies

	int32_t p_sym_code_table[max_CodeBitNum];
	int32_t p_sym0_code_table[max_CodeBitNum];
	if(!CHuff8::Assign_CodeWords(symbol_freq, p_sym_code_table) ||
	   !CHuff8::Assign_CodeWords(symbol_after_zero_run_freq, p_sym0_code_table))
		return false;
	// create huffman tree

	size_t n_code_table_size = max_CodeBitNum * sizeof(int32_t);
	size_t n_sym_symbol_table_size = symbol_freq.size();
	size_t n_sym0_symbol_table_size = symbol_after_zero_run_freq.size();
	size_t n_header_size = sizeof(uint32_t) + 2 * n_code_table_size +
		(n_sym0_symbol_table_size + n_sym_symbol_table_size) * sizeof(uint8_t);
	// we're going to need to store the huffman tree
	// (numbers of codes with distinct lengths and symbols)

	if(!r_t_out_buffer.Resize(n_header_size, false))
		return false;
	// make sure there's enough space for header

	uint8_t *p_output = r_t_out_buffer.p_Data(),
		*p_out_end = r_t_out_buffer.p_Data() + n_header_size;
	// output buffer (size will change dynamically)

	*(uint32_t*)p_output = uint32_t(r_t_in_buffer.n_Size());
	p_output += sizeof(uint32_t);
	// write size of decompressed stream so decompressor can work in a single pass

	memcpy(p_output, p_sym_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_sym_symbol_table_size; ++ i, ++ p_output)
		*p_output = symbol_freq[i].n_symbol;
	//
	memcpy(p_output, p_sym0_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_sym0_symbol_table_size; ++ i, ++ p_output)
		*p_output = symbol_after_zero_run_freq[i].n_symbol;
	_ASSERTE(p_output == p_out_end);
	// write numbers of codes of different lengths and associated symbols

	CHuff8::SortFrequencies_BySymbol(symbol_freq);
	CHuff8::SortFrequencies_BySymbol(symbol_after_zero_run_freq);
	// sort by symbols so we can search using lower_bound (binary search)
	// note that freq_list has symbols with zero frequency removed
	// in CreateHuffmanTree(), searching is therefore inevitable (could
	// build lookup table though)

	{
		uint8_t m_n_byte = 0;
		int m_n_bit_num = 0;
		// bit writer

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_src_end; ++ p_src) {
			_ASSERTE(p_src < p_src_end);
			const CHuff8::TFrequency &r_freq = (b_had_zero_run)?
				CHuff8::r_LookupSymbol(*p_src, symbol_after_zero_run_freq) :
				CHuff8::r_LookupSymbol(*p_src, symbol_freq);
			_ASSERTE(r_freq.n_symbol == *p_src);
			// find symbol in huffman tree

			b_had_zero_run = false;
			// clear the flag

			if(!Encode_Symbol(r_freq.n_code_word, r_freq.n_code_length,
			   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
				return false;
			// write symbol as series of bits

			_ASSERTE(p_src < p_src_end);
			_ASSERTE(p_output <= p_out_end);

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_src_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_src_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				while(n_zero_run_length) {
					uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
					n_zero_run_length -= n_run_length;
					// decompose the zero run to up to 65536 repeats

					if(!Encode_Symbol(n_run_length, 16,
					   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
						return false;
					// write raw run length

					if(n_zero_run_length) {
						{
							const CHuff8::TFrequency &r_freq_aft = 
								CHuff8::r_LookupSymbol(0, symbol_after_zero_run_freq);
							_ASSERTE(r_freq_aft.n_symbol == 0);

							if(!Encode_Symbol(r_freq_aft.n_code_word, r_freq_aft.n_code_length,
							   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
								return false;
						}
					}
					// in case the zero run length is decomposed,
					// we need to write another zero symbol first (from the after-zero-run tree!)
				}

				_ASSERTE(p_output <= p_out_end);

				p_src = p_last_zero - 1;

				b_had_zero_run = true;
			}
			_ASSERTE(p_src < p_src_end);
			// in case the symbol was zero, encode run length and skip the occurences of the symbol
		}

		if(m_n_bit_num) {
			m_n_byte <<= 8 - m_n_bit_num;
			// padd with 0-s

			if(p_output == p_out_end) {
				uint32_t n_off = uint32_t(p_output - r_t_out_buffer.p_Data());
				if(!r_t_out_buffer.Grow(1))
					return false;
				p_output = r_t_out_buffer.p_Data() + n_off;
				p_out_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
			}
			*p_output ++ = m_n_byte;
		}
		// output any leftover bits
	}
	// compress

	if(!r_t_out_buffer.Resize(uint32_t(p_output - r_t_out_buffer.p_Data()))) {
		_ASSERTE(0);
		return false;
	}
	// shrink the buffer to it's final length

	return true;
}

/*
 *								=== ~CRLE0_HuffmanCodec_2 ===
 */

/*
 *								=== CRLE0_HuffmanCodec_3 ===
 */

/**
 *	@brief decompresses data produced by CRLE0_HuffmanCodec_3::Encode()
 *
 *	The input starts with uint32_t size of the unpacked data, followed by two
 *	(code counts table, symbol table) pairs: one with 16-bit symbols (values below 256
 *	are plain bytes, values 256 + n stand for runs of n zeros) and one with 8-bit
 *	symbols following a zero run. The rest is a bit stream (most significant bit first).
 *
 *	A run of UINT16_MAX - 256 zeros does not switch to the after-zero-run tree,
 *	it is expected to be followed by another run (possibly of zero length).
 *
 *	The input is treated as untrusted: malformed code count tables and zero runs
 *	exceeding the declared unpacked size are rejected instead of accessing
 *	memory outside of the buffers.
 *
 *	@param[in] r_t_in_buffer is the compressed data
 *	@param[out] r_t_out_buffer is the buffer to be filled with the unpacked data
 *
 *	@return Returns true on success, false on failure (not enough memory,
 *		truncated or corrupt input).
 */
bool CRLE0_HuffmanCodec_3::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// input buffer pointer

	uint32_t n_unpack_length = *(uint32_t*)p_input;
	p_input += sizeof(uint32_t);
	// get size of uncompressed data

	int32_t *p_code_num;
	int n_symbol_num = 0;
	uint32_t p_min_code[max_CodeBitNum];
	uint32_t p_max_code[max_CodeBitNum];
	int32_t p_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint16_t *p_symbol;
	{
		if(size_t(p_src_end - p_input) < sizeof(int32_t) * max_CodeBitNum)
			return false;
		// compare sizes rather than pointers (pointer arithmetic past the end could wrap)

		p_code_num = (int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		size_t n_symbol_limit = size_t(p_src_end - p_input) / sizeof(uint16_t);
		if(n_symbol_limit > 65536)
			n_symbol_limit = 65536;
		// the table of 16-bit symbols must fit in the input (the size of a symbol
		// is two bytes, not one) and there can't be more symbols than 16-bit values

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_code_num[i] < 0 ||
			   size_t(p_code_num[i]) > n_symbol_limit - size_t(n_symbol_num))
				return false;
			// a negative count would wrap the unsigned code range below
			// and allow reading outside of the symbol table

			p_min_code[i] = n_cw;
			p_max_code[i] = (n_cw += p_code_num[i]);
			p_table_off[i] = n_symbol_num - p_min_code[i];
			n_symbol_num += p_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		p_symbol = (const uint16_t*)p_input;
		p_input += n_symbol_num * sizeof(uint16_t);
		// symbols are used in place (the table was checked to fit above)
	}
	// huffman tree for symbols/run lengths

	int32_t *p_a0_code_num;
	int n_a0_symbol_num = 0;
	uint32_t p_a0_min_code[max_CodeBitNum];
	uint32_t p_a0_max_code[max_CodeBitNum];
	int32_t p_a0_table_off[max_CodeBitNum]; // minimal / maximal codes for given lengths and offsets to symbol table
	const uint8_t *p_a0_symbol;
	{
		if(size_t(p_src_end - p_input) < sizeof(int32_t) * max_CodeBitNum)
			return false;
		p_a0_code_num = (int32_t*)p_input; // numbers of codes of different lengths
		p_input += sizeof(int32_t) * max_CodeBitNum;
		// get numbers of codes

		size_t n_symbol_limit = size_t(p_src_end - p_input) / sizeof(uint8_t);
		if(n_symbol_limit > 256)
			n_symbol_limit = 256;
		// the symbol table must fit in the input and there
		// can't be more symbols than there are byte values

		for(uint32_t i = 0, n_cw = 0; i < max_CodeBitNum; ++ i, n_cw <<= 1) {
			if(p_a0_code_num[i] < 0 ||
			   size_t(p_a0_code_num[i]) > n_symbol_limit - size_t(n_a0_symbol_num))
				return false;
			// reject malformed counts (see above)

			p_a0_min_code[i] = n_cw;
			p_a0_max_code[i] = (n_cw += p_a0_code_num[i]);
			p_a0_table_off[i] = n_a0_symbol_num - p_a0_min_code[i];
			n_a0_symbol_num += p_a0_code_num[i];
		}
		// calculate number of symbols, table indices and min / max code values

		p_a0_symbol = p_input;
		p_input += n_a0_symbol_num * sizeof(uint8_t);
		// symbols are used in place (the table was checked to fit above)
	}
	// huffman tree for symbols after zero runs

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	bool b_had_zero_run = false;
	const uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	for(uint8_t *p_dest = r_t_out_buffer.p_Data(); p_dest != p_dest_end;) {
		for(uint32_t i = 0, n_code = 0;;) {
			if(!n_bit_num) {
				if(p_input == p_src_end)
					return false;
				// not enough data to decompress

				n_byte = *p_input ++;
				n_bit_num = 7;
			} else
				-- n_bit_num;
			n_code = (n_code << 1) | n_byte >> 7;
			n_byte <<= 1;
			// get a single bit from input stream, add bit to code

			if(!b_had_zero_run) {
				if(n_code >= p_min_code[i] && n_code < p_max_code[i]) {
					uint16_t n_sym = p_symbol[n_code + p_table_off[i]];
					if(n_sym < 256) {
						*p_dest = uint8_t(n_sym);
						++ p_dest;
						if(!n_sym)
							b_had_zero_run = true; // just a single zero, but ...
					} else {
						uint16_t n_run_length = n_sym - 256;
						if(n_run_length > size_t(p_dest_end - p_dest))
							return false;
						// a run can't exceed the declared unpacked size
						// (writing it would overflow the output buffer)

						memset(p_dest, 0, n_run_length * sizeof(uint8_t)); // decompress zero run
						p_dest += n_run_length;
						if(n_run_length < UINT16_MAX - 256) { // !!
							b_had_zero_run = true;
							// note - in case there was UINT16_MAX - 256 repeats,
							// there will be at least one more run (maybe a run of zero length)

							// todo - this is not true, what about a single run of 65535 - 255 zeros? need to encode run of zero zeros to make this decoder work!
						}
					}
					break;
				}
			} else {
				if(n_code >= p_a0_min_code[i] && n_code < p_a0_max_code[i]) {
					*p_dest = p_a0_symbol[n_code + p_a0_table_off[i]];
					++ p_dest;
					b_had_zero_run = false;
					break;
				}
			}
			// see if it's valid code for this bit length

			if(++ i == max_CodeBitNum)
				return false; // invalid code
		}
		// decode symbol value

		// todo - implement lookahead table to guess the length of the symbol based on the first byte
	}
	// decode data

	_ASSERTE(p_input == p_src_end);
	// make sure we've read the whole input buffer

	return true;
}

/**
 *	@brief compresses a buffer using zero-run-length coding and two Huffman trees
 *
 *	The main tree has 16-bit symbols: values below 256 are plain bytes (a lone zero
 *	is coded as symbol 0), values 256 + n stand for runs of n zeros. A byte which directly
 *	follows a zero run is coded using a separate (8-bit) "after zero run" tree.
 *	Runs are split to chunks of up to UINT16_MAX - 256 zeros. As the decoder expects
 *	one more run after a chunk of exactly this length, a zero-length run (symbol 256)
 *	is written in case such a chunk is the last chunk of the run.
 *
 *	The output starts with uint32_t size of the unpacked data, followed by (code counts table,
 *	symbol table) pairs for the main tree and the after-zero-run tree (in this order),
 *	followed by the bit stream. The last byte of the bit stream is padded with zero bits.
 *
 *	@param[in] r_t_in_buffer is the data to be compressed
 *	@param[out] r_t_out_buffer is the buffer to be filled with the compressed data
 *
 *	@return Returns true on success, false on failure (not enough memory
 *		or failure to build the code words).
 */
bool CRLE0_HuffmanCodec_3::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	std::vector<CHuff16::TFrequency> symbol_freq;
	std::vector<CHuff8::TFrequency> symbol_after_zero_run_freq;

	{
		if(!stl_ut::Resize_To_N(symbol_freq,
		   1 << (8 * sizeof(uint8_t)), CHuff16::TFrequency(0)) ||
		   !stl_ut::Resize_To_N(symbol_after_zero_run_freq,
		   1 << (8 * sizeof(uint8_t)), CHuff8::TFrequency(0)))
			return false;
		for(size_t i = 0, n = symbol_freq.size(); i < n; ++ i) {
			symbol_freq[i].n_symbol = i;
			symbol_after_zero_run_freq[i].n_symbol = i;
		}
		// alloc symbol frequencies

		std::map<uint16_t, size_t> run_length_set;
		// histogram of zero run chunk lengths (only the chunks that aren't a single zero)

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_end; ++ p_src) {
			if(b_had_zero_run) {
				_ASSERTE(*p_src != 0); // there shouldn't be a 0 after a zero run
				b_had_zero_run = false; // clear the flag
				++ symbol_after_zero_run_freq[*p_src].n_frequency;
				_ASSERTE(symbol_after_zero_run_freq[*p_src].n_frequency > 0);
				// encodes symbols occuring after zero runs
			} else if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				b_had_zero_run = true;
				// set the context flag (even a single zero counts as a run)

				try {
					while(n_zero_run_length) {
						uint16_t n_run_length = uint16_t(min(n_zero_run_length,
							size_t(UINT16_MAX) - 256)); // !!
						n_zero_run_length -= n_run_length;
						// decompose the zero run to chunks of up to UINT16_MAX - 256 repeats

						if(n_run_length > 1) {
							++ run_length_set[n_run_length];
							// increment run length frequency (a missing entry starts at zero)

							if(n_run_length == UINT16_MAX - 256 && !n_zero_run_length) {
								++ run_length_set[0];
								// increment run length frequency zero
							}
							// in case the last run length was UINT16_MAX - 256,
							// force one more zero-length run
						} else {
							++ symbol_freq[0].n_frequency;
							_ASSERTE(symbol_freq[0].n_frequency > 0);
							// a single zero is encoded as an ordinary symbol
						}
					}
				} catch(std::bad_alloc&) {
					return false;
				}
				// accumulate zero run length frequencies

				p_src = p_last_zero - 1;
				// shift behind the zero run
			} else {
				++ symbol_freq[*p_src].n_frequency;
				_ASSERTE(symbol_freq[*p_src].n_frequency > 0);
				// encodes nonzero symbols
			}
		}
		// calculate frequencies

		_ASSERTE(symbol_freq.size() == 256);
		if(!stl_ut::Resize_To_N(symbol_freq, 256 + run_length_set.size(),
		   CHuff16::TFrequency(0)))
			return false;
		std::vector<CHuff16::TFrequency>::iterator p_freq_it =
			symbol_freq.begin() + 256;
		for(std::map<uint16_t, size_t>::const_iterator p_rlf_it = run_length_set.begin(),
		   p_end_it = run_length_set.end(); p_rlf_it != p_end_it; ++ p_rlf_it, ++ p_freq_it) {
			uint16_t n_run_length = (*p_rlf_it).first;
			_ASSERTE(n_run_length != 1); // runs of a single zero are encoded differently
			// (length 0 is legal here, it is the forced zero-length run)
			size_t n_frequency = (*p_rlf_it).second;
			*p_freq_it = CHuff16::TFrequency(256 + n_run_length, n_frequency);
		}
		// copy set of frequencies to the list of frequecies, run of n zeros is symbol 256 + n
	}
	// calculate symbol frequencies

	int32_t p_sym_code_table[max_CodeBitNum];
	int32_t p_sym0_code_table[max_CodeBitNum];
	if(!CHuff16::Assign_CodeWords(symbol_freq, p_sym_code_table) ||
	   !CHuff8::Assign_CodeWords(symbol_after_zero_run_freq, p_sym0_code_table))
		return false;
	// create huffman tree

	size_t n_code_table_size = max_CodeBitNum * sizeof(int32_t);
	size_t n_sym_symbol_table_size = symbol_freq.size();
	size_t n_sym0_symbol_table_size = symbol_after_zero_run_freq.size();
	size_t n_header_size = sizeof(uint32_t) + 2 * n_code_table_size +
		n_sym0_symbol_table_size * sizeof(uint8_t) +
		n_sym_symbol_table_size * sizeof(uint16_t);
	// we're going to need to store the huffman tree
	// (numbers of codes with distinct lengths and symbols)

	if(!r_t_out_buffer.Resize(n_header_size, false))
		return false;
	// make sure there's enough space for header

	uint8_t *p_output = r_t_out_buffer.p_Data(),
		*p_out_end = r_t_out_buffer.p_Data() + n_header_size;
	// output buffer (size will change dynamically)

	*(uint32_t*)p_output = uint32_t(r_t_in_buffer.n_Size());
	p_output += sizeof(uint32_t);
	// write size of decompressed stream so decompressor can work in a single pass

	memcpy(p_output, p_sym_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_sym_symbol_table_size; ++ i, p_output += sizeof(uint16_t))
		*(uint16_t*)p_output = symbol_freq[i].n_symbol;
	//
	memcpy(p_output, p_sym0_code_table, n_code_table_size);
	p_output += n_code_table_size;
	for(size_t i = 0; i < n_sym0_symbol_table_size; ++ i, ++ p_output)
		*p_output = symbol_after_zero_run_freq[i].n_symbol;
	_ASSERTE(p_output == p_out_end);
	// write numbers of codes of different lengths and associated symbols

	CHuff16::SortFrequencies_BySymbol(symbol_freq);
	CHuff8::SortFrequencies_BySymbol(symbol_after_zero_run_freq);
	// sort by symbols so we can search using lower_bound (binary search)
	// note that freq_list has symbols with zero frequency removed
	// in CreateHuffmanTree(), searching is therefore inevitable (could
	// build lookup table though)

	{
		uint8_t m_n_byte = 0;
		int m_n_bit_num = 0;
		// bit writer

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_src_end; ++ p_src) {
			_ASSERTE(p_src < p_src_end);
			if(*p_src) {
				int n_bit_num, n_value;
				if(!b_had_zero_run) {
					const CHuff16::TFrequency &r_freq =
						CHuff16::r_LookupSymbol(*p_src, symbol_freq);
					_ASSERTE(r_freq.n_symbol == *p_src);
					// find symbol in huffman tree

					n_bit_num = r_freq.n_code_length;
					n_value = r_freq.n_code_word;
				} else {
					const CHuff8::TFrequency &r_freq =
						CHuff8::r_LookupSymbol(*p_src, symbol_after_zero_run_freq);
					_ASSERTE(r_freq.n_symbol == *p_src);
					// find symbol in huffman tree

					b_had_zero_run = false;
					// clear the flag

					n_bit_num = r_freq.n_code_length;
					n_value = r_freq.n_code_word;
				}
				// find symbol in the huffman tree

				// todo - remove adding escape sequences from long zero runs, simply detect uint16_max sequence in the receiver and read the next symbol

				if(!Encode_Symbol(n_value, n_bit_num, m_n_byte,
				   m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
					return false;
				// write symbol as series of bits

				_ASSERTE(p_src < p_src_end);
				_ASSERTE(p_output <= p_out_end);
			} else {
				_ASSERTE(!b_had_zero_run);

				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_src_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_src_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				b_had_zero_run = true;
				// had a zero run (even a single zero counts as a run)

				while(n_zero_run_length) {
					uint16_t n_run_length = uint16_t(min(n_zero_run_length,
						size_t(UINT16_MAX) - 256)); // !!
					n_zero_run_length -= n_run_length;
					// decompose the zero run to chunks of up to UINT16_MAX - 256 repeats

					uint16_t n_encode = (n_run_length != 1)? n_run_length + 256 : 0;
					const CHuff16::TFrequency &r_freq0 =
						CHuff16::r_LookupSymbol(n_encode, symbol_freq);
					_ASSERTE(r_freq0.n_symbol == n_encode);
					// find run length (or the single zero symbol) in the huffman tree

					if(!Encode_Symbol(r_freq0.n_code_word, r_freq0.n_code_length,
					   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
						return false;
					// write symbol as series of bits

					if(n_run_length == UINT16_MAX - 256 && !n_zero_run_length) {
						const CHuff16::TFrequency &r_freq00 =
							CHuff16::r_LookupSymbol(256, symbol_freq);
						_ASSERTE(r_freq00.n_symbol == 256);
						// find the zero-length run (symbol 256 + 0) in the huffman tree

						if(!Encode_Symbol(r_freq00.n_code_word, r_freq00.n_code_length,
						   m_n_byte, m_n_bit_num, p_output, p_out_end, r_t_out_buffer))
							return false;
						// write symbol as series of bits
					}
					// in case the last run length was UINT16_MAX - 256, force one more
					// zero-length run (must test the remaining length of the run, the same
					// as the frequency counting above does, otherwise the symbol is never
					// written and the decoder gets out of sync)
				}

				_ASSERTE(p_output <= p_out_end);

				p_src = p_last_zero - 1;
			}
			_ASSERTE(p_src < p_src_end);
			// in case the symbol was zero, encode run length and
			// skip the occurences of the symbol
		}

		if(m_n_bit_num) {
			m_n_byte <<= 8 - m_n_bit_num;
			// padd with 0-s

			if(p_output == p_out_end) {
				uint32_t n_off = uint32_t(p_output - r_t_out_buffer.p_Data());
				if(!r_t_out_buffer.Grow(1))
					return false;
				p_output = r_t_out_buffer.p_Data() + n_off;
				p_out_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
			}
			*p_output ++ = m_n_byte;
		}
		// output any leftover bits
	}
	// compress

	if(!r_t_out_buffer.Resize(uint32_t(p_output - r_t_out_buffer.p_Data()))) {
		_ASSERTE(0);
		return false;
	}
	// shrink the buffer to it's final length

	// todo - this probably does something else than BW94 did. they model symbols as 1) 255 symbols + all the different runs of nulls and 2) 256 possible symbols that occur after a null

	return true;
}

/*
 *								=== ~CRLE0_HuffmanCodec_3 ===
 */

/*
 *								=== CInversionFrequenciesCodec ===
 */

/**
 *	@brief comparison functor, orders symbols by their frequency in ascending order
 *
 *	The functor only keeps a pointer to the frequency table, the table itself
 *	must outlive the functor and must be valid for every symbol that is compared.
 */
class CInversionFrequenciesCodec::CSortAsc {
protected:
	const uint32_t *m_p_freq_list; /**< @brief frequency table, indexed by symbol value */

public:
	/**
	 *	@brief default constructor
	 *	@param[in] p_symbol_freq is the frequency table, indexed by symbol value
	 */
	CSortAsc(const uint32_t *p_symbol_freq)
		:m_p_freq_list(p_symbol_freq)
	{}

	/**
	 *	@brief less-than comparison of the frequencies of two symbols
	 *
	 *	@param[in] n_left is the first symbol
	 *	@param[in] n_right is the second symbol
	 *
	 *	@return Returns true if n_left is strictly less frequent than n_right.
	 */
	inline bool operator ()(uint8_t n_left, uint8_t n_right) const
	{
		const uint32_t n_left_freq = m_p_freq_list[n_left];
		const uint32_t n_right_freq = m_p_freq_list[n_right];
		// look up both frequencies

		return n_left_freq < n_right_freq;
	}
};

/**
 *	@brief comparison functor, orders symbols by their frequency in descending order
 *
 *	The functor only keeps a pointer to the frequency table, the table itself
 *	must outlive the functor and must be valid for every symbol that is compared.
 */
class CInversionFrequenciesCodec::CSortDesc {
protected:
	const uint32_t *m_p_freq_list; /**< @brief frequency table, indexed by symbol value */

public:
	/**
	 *	@brief default constructor
	 *	@param[in] p_symbol_freq is the frequency table, indexed by symbol value
	 */
	CSortDesc(const uint32_t *p_symbol_freq)
		:m_p_freq_list(p_symbol_freq)
	{}

	/**
	 *	@brief greater-than comparison of the frequencies of two symbols
	 *
	 *	@param[in] n_left is the first symbol
	 *	@param[in] n_right is the second symbol
	 *
	 *	@return Returns true if n_left is strictly more frequent than n_right.
	 */
	inline bool operator ()(uint8_t n_left, uint8_t n_right) const
	{
		const uint32_t n_left_freq = m_p_freq_list[n_left];
		const uint32_t n_right_freq = m_p_freq_list[n_right];
		// look up both frequencies

		return n_right_freq < n_left_freq;
	}
};

/*
 *	static bool CInversionFrequenciesCodec::Decode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer)
 *		- decodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CInversionFrequenciesCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// minimal size of input

	const uint8_t *p_src = r_t_in_buffer.p_Data();
	const uint8_t *p_end = p_src + r_t_in_buffer.n_Size();

	uint32_t n_output_size = *(uint32_t*)p_src;
	p_src += sizeof(uint32_t);
	// read output size

	if(!r_t_out_buffer.Resize(n_output_size, false))
		return false;
	// allocate output buffer

	memset(r_t_out_buffer.p_Data(), 0xff, r_t_out_buffer.n_Size());
	// set output to contain highest symbols

	for(int i = 0; i < 255; ++ i) { // symbol 255 is already filled in output buffer by memset() above
		uint32_t n_count = 0;
		for(int n_char_size = 0;; ++ n_char_size) {
			if(p_src == p_end || n_char_size > 5)
				return false;
			uint8_t n_char = *p_src ++;
			n_count |= n_char & 0x7f;
			if(!(n_char & 0x80))
				break;
			n_count <<= 7;
		}
		// read escaped value

		if(!n_count)
			continue;
		// no occurencies of this symbol

		uint8_t *p_dest = r_t_out_buffer.p_Data();
		uint8_t *p_dest_end = p_dest + r_t_out_buffer.n_Size();
		// get destination pointers

		uint8_t n_decoded_char = i;
		do {
			uint32_t n_distance = 0;
			for(int n_char_size = 0;; ++ n_char_size) {
				if(p_src == p_end || n_char_size > 5)
					return false;
				uint8_t n_char = *p_src ++;
				n_distance |= n_char & 0x7f;
				if(!(n_char & 0x80))
					break;
				n_distance <<= 7;
			}
			// read escaped value

			_ASSERTE(p_dest <= p_dest_end);
			while(n_distance && p_dest != p_dest_end) {
				if(*p_dest >= n_decoded_char)
					-- n_distance;
				++ p_dest;
			}
			if(p_dest == p_dest_end)
				return false;
			while(*p_dest < n_decoded_char) {
				if(++ p_dest == p_dest_end)
					return false;
			}
			// skip to next destination, do not count already-filled values

			if(p_dest == p_dest_end)
				return false;
			// reached end of output

			*p_dest ++ = n_decoded_char;
			// write character to it's position
		} while(-- n_count);
		// fill-in character occurences (rather slow, but memory-efficient)
	}
	// read inversion frequencies, reconstruct positions

	if(p_src + sizeof(uint16_t) > p_end)
		return false;
	int n_perm_table_size = *(int16_t*)p_src;
	p_src += sizeof(uint16_t);
	// read permutation table size

	int n_perm_table_offset = 0;
	if(n_perm_table_size) {
		if(p_src == p_end)
			return false;
		n_perm_table_offset = *p_src ++;
	}
	// read permutaton table offset, if present

	const uint8_t *p_perm_table = p_src;
	if(p_src + n_perm_table_size != p_end)
		return false;
	// get permutatuion table, apply offset

	if(n_perm_table_size) {
		uint8_t *p_dest = r_t_out_buffer.p_Data();
		uint8_t *p_dest_end = p_dest + r_t_out_buffer.n_Size();
		// get destination pointers

		for(; p_dest != p_dest_end; ++ p_dest) {
			int n_index = int(*p_dest) - n_perm_table_offset;
			if(n_index < 0 || n_index >= n_perm_table_size)
				return false;
			*p_dest = p_perm_table[n_index];
		}
	}
	// apply permutation table

	return true;
}

/*
 *	static bool CInversionFrequenciesCodec::Encode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer, int n_permutation_type = sort_NoSort)
 *		- encodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CInversionFrequenciesCodec::Encode(const TBuffer &r_t_in_buffer,
	TBuffer &r_t_out_buffer, int n_permutation_type)
{
	r_t_out_buffer.Resize(r_t_in_buffer.n_Size(), false);
	// preallocate some space in output

	if(!r_t_out_buffer.Resize(sizeof(uint32_t), false))
		return false;
	*(uint32_t*)r_t_out_buffer.p_Data() = r_t_in_buffer.n_Size();
	// write size of uncompressed data for easier decoding

	uint32_t p_frequency[256] = {0};
	{
		const uint8_t *p_src = r_t_in_buffer.p_Data();
		const uint8_t *p_end = p_src + r_t_in_buffer.n_Size();
		for(; p_src != p_end; ++ p_src) {
			++ p_frequency[*p_src];
			_ASSERTE(p_frequency[*p_src] > 0); // watch out for overflow
		}
	}
	// count occurencies of all symbols in input sequence

	uint8_t p_perm_table[256];
	for(int i = 0; i < 256; ++ i)
		p_perm_table[i] = i;
	if(n_permutation_type != sort_NoSort) {
		if(n_permutation_type == sort_FreqAscending)
			std::sort(p_perm_table, p_perm_table + 256, CSortAsc(p_frequency));
		else /*if(n_permutation_type == sort_FreqDescending)*/ {
			_ASSERTE(n_permutation_type == sort_FreqDescending);
			std::sort(p_perm_table, p_perm_table + 256, CSortDesc(p_frequency));
		}
		// create permutation table
	}
	uint8_t p_inv_perm_table[256];
	for(int i = 0; i < 256; ++ i)
		p_inv_perm_table[p_perm_table[i]] = i;
	// build permutation table

	for(int i = 0; i < 255; ++ i) { // don't have to code the last symbol
		uint8_t n_encoded_char = i;
		uint32_t n_count = p_frequency[p_perm_table[i]];
		if(!Emit(r_t_out_buffer, n_count))
			return false;
		// write frequency of the symbol

		if(!n_count)
			continue;
		// are there occurences of this symbol?

		uint32_t n_distance = 0;
		for(const uint8_t *p_src = r_t_in_buffer.p_Data();; ++ p_src) {
			_ASSERTE(p_src < r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size());
			uint8_t n_char = p_inv_perm_table[*p_src]; // apply inverse permutation here, we need to process symbols in ordered fashion
			if(n_char == n_encoded_char) {
				if(!Emit(r_t_out_buffer, n_distance))
					return false;
				// emit distance

				n_distance = 0;
				// reset distance

				if(!(-- n_count))
					break;
				// are there more occurences?
			} else if(n_char > n_encoded_char) // we need to ensure symbols are compared correct (inverse permutation table)
				++ n_distance;
		}
		// write 
	}
	// write inverse frequencies of symbols

	if(n_permutation_type != sort_NoSort) {
		// t_odo - this doesn't work. think about it

		int n_perm_table_size = 0, n_perm_table_off = 0;
		int b = 0, e = 256;
		while(b < e && !p_frequency[p_perm_table[b]])
			++ b;
		while(e - 1 >= b && !p_frequency[p_perm_table[e - 1]])
			-- e;
		n_perm_table_size = e - b; // even 0
		n_perm_table_off = b;
		// crop permutation table

		int n_perm_tab_space = sizeof(uint16_t) + ((n_perm_table_size)?
			n_perm_table_size + sizeof(uint8_t) : 0);
		// calculate space

		if(!r_t_out_buffer.Grow(n_perm_tab_space))
			return false;
		uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
		uint8_t *p_dest = p_dest_end - n_perm_tab_space;
		// resize

		*(uint16_t*)p_dest = n_perm_table_size;
		p_dest += sizeof(uint16_t);
		// write size

		if(n_perm_table_size) {
			*p_dest ++ = n_perm_table_off;
			memcpy(p_dest, p_perm_table + n_perm_table_off,
				n_perm_table_size * sizeof(uint8_t));
		}
		// write offset + contents

		_ASSERTE(p_dest + n_perm_table_size == p_dest_end);
	} else {
		if(!r_t_out_buffer.Grow(2))
			return false;
		uint8_t *p_dest = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size() - 2;
		*(uint16_t*)p_dest = 0;
		// write zero (perm table size)
	}
	// write permutation table

	return true;
}

/**
 *	@brief appends a single escaped value to the end of the output buffer
 *
 *	The value is split to groups of 7 bits and written with the most significant
 *	(nonzero) group first. All the bytes but the last one have the highest bit (0x80)
 *	set. Values up to 0x7f therefore take a single byte, values up to 0x3fff two bytes,
 *	values up to 0x1fffff three bytes, values up to 0xfffffff four bytes and anything
 *	above takes five bytes.
 *
 *	@param[in,out] r_t_out_buffer is the output buffer (it is grown by 1 to 5 bytes)
 *	@param[in] n_value is the value to be written
 *
 *	@return Returns true on success, false on failure (the buffer could not be grown).
 */
inline bool CInversionFrequenciesCodec::Emit(TBuffer &r_t_out_buffer, uint32_t n_value)
{
	int n_group_num = 1;
	for(uint32_t n_remainder = n_value >> 7; n_remainder; n_remainder >>= 7)
		++ n_group_num;
	// count the 7-bit groups needed to store the value (1 to 5)

	if(!r_t_out_buffer.Grow(n_group_num))
		return false;
	uint8_t *p_dest = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size() - n_group_num;
	// resize output, get pointer to the newly added bytes

	for(int n_shift = 7 * (n_group_num - 1); n_shift > 0; n_shift -= 7)
		*p_dest ++ = uint8_t(((n_value >> n_shift) & 0x7f) | 0x80);
	// write the leading groups, flagged with the continuation bit

	*p_dest = uint8_t(n_value & 0x7f);
	// write the last group, without the continuation bit

	// t_odo - feeling guilty about this one.
	// todo - this should be encoded using huffman as values, not as bytes
	// todo - in the preprint this is saved in a rather complicated way, involving breaking the values down to exponent / mantissa
	// todo - exponential part from RLE-EXP is encoded differently

	return true;
}

/*
 *								=== ~CInversionFrequenciesCodec ===
 */
