/*
								+----------------------------------+
								|                                  |
								|  *** Basic compression algs ***  |
								|                                  |
								|   Copyright  -tHE SWINe- 2008   |
								|                                  |
								|            Compress.h            |
								|                                  |
								+----------------------------------+
*/

#pragma once
#ifndef __SIMPLE_COMPRESSION_INCLUDED
#define __SIMPLE_COMPRESSION_INCLUDED

/**
 *	@file Compress.h
 *	@author -tHE SWINe-
 *	@date 2008
 *	@brief Simple experimental data compression framework, focused on Burrows-Wheeler methods.
 *
 *	@date 2007-02-25
 *
 *	this is the first beta version of the file. todo - rewrite TBuffer so it can realloc
 *	itself the way std::vector can and try to avoid any (re)allocations while (un)packing
 *
 *	@date 2008-03-13
 *
 *	TBuffer was rewritten as requested, fixed some signed / unsigned mismatches for gcc
 *
 *	@date 2008-11-20
 *
 *	TBuffer unit is no longer unsigned char, but uint8_t instead, this should avoid
 *	incompatibility with some extended character encoding in newer versions of visual studio
 *
 *	@date 2009-05-23
 *
 *	removed all instances of std::vector::reserve and replaced them by stl_ut::Reserve_*
 *
 *	@date 2009-10-08
 *
 *	slightly improved CHuffmanCodec, fixed bug in canonical huffman code generation for
 *	trees where there are no symbols of length n, but there are both shorter and longer
 *	symbols (codes got shifted too much, got too long, had to be regenerated). this was
 *	hurting compression and so it had to be fixed, but the bug was also in decompression
 *	code, so this sadly breaks backward compatibility.
 *
 *	@date 2009-10-11
 *
 *	replaced stl container ::resize() by stl_ut::Resize_*() to avoid unhandled
 *	std::bad_alloc
 *
 *	optimized CBurrowsWheelerTransform::CWrapMemCmp() wrap-around memory comparator by
 *	calculating lengths of blocks that do not wrap and comparing them in shorter loops
 *
 *	added __BWT_ENABLE_THREADED_ENCODE macro
 *
 *	@date 2009-10-20
 *
 *	fixed some warnings when compiling under VC 2005, implemented "Security
 *	Enhancements in the CRT" for VC 2008. compare against MyProjects_2009-10-19_
 *
 *	@date 2012-06-19
 *
 *	Moved multiple inclusion guard before file documentation comment.
 *
 *	@date 2012-06-30
 *
 *	Changed the function for symbol lookup in CHuffmanTree to use a predicate instead
 *	of defining yet another comparison function for comparing symbols with frequencies.
 *
 */

#include "Buffer.h"
#include "MinMax.h"

/**
 *	@def __BWT_ENABLE_THREADED_ENCODE
 *
 *	@brief enables CBurrowsWheelerTransform::ThreadedEncode()
 *
 *	This enables a multi-threaded implementation of the Burrows-Wheeler transform. While
 *		it may be faster than the single-threaded implementation, it's far from perfect
 *		(threads are not loaded equally). Therefore, for high-performance
 *		implementations, parallelism should be achieved another way.
 */
//#define __BWT_ENABLE_THREADED_ENCODE

/**
 *	@brief simple BWT encoder / decoder
 *
 *	Simple Burrows-Wheeler transform implementation. Uses indices instead
 *		of string copies to minimize memory usage. Also uses somewhat optimized
 *		version of memcpy, only available under windows (it is written in MASM).
 *
 *	@todo Create GAS port of assembly part, so this would be same fast in linux.
 */
class CBurrowsWheelerTransform {
private:
	// helper classes used by the implementation; they are only
	// forward-declared here, their definitions are not part of this header
	class CWrapMemCmp;
	class CIota;
	class CSorter;
	class CMerger;

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_src, outputs to r_t_dest.
	 *
	 *	@param[in] r_t_src is source data buffer
	 *	@param[out] r_t_dest is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Decode(const TBuffer &r_t_src, TBuffer &r_t_dest);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_src, outputs to r_t_dest.
	 *
	 *	@param[in] r_t_src is source data buffer
	 *	@param[out] r_t_dest is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note This doesn't work (is going to return false) with empty input buffer.
	 */
	static bool Encode(const TBuffer &r_t_src, TBuffer &r_t_dest);

#ifdef __BWT_ENABLE_THREADED_ENCODE

	/**
	 *	@brief multi-threaded encoding function
	 *
	 *	Encodes data from r_t_src, outputs to r_t_dest, works in parallel. While
	 *		this may be faster than single-threaded implementation, it's far from
	 *		perfect (threads are not loaded equally). Therefore, for high-performance
	 *		implementations, parallelism should be achieved another way.
	 *
	 *	@param[in] r_t_src is source data buffer
	 *	@param[out] r_t_dest is destination data buffer (original contents will be lost)
	 *	@param[in] n_thread_num is number of worker threads (must be power of two).
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note Doesn't work (is going to return false) with empty input buffer.
	 *	@note This only gets compiled if __BWT_ENABLE_THREADED_ENCODE macro
	 *		is defined (not by default).
	 */
	static bool ThreadedEncode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_thread_num);

#endif // __BWT_ENABLE_THREADED_ENCODE
};

/**
 *	@brief basic move to front transformation implementation
 *
 *	Implements MTF as originally proposed, and it's variant MTF-1.
 */
class CMoveToFrontTransform {
public:
	/**
	 *	@brief MTF algorithm names (values for the n_algorithm arguments)
	 */
	enum {
		algo_MTF,	/**< original MTF */
		algo_MTF_1	/**< MTF-1 */
	};

	/**
	 *	@brief in-place decoding function
	 *
	 *	Decodes data in r_t_buffer.
	 *
	 *	@param[in,out] r_t_buffer is both source and destination data buffer
	 *	@param[in] n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
	 *
	 *	@note This function has no return value (it is declared void),
	 *		failure is not reported to the caller.
	 *	@note Using different algorithm than the one used when encoding yields
	 *		different results.
	 */
	static void Decode(TBuffer &r_t_buffer, int n_algorithm = algo_MTF);

	/**
	 *	@brief in-place encoding function
	 *
	 *	Encodes data in r_t_buffer.
	 *
	 *	@param[in,out] r_t_buffer is both source and destination data buffer
	 *	@param[in] n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
	 *
	 *	@note This function has no return value (it is declared void),
	 *		failure is not reported to the caller.
	 */
	static void Encode(TBuffer &r_t_buffer, int n_algorithm = algo_MTF);

	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_src, outputs to r_t_dest.
	 *
	 *	@param[in] r_t_src is source data buffer
	 *	@param[out] r_t_dest is destination data buffer (original contents will be lost)
	 *	@param[in] n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note Using different algorithm than the one used when encoding yields
	 *		different results.
	 */
	static bool Decode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm = algo_MTF);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_src, outputs to r_t_dest.
	 *
	 *	@param[in] r_t_src is source data buffer
	 *	@param[out] r_t_dest is destination data buffer (original contents will be lost)
	 *	@param[in] n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm = algo_MTF);

private:
	// internal workers taking separate source and destination buffers; they are
	// only declared here (presumably shared by the public overloads - TODO confirm
	// against the implementation file)
	static void _Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm);
	static void _Decode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm);
};

/**
 *	@brief simple run length coder
 *
 *	Implementation of RLE, optimized for packing MTF outputs (compression flag bit is LSB so,
 *		in theory, symbols with lower values are generated, in hope not to disturb symbol
 *		probabilities after MTF too much). It actually works with the Calgary corpus.
 */
class CRunLengthCodec {
private:
	/**
	 *	@brief RLE configuration enums
	 */
	enum {
		min_RunLength = 3	/**< @brief minimal run length setting */
	};

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_src, outputs to r_t_dest.
	 *
	 *	@param[in] r_t_src is source data buffer
	 *	@param[out] r_t_dest is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Decode(const TBuffer &r_t_src, TBuffer &r_t_dest);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_src, outputs to r_t_dest.
	 *
	 *	@param[in] r_t_src is source data buffer
	 *	@param[out] r_t_dest is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Encode(const TBuffer &r_t_src, TBuffer &r_t_dest);
};

/**
 *	@brief modified dual-stream run length coder
 *
 *	Simple RLE, optimized for packing IF outputs (run lengths are stored in a second buffer).
 *		This implementation is similar to the RLE-EXP algorithm.
 */
class CModifiedRLECodec {
public:
	/**
	 *	@brief RLE configuration enums
	 */
	enum {
		n_min_run_length = 3 /**< @brief minimal run length to be encoded as compressed data */
	};

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_src, r_t_src_runs and outputs to r_t_dest.
	 *
	 *	@param[in] r_t_src is source data buffer, containing encoded symbols
	 *	@param[in] r_t_src_runs is source data buffer, containing run lengths
	 *	@param[out] r_t_dest is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Decode(const TBuffer &r_t_src, const TBuffer &r_t_src_runs, TBuffer &r_t_dest);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_src, outputs to r_t_dest and r_t_dest_runs.
	 *
	 *	@param[in] r_t_src is source data buffer
	 *	@param[out] r_t_dest is destination data buffer, containing encoded symbols
	 *		(original contents will be lost)
	 *	@param[out] r_t_dest_runs is destination data buffer, containing run lengths
	 *		(original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, TBuffer &r_t_dest_runs);
};

/**
 *	@brief a simple huffman tree template; features simple interface for common cases,
 *		and static interface for more advanced use
 *
 *	@param[in] CSymbol is encoded symbol data type, it should be an unsigned integer
 *	@param[in] n_max_code_bit_num is maximal length of huffman code
 *
 *	@note This facilitates encoding only (decoding is supposed to be very simple).
 */
template <class CSymbol = uint8_t, const int n_max_code_bit_num = 16>
class CHuffmanTree {
public:
	/**
	 *	@brief configuration, stored as enum
	 */
	enum {
		max_CodeBitNum = n_max_code_bit_num /**< @brief maximal length of huffman code */
	};

	typedef uint32_t _TyCodeWord; /**< @brief code word storage */
	typedef CSymbol _TySymbol; /**< @brief symbol data type */
	typedef size_t _TyFrequency; /**< @brief symbol frequency data type */

	/**
	 *	@brief structure, holding symbol, its frequency and eventually also its code
	 *
	 *	@note This is implicitly convertible both from and to _TySymbol; the symbol
	 *		lookup predicate b_LowerSymbol() relies on both conversions.
	 */
	struct TFrequency {
		_TySymbol n_symbol; /**< @brief symbol value (valid for leaf nodes only) */
		_TyFrequency n_frequency; /**< @brief number of occurrences of the symbol */

		_TyCodeWord n_code_word; /**< @brief assigned code word (0 until assigned) */
		int n_code_length; /**< @brief length of the code word, in bits (-1 until assigned) */

		/**
		 *	@brief constructor; creates a symbol with zero frequency and no code
		 *	@param[in] n_sym is symbol value
		 */
		inline TFrequency(_TySymbol n_sym)
			:n_symbol(n_sym), n_frequency(0), n_code_word(0), n_code_length(-1)
		{}

		/**
		 *	@brief constructor; creates a symbol with given frequency and no code
		 *
		 *	@param[in] n_sym is symbol value
		 *	@param[in] n_freq is symbol frequency
		 */
		inline TFrequency(_TySymbol n_sym, _TyFrequency n_freq)
			:n_symbol(n_sym), n_frequency(n_freq), n_code_word(0), n_code_length(-1)
		{}

		inline bool operator <(const TFrequency &r_t_freq) const // frequency comparison (reversed, sorts by descending frequency)
		{
			return n_frequency > r_t_freq.n_frequency;
		}

		inline bool operator ==(_TySymbol n_sym) const // symbol equality comparison
		{
			return n_symbol == n_sym;
		}

		inline bool operator <(_TySymbol n_sym) const // symbol ordering comparison
		{
			return n_symbol < n_sym;
		}

		inline operator _TySymbol() const // get symbol
		{
			return n_symbol;
		}
	};

protected:
	/**
	 *	@brief huffman tree node; leaf nodes have both child pointers null
	 */
	struct TNode {
		_TyFrequency n_frequency; /**< @brief frequency of the symbol or sum of children frequencies */
		const TNode *p_left, *p_right; /**< @brief children (both null or both non-null) */

		inline TNode(size_t n_freq = 0, const TNode *p_l = 0, const TNode *p_r = 0)
			:n_frequency(n_freq), p_left(p_l), p_right(p_r)
		{}

		inline bool b_Leaf() const
		{
			_ASSERTE(!p_left == !p_right);
			return !p_left; // && !p_right
		}

		inline bool operator <(const TNode &r_t_node) const // frequency comparison (reversed, sorts by descending frequency)
		{
			return n_frequency > r_t_node.n_frequency;
		}

		static inline bool CompareFreq(const TNode *p_a, const TNode *p_b)
		{
			return *p_a < *p_b;
		}

		static inline const TNode *p_Ref(const TNode &r_t_node)
		{
			return &r_t_node;
		}
	};

	/**
	 *	@brief function object for level-by-level tree traversal; counts leaf
	 *		nodes and appends children of the non-leaf nodes to the destination list
	 */
	class CGoUp {
	protected:
		std::vector<const TNode*> &m_r_dest; /**< @brief list of nodes of the next tree level */
		int32_t &m_r_n_leaf_counter; /**< @brief counter of leaf nodes at the current tree level */

	public:
		inline CGoUp(std::vector<const TNode*> &r_dest, int32_t &r_n_leaf_counter)
			:m_r_dest(r_dest), m_r_n_leaf_counter(r_n_leaf_counter)
		{}

		inline void operator ()(const TNode *p_node)
		{
			if(p_node->b_Leaf())
				++ m_r_n_leaf_counter;
			else {
				_ASSERTE(m_r_dest.capacity() >= m_r_dest.size() + 2);
				// the destination is preallocated by the caller, this must not reallocate

				m_r_dest.push_back(p_node->p_left);
				m_r_dest.push_back(p_node->p_right);
			}
		}
	};

#ifdef _DEBUG
	/**
	 *	@brief debugging predicate; finds the first node that has a greater
	 *		frequency than its predecessor (queues are kept in descending order)
	 */
	class CFindUnsorted {
	protected:
		const TNode *m_p_last;

	public:
		inline CFindUnsorted(const TNode *p_last)
			:m_p_last(p_last)
		{}

		inline bool operator ()(const TNode *p_next)
		{
			if(*p_next < *m_p_last)
				return true; // unsorted
			m_p_last = p_next;
			return false;
		}

		static inline bool b_IsSorted(std::vector<const TNode*> &r_queue)
		{
			return r_queue.size() < 2 || std::find_if(r_queue.begin() + 1,
				r_queue.end(), CFindUnsorted(r_queue.front())) == r_queue.end();
		}
	};
#endif // _DEBUG

protected:
	std::vector<TFrequency> m_freq_list; /**< @brief list of symbols, their frequencies and codes */
	int32_t m_p_code_num[max_CodeBitNum]; /**< @brief numbers of codes of each length (entry j counts codes of j + 1 bits) */

public:
	/**
	 *	@brief calculates symbol frequencies from a sample of input data
	 *
	 *	@param[in] p_begin is the first element of input data
	 *	@param[in] p_end is one past last element of input data
	 *
	 *	@return Returns true on success, false on failure (not enough memory).
	 *
	 *	@note This allocates space for all the symbols
	 *		(may be unfeasible for large symbols types).
	 */
	bool CalculateSymbolFrequencies(const _TySymbol *p_begin, const _TySymbol *p_end)
	{
		_ASSERTE(SIZE_MAX > ((size_t(1) << (sizeof(_TySymbol) * 8 - 1)) |
			((size_t(1) << (sizeof(_TySymbol) * 8 - 1)) - 1)));
		// make sure n_max_symbol_value will not overflow (the shifts are
		// done in size_t so that the check itself can not overflow int)

		size_t n_max_symbol_value = size_t(1) << (sizeof(_TySymbol) * 8);
		if(!stl_ut::Resize_To_N(m_freq_list, n_max_symbol_value, TFrequency(_TySymbol(0))))
			return false;
		for(size_t i = 1; i < n_max_symbol_value; ++ i)
			m_freq_list[i].n_symbol = _TySymbol(i);
		// alloc/clear frequency list

		_ASSERTE(p_begin <= p_end);
		for(; p_begin != p_end; ++ p_begin) {
			++ m_freq_list[*p_begin].n_frequency;
			// calculate frequency

			_ASSERTE(m_freq_list[*p_begin].n_frequency);
			// overflow shouldn't occur since sum of all frequencies
			// is buffer length which is size_t as well
		}
		// accumulate symbol frequencies

		return true;
	}

	/**
	 *	@brief builds huffman tree and assigns symbols and lengths to frequencies
	 *	@return Returns true on success, false on failure.
	 *	@note This is only valid after CalculateSymbolFrequencies() was called.
	 */
	inline bool Assign_CodeWords()
	{
		return Assign_CodeWords(m_freq_list, m_p_code_num);
	}

	/**
	 *	@brief gets number of elements of code counts table
	 *	@return Returns size of code counts table, in elements.
	 *	@note This is only valid after Assign_CodeWords() was called.
	 */
	inline int n_CodeTable_Size() const
	{
		return max_CodeBitNum;
	}

	/**
	 *	@brief gets code counts table
	 *	@return Returns const pointer to the code counts table.
	 *	@note This is only valid after Assign_CodeWords() was called.
	 */
	inline const int32_t *p_CodeTable() const
	{
		return m_p_code_num;
	}

	/**
	 *	@brief gets size of symbols table
	 *	@return Returns size of symbols table.
	 *	@note This is only valid after Assign_CodeWords() was called.
	 */
	inline size_t n_SymbolTable_Size() const
	{
		return m_freq_list.size();
	}

	/**
	 *	@brief gets a copy of table of symbols 
	 *
	 *	This writes n_SymbolTable_Size() * sizeof(_TySymbol) bytes,
	 *	containing the table of symbols (a table of all encoded symbols,
	 *	sorted by canonical codeword assignment) into the destination.
	 *
	 *	@param[out] p_dest is destination for the table of symbols (allocated by caller)
	 *	@param[in] n_space_bytes is free space in p_dest, in bytes
	 *		(must be at least n_SymbolTable_Size() * sizeof(_TySymbol))
	 *
	 *	@note This is only valid after Assign_CodeWords() was called.
	 */
	void Get_SymbolTable(_TySymbol *p_dest, size_t UNUSED(n_space_bytes)) const
	{
		_ASSERTE(n_space_bytes >= n_SymbolTable_Size() * sizeof(_TySymbol));

		for(size_t i = 0, n = m_freq_list.size(); i < n; ++ i, ++ p_dest)
			*p_dest = m_freq_list[i].n_symbol;
	}

	/**
	 *	@brief sorts frequencies by symbol for fast lookup
	 *	@note This is only valid after Assign_CodeWords() was called.
	 *	@note This must not be called between Assign_CodeWords() and
	 *		Get_SymbolTable() as it will disrupt symbol order.
	 */
	inline void SortFrequencies_BySymbol()
	{
		SortFrequencies_BySymbol(m_freq_list);
	}

	/**
	 *	@brief gets symbol information (codeword)
	 *
	 *	@param[in] t_sym is value of the symbol
	 *
	 *	@return Returns const reference to the symbol information.
	 *
	 *	@note This is only valid after Assign_CodeWords() was called.
	 *	@note This uses binary search by symbol value, the list
	 *		must be sorted using SortFrequencies_BySymbol() first.
	 *	@note This assumes that the symbol is present in the table,
	 *		errors are not handled.
	 */
	const TFrequency &r_LookupSymbol(const _TySymbol t_sym) const
	{
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER > 1200
		_ASSERTE(std::lower_bound(m_freq_list.begin(),
			m_freq_list.end(), t_sym, b_LowerSymbol) != m_freq_list.end()); // make sure the symbol exists
#endif // _MSC_VER && !__MWERKS__ && _MSC_VER > 1200
		/*return *std::lower_bound(m_freq_list.begin(), m_freq_list.end(), t_sym, b_LowerSymbol);*/
		return r_LookupSymbol(t_sym, m_freq_list); // otherwise get duplicate comdat in MSVC 6.0
	}

	/**
	 *	@brief builds huffman tree and assigns symbols and lengths to frequencies
	 *
	 *	The list is sorted by descending frequency, symbols with zero frequency
	 *	are removed, the tree is built using the two-queue method and canonical
	 *	code words are assigned. If the tree comes out deeper than max_CodeBitNum,
	 *	the (working copy of) frequencies is flattened and the tree is rebuilt.
	 *
	 *	@param[in,out] r_freq_list is list of symbol frequencies
	 *		(will remove zero-freq symbols and will fill code words upon successful return)
	 *	@param[out] p_code_num is filled with number of huffman codes of every length;
	 *		entry j holds the number of codes that are j + 1 bits long; must be
	 *		allocated by caller to max_CodeBitNum elements
	 *
	 *	@return Returns true on success, false on failure (not enough memory, no symbols
	 *		with nonzero frequency or too many symbols for the given code length limit).
	 *
	 *	@note On failure, r_freq_list may already be sorted and stripped of the
	 *		zero-frequency symbols and the contents of p_code_num are unspecified.
	 */
	static bool Assign_CodeWords(std::vector<TFrequency> &r_freq_list, int32_t *p_code_num)
	{
		std::sort(r_freq_list.begin(), r_freq_list.end());
		// sort by symbol frequencies (descending, see TFrequency::operator <)

		r_freq_list.erase(std::find_if(r_freq_list.begin(),
			r_freq_list.end(), FindZeroFreq), r_freq_list.end());
		// erase any symbols with zero frequencies (those are at the end after sorting)

		if(r_freq_list.empty())
			return false;
		// no symbols to encode; the node counts below would wrap around zero

		const size_t n_max_symbol_num = (size_t(1) << (max_CodeBitNum - 1)) |
			((size_t(1) << (max_CodeBitNum - 1)) - 1);
		if(r_freq_list.size() >= n_max_symbol_num)
			return false;
		// too much symbols; note this is checked only after removing the unused
		// symbols, otherwise a fully populated table of 16-bit symbols
		// would always be rejected, regardless of how many symbols actually occur

		std::vector<TNode> node_list;
		if(!stl_ut::Reserve_N(node_list, r_freq_list.size() * 2 - 1))
			return false;
		// create list of all nodes in the tree (including non-leaf nodes)

		for(size_t i = 0, n = r_freq_list.size(); i < n; ++ i)
			node_list.push_back(TNode(r_freq_list[i].n_frequency));
		// add nodes holding original symbol frequencies

		node_list.resize(node_list.size() + r_freq_list.size() - 1);
		// add new nodes which will be used as non-leaf nodes
		// (does not reallocate, the space was reserved above)

		std::vector<const TNode*> q1, q2;
		// have two queues

		if(!stl_ut::Reserve_N(q1, r_freq_list.size()) ||
		   !stl_ut::Reserve_N(q2, /*(*/r_freq_list.size() /*+ 1) / 2*/)) // needs more space for tree traversal
			return false;
		// make sure there's plenty of space in both of them

		bool b_restart = false;
		for(;;) {
			_ASSERTE(q1.empty() && q1.capacity() >= r_freq_list.size());
			q1.resize(r_freq_list.size());
			std::transform(node_list.begin(), node_list.begin() +
				r_freq_list.size(), q1.begin(), TNode::p_Ref);
			// add all original nodes to the first queue

			size_t n_free_node = r_freq_list.size();
			const size_t n_end_node = node_list.size();
			// index to the first free node and one node past the end of the list

			_ASSERTE(q2.empty());
			while(q1.size() + q2.size() > 1) {
				_ASSERTE(CFindUnsorted::b_IsSorted(q1));
				// make sure q1 is sorted

				const TNode *p_node[2];
				for(int i = 0; i < 2; ++ i) {
					if(q2.empty() || (!q1.empty() && q1.back()->n_frequency < q2.back()->n_frequency)) {
						p_node[i] = q1.back();
						q1.pop_back();
						// node in the first queue has smaller freq or is the only left
					} else {
						p_node[i] = q2.back();
						q2.pop_back();
						// node in second queue has smaller freq or is the only left
					}
				}
				// choose two nodes to merge (the lowest frequencies are at the back)

				_ASSERTE(n_free_node != n_end_node);
				TNode *p_new = &node_list[n_free_node ++];
				// get some unused node

				_ASSERTE(p_node[0]->n_frequency <= SIZE_MAX - p_node[1]->n_frequency);
				*p_new = TNode(p_node[0]->n_frequency + p_node[1]->n_frequency, p_node[0], p_node[1]);
				// assign frequency and children (frequencies are size_t, must not overflow)

				_ASSERTE(q2.capacity() > q2.size());
				q2.insert(q2.begin(), p_new);
				_ASSERTE(q2.size() <= (r_freq_list.size() + 1) / 2);
				// put it to second queue (at the front, merged
				// frequencies never decrease so it stays sorted)

				//std::sort(q2.begin(), q2.end(), TNode::CompareFreq); // not needed
				_ASSERTE(CFindUnsorted::b_IsSorted(q2));
				// make sure the nodes in q2 are sorted
			}
			// create huffman tree (O(n) method, described in wiki)

			_ASSERTE(n_free_node == n_end_node); // used all the nodes in the tree
			_ASSERTE(q1.empty() && q2.size() == 1); // root node is in q2

			memset(p_code_num, 0, max_CodeBitNum * sizeof(int32_t));
			// clear the code counts array

			if(q2.front()->b_Leaf())
				p_code_num[0] = 1; // single leaf node (extreme case)
			else {
				int32_t n_root_leaf_num = 0;
				// counter for the root level (level -1); the root is not a leaf in this
				// branch so it stays zero, it only avoids referencing p_code_num[-1]

				for(int n_level = -1; !q1.empty() || !q2.empty(); ++ n_level) {
					if(n_level >= max_CodeBitNum) {
						b_restart = true;
						break;
					}
					// compare only if there is more nodes

					int32_t &r_n_leaf_num = (n_level < 0)? n_root_leaf_num : p_code_num[n_level];
					// leaves at level n_level have codes n_level + 1 bits long

					if(n_level & 1) {
						_ASSERTE(!q2.empty() && q1.empty());
						std::for_each(q2.begin(), q2.end(), CGoUp(q1, r_n_leaf_num));
						q2.clear();
					} else {
						_ASSERTE(!q1.empty() && q2.empty());
						std::for_each(q1.begin(), q1.end(), CGoUp(q2, r_n_leaf_num));
						q1.clear();
					}
					// alternately move whole tree levels between q1 and q2
				}
				_ASSERTE(!n_root_leaf_num);
			}
			// determine number of symbols with distinct lengths (note this kind of
			// non-recursive traversal destroys the tree, but needs no additional memory)

			if(!b_restart) {
				_TyCodeWord n_code_word = 0;
				for(size_t i = 0, j = 0, n_symbol_num = r_freq_list.size();; ++ j) {
					int n_length = int(j + 1);
					_ASSERTE(n_length <= max_CodeBitNum);
					for(int32_t k = 0; k < p_code_num[j]; ++ k, ++ i) {
						_ASSERTE(n_code_word < UINT32_MAX && !(n_code_word >> n_length));
						TFrequency &r_freq = r_freq_list[i];
						r_freq.n_code_word = n_code_word ++;
						r_freq.n_code_length = n_length;
					}
					// assign code words (the most frequent symbols are first
					// in the list and so they get the shortest codes)

					if(!(n_symbol_num -= p_code_num[j]))
						break;
					// we've processed this much symbols

					_ASSERTE(n_code_word == ((n_code_word << 1) >> 1));
					n_code_word <<= 1;
					// shift code word (increase length)
				}
				// assign code words, fill-in code lengths

				break;
			} else {
				//printf("huffman restarts ...\r");
				// we need to modify codes so it would get built

				for(size_t i = 0, n = r_freq_list.size(); i < n; ++ i) {
					_TyFrequency &r_n_freq = node_list[i].n_frequency;
					r_n_freq = (r_n_freq / 2) | 1;
				}
				// change symbol probability distribution (flatten it; only the working
				// copy in the leaf nodes is changed, r_freq_list keeps the original values)

				std::sort(node_list.begin(), node_list.begin() + r_freq_list.size());
				// sort leaf nodes by new frequencies

				q1.clear();
				q2.clear();
				b_restart = false;
				// clear for the next pass
			}
		}

		return true;
	}

	/**
	 *	@brief sorts frequencies by symbol for fast lookup
	 *	@param[in,out] freq_list is the list of symbol frequencies
	 */
	static inline void SortFrequencies_BySymbol(std::vector<TFrequency> &freq_list)
	{
		std::sort(freq_list.begin(), freq_list.end(), CompareSymbol);
	}

	/**
	 *	@brief gets symbol information (codeword)
	 *
	 *	@param[in] t_sym is value of the symbol
	 *	@param[in] freq_list is the list of symbol frequencies
	 *		(must be sorted by symbol, see SortFrequencies_BySymbol())
	 *
	 *	@return Returns const reference to the symbol information.
	 *
	 *	@note This assumes that the symbol is present in the table,
	 *		errors are not handled.
	 */
	static inline const TFrequency &r_LookupSymbol(const _TySymbol t_sym, const std::vector<TFrequency> &freq_list)
	{
		return *std::lower_bound(freq_list.begin(), freq_list.end(), t_sym, b_LowerSymbol); // todo - see if this compiles with msvc 60
	}

protected:
	/**
	 *	@brief symbol ordering predicate for r_LookupSymbol()
	 *
	 *	@param[in] n_sym is the left-hand side symbol
	 *	@param[in] r_freq is the right-hand side symbol information
	 *
	 *	@return Returns true if n_sym is lower than the symbol in r_freq.
	 *
	 *	@note std::lower_bound() calls this as (list element, searched value); that
	 *		works through the implicit conversions of TFrequency from and to _TySymbol
	 *		and evaluates to "element symbol < searched symbol".
	 */
	static bool b_LowerSymbol(_TySymbol n_sym, const TFrequency &r_freq)
	{
		return n_sym < r_freq.n_symbol;
	}

	static inline bool FindZeroFreq(const TFrequency &r_t_freq) // predicate; true for symbols that do not occur
	{
		return !r_t_freq.n_frequency;
	}

	static inline bool CompareSymbol(const TFrequency &r_freq_a, const TFrequency &r_freq_b) // ascending symbol order
	{
		return r_freq_a.n_symbol < r_freq_b.n_symbol;
	}
};

/**
 *	@brief Huffman coder
 *
 *	Very simple and easy to use canonical huffman (de)coder, operating on bytes.
 *	Stores (uncompressed) data length, canonical Huffman tree (symbol counts and
 *	symbols), followed by bitstream. last byte is padded with zeros.
 */
class CHuffmanCodec {
public:
	/**
	 *	@brief Huffman configuration enums
	 */
	enum {
		max_CodeBitNum = 16	/**< @brief Huffman code length limit (can be 1 to 31) */
	};

	typedef CHuffmanTree<uint8_t, max_CodeBitNum> CHuff8;	/**< @brief Huffman tree over 8-bit symbols */

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);
};

/**
 *	@brief RLE-0 Huffman coder
 *
 *	Very simple and easy to use canonical huffman (de)coder, operating on bytes.
 *	Stores (uncompressed) data length, canonical Huffman tree (symbol counts and
 *	symbols), followed by bitstream. last byte is padded with zeros.
 *
 *	This version actually builds two huffman trees to facilitate RLE-0 compression.
 */
class CRLE0_HuffmanCodec {
public:
	/**
	 *	@brief Huffman configuration enums
	 */
	enum {
		max_CodeBitNum = 16	/**< @brief Huffman code length limit (can be 1 to 31) */
	};

	typedef CHuffmanTree<uint16_t, max_CodeBitNum> CHuff16;	/**< @brief Huffman tree over 16-bit symbols */
	typedef CHuffmanTree<uint8_t, max_CodeBitNum> CHuff8;	/**< @brief Huffman tree over 8-bit symbols */

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);
};

/**
 *	@brief RLE-0 Huffman coder
 *
 *	Very simple and easy to use canonical huffman (de)coder, operating on bytes.
 *	Stores (uncompressed) data length, canonical Huffman tree (symbol counts and
 *	symbols), followed by bitstream. last byte is padded with zeros.
 *
 *	This version actually builds three huffman trees to facilitate RLE-0 compression
 *	and context-sensitive encoding for symbols that come after the zero run.
 */
class CRLE0_HuffmanCodec_1 {
public:
	/**
	 *	@brief Huffman configuration enums
	 */
	enum {
		max_CodeBitNum = 16	/**< @brief Huffman code length limit (can be 1 to 31) */
	};

	typedef CHuffmanTree<uint16_t, max_CodeBitNum> CHuff16;	/**< @brief Huffman tree over 16-bit symbols */
	typedef CHuffmanTree<uint8_t, max_CodeBitNum> CHuff8;	/**< @brief Huffman tree over 8-bit symbols */

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);
};

/**
 *	@brief RLE-0 Huffman coder
 *
 *	Very simple and easy to use canonical huffman (de)coder, operating on bytes.
 *	Stores (uncompressed) data length, canonical Huffman tree (symbol counts and
 *	symbols), followed by bitstream. last byte is padded with zeros.
 *
 *	This version actually builds two huffman trees to facilitate RLE-0 compression
 *	and context-sensitive encoding for symbols that come after the zero run.
 *	In this version (in contrast with CRLE0_HuffmanCodec_1), the run-lengths are not
 *	huffman encoded, and are stored as raw 16 bit numbers instead.
 */
class CRLE0_HuffmanCodec_2 {
public:
	/**
	 *	@brief Huffman configuration enums
	 */
	enum {
		max_CodeBitNum = 16	/**< @brief Huffman code length limit (can be 1 to 31) */
	};

	typedef CHuffmanTree<uint16_t, max_CodeBitNum> CHuff16;	/**< @brief Huffman tree over 16-bit symbols */
	typedef CHuffmanTree<uint8_t, max_CodeBitNum> CHuff8;	/**< @brief Huffman tree over 8-bit symbols */

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);
};

/**
 *	@brief RLE-0 Huffman coder
 *
 *	Very simple and easy to use canonical huffman (de)coder, operating on bytes.
 *	Stores (uncompressed) data length, canonical Huffman tree (symbol counts and
 *	symbols), followed by bitstream. last byte is padded with zeros.
 *
 *	This version actually builds two huffman trees to facilitate RLE-0 compression
 *	(symbols and zero run lengths are both in the same tree) and context-sensitive
 *	encoding for symbols that come after the zero run.
 */
class CRLE0_HuffmanCodec_3 {
public:
	/**
	 *	@brief Huffman configuration enums
	 */
	enum {
		max_CodeBitNum = 16	/**< @brief Huffman code length limit (can be 1 to 31) */
	};

	typedef CHuffmanTree<uint16_t, max_CodeBitNum> CHuff16;	/**< @brief Huffman tree */
	typedef CHuffmanTree<uint8_t, max_CodeBitNum> CHuff8;	/**< @brief Huffman tree */

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);
};

/**
 *	@brief block-wise adapter for the Huffman codecs
 *
 *	Cuts the input to consecutive blocks of at most n_block_size bytes and compresses
 *	each of them separately using the block codec. The encoded stream is a sequence
 *	of records, each of them being a 32-bit size of the compressed block (stored in
 *	the native byte order of the machine), followed by the compressed block itself.
 *
 *	@tparam _TyHuffmanCodec is the codec used to (de)compress the individual blocks
 *		(needs to have static Encode() and Decode() functions, taking the source
 *		and the destination TBuffer)
 *	@tparam n_block_size is size of the uncompressed blocks, in bytes (must not be zero)
 */
template <class _TyHuffmanCodec = CHuffmanCodec, const size_t n_block_size = 16384>
class CBlockyHuffmanCodec {
public:
	typedef _TyHuffmanCodec CBlockCodec; /**< @brief block codec type */

	enum {
		block_Size = n_block_size /**< @brief size of the individual blocks, compressed with huffman codec */
	};

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_in_buffer, outputs to r_t_out_buffer. The input is
	 *	validated; a truncated size field or a block size that exceeds the remaining
	 *	input makes the function fail rather than read past the end of the buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer (must not be the same
	 *		object as r_t_out_buffer)
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure (malformed input,
	 *		block codec failure or a failed buffer allocation).
	 */
	static bool Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
	{
		_ASSERTE(&r_t_in_buffer != &r_t_out_buffer); // input and output buffer must not be the same

		r_t_out_buffer.Clear(); // !!

		TBuffer t_block_buffer, t_tmp;
		const uint8_t *p_src = r_t_in_buffer.p_Data();
		size_t n_remaining = r_t_in_buffer.n_Size();
		// track the number of unread bytes rather than comparing pointers,
		// so that no pointer past the end of the input is ever formed

		while(n_remaining) {
			uint32_t n_stored_size;
			if(n_remaining < sizeof(n_stored_size))
				return false; // not enough data
			memcpy(&n_stored_size, p_src, sizeof(n_stored_size));
			p_src += sizeof(n_stored_size);
			n_remaining -= sizeof(n_stored_size);
			// get size of input slice (memcpy as the field may not be aligned)

			const size_t n_huffman_slice_size = n_stored_size;
			if(n_huffman_slice_size > n_remaining)
				return false; // the slice would reach past the end of the input

			if(!t_block_buffer.Resize(n_huffman_slice_size, false))
				return false;
			memcpy(t_block_buffer.p_Data(), p_src, n_huffman_slice_size);
			p_src += n_huffman_slice_size;
			n_remaining -= n_huffman_slice_size;
			// prepare the input buffer

			if(!CBlockCodec::Decode(t_block_buffer, t_tmp))
				return false;
			// decode using huffman

			const size_t n_output_offset = r_t_out_buffer.n_Size();
			if(!r_t_out_buffer.Grow(t_tmp.n_Size()))
				return false;
			memcpy(r_t_out_buffer.p_Data() + n_output_offset,
				t_tmp.p_Data(), t_tmp.n_Size());
			// append the decoded block to the output
		}
		// the loop only finishes once the entire sequence was processed

		return true;
	}

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_in_buffer, outputs to r_t_out_buffer. Each block of up to
	 *	n_block_size input bytes is compressed on its own and written to the output,
	 *	prefixed by 32-bit size of the compressed data.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer (must not be the same
	 *		object as r_t_out_buffer)
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure (block codec failure,
	 *		a failed buffer allocation or a compressed block that does not fit
	 *		the 32-bit size field).
	 */
	static bool Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
	{
		_ASSERTE(&r_t_in_buffer != &r_t_out_buffer); // input and output buffer must not be the same

		r_t_out_buffer.Clear(); // !!

		TBuffer t_block_buffer, t_tmp;
		const size_t n_input_size = r_t_in_buffer.n_Size();
		for(size_t n_offset = 0; n_offset < n_input_size;) {
			const size_t n_left = n_input_size - n_offset;
			const size_t n_size = (n_left < size_t(n_block_size))? n_left : size_t(n_block_size);
			if(!n_size)
				return false; // zero block size, the loop would never advance

			if(!t_block_buffer.Resize(n_size, false))
				return false;
			memcpy(t_block_buffer.p_Data(), r_t_in_buffer.p_Data() + n_offset, n_size);
			n_offset += n_size; // can not overflow, n_size is at most n_left
			// prepare the input buffer

			if(!CBlockCodec::Encode(t_block_buffer, t_tmp))
				return false;
			// encode using huffman

			const size_t n_packed_size = t_tmp.n_Size();
			if(n_packed_size > UINT32_MAX)
				return false;
			const uint32_t n_stored_size = uint32_t(n_packed_size);
			// the size must fit the 32-bit field, checked also in release
			// builds so that the stored size is never silently truncated

			const size_t n_output_offset = r_t_out_buffer.n_Size();
			if(!r_t_out_buffer.Grow(sizeof(n_stored_size) + n_packed_size))
				return false;
			uint8_t *p_dest = r_t_out_buffer.p_Data() + n_output_offset;
			memcpy(p_dest, &n_stored_size, sizeof(n_stored_size));
			memcpy(p_dest + sizeof(n_stored_size), t_tmp.p_Data(), n_packed_size);
			// append output buffer with the length of compressed huffman data and the data
			// (memcpy as the size field may not be aligned in the output)
		}
		// perform the compression on blocks

		return true;
	}

};

/**
 *	@brief inversion frequencies coder
 *
 *	Naive (sorted) inversion frequencies implementation. The symbols can optionally
 *	be permuted by their frequency before encoding (see the sort_* enum values).
 */
class CInversionFrequenciesCodec {
private:
	class CSortAsc; // forward declaration, defined elsewhere (presumably the comparator for sort_FreqAscending - confirm in the implementation)
	class CSortDesc; // forward declaration, defined elsewhere (presumably the comparator for sort_FreqDescending - confirm in the implementation)

public:
	/**
	 *	@brief decoding function
	 *
	 *	Decodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer (the encoded data)
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note This is a declaration only, the function body is not a part of this class definition.
	 */
	static bool Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer);

	/**
	 *	@brief symbol sorting names for encoding
	 *
	 *	These are the values accepted as n_permutation_type by Encode().
	 */
	enum {
		sort_NoSort = 0,		/**< @brief symbols are not sorted */
		sort_FreqAscending,		/**< @brief symbols are sorted with increasing frequency before encoding */
		sort_FreqDescending		/**< @brief symbols are sorted with decreasing frequency before encoding */
	};

	/**
	 *	@brief encoding function
	 *
	 *	Encodes data from r_t_in_buffer, outputs to r_t_out_buffer.
	 *
	 *	@param[in] r_t_in_buffer is source data buffer (the data to be encoded)
	 *	@param[out] r_t_out_buffer is destination data buffer (original contents will be lost)
	 *	@param[in] n_permutation_type is type of permutation, applied to symbols before encoding
	 *		(one of sort_NoSort, sort_FreqAscending or sort_FreqDescending; the default
	 *		is sort_NoSort)
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note This is a declaration only, the function body is not a part of this class definition.
	 */
	static bool Encode(const TBuffer &r_t_in_buffer,
		TBuffer &r_t_out_buffer, int n_permutation_type = sort_NoSort);

private:
	/**
	 *	@brief internal helper, writes a single 32-bit value to the output
	 *
	 *	NOTE(review): the definition is not visible here; judging by the signature it
	 *	appends n_value to r_t_out_buffer, the exact stored format needs to be confirmed
	 *	in the implementation.
	 *
	 *	@param[in,out] r_t_out_buffer is the output buffer
	 *	@param[in] n_value is the value to be written
	 *
	 *	@return Presumably returns true on success, false on failure
	 *		(as the other functions of this class do).
	 */
	static inline bool Emit(TBuffer &r_t_out_buffer, uint32_t n_value);
};

#endif // __SIMPLE_COMPRESSION_INCLUDED
