/*
								+--------------------------------+
								|                                |
								|*** Hilbert curve generators ***|
								|                                |
								|  Copyright  -tHE SWINe- 2016  |
								|                                |
								|           Hilbert.h            |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __HILBERT_SPACE_FILLING_CURVES_INCLUDED
#define __HILBERT_SPACE_FILLING_CURVES_INCLUDED

/**
 *	@file Hilbert.h
 *	@brief Hilbert space-filling curve generators
 *	@author -tHE SWINe-
 *	@date 2016
 */

#include "Integer.h"
#include "Vector.h"

/**
 *	@brief fast Hilbert curve generator
 *
 *	This is almost an order of magnitude faster than \ref CAltHilbert
 *	although the advantage diminishes slightly as the the output size
 *	exceeds the size of the caches. Also, this algorithm is currently
 *	limited to two dimensions and cannot seek (both could be done).
 *
 *	@note The output matches that of \ref CAltHilbert.
 */
class CFastHilbert {
protected:
	/**
	 *	@brief a simple 2D rotation / translation transform
	 */
	class CTransform {
	protected:
		const Vector2i v_x; /**< horizontal axis direction */
		const Vector2i v_y; /**< vertical axis direction */
		const Vector2i v_offset; /**< translation vector */

	public:
		CTransform(const int *p_matrix, int n_orient, int n_size)
			:v_x(p_matrix[0], p_matrix[1]), v_y(p_matrix[2], p_matrix[3]),
			v_offset(p_matrix[6] + p_matrix[5 - n_orient] * n_size,
			p_matrix[7] + p_matrix[4 + n_orient] * n_size)
		{}

		inline Vector2i operator ()(Vector2i v_input) const
		{
			return v_x * v_input.x + v_y * v_input.y + v_offset;
		}
	};

	/**
	 *	@brief a simple 2D rotation / translation transform with compile-time coeffs
	 *
	 *	@tparam n_x_x is horizontal component of the horizontal axis direction
	 *	@tparam n_x_y is vertical component of the horizontal axis direction
	 *	@tparam n_y_x is horizontal component of the vertical axis direction
	 *	@tparam n_y_y is vertical component of the vertical axis direction
	 *	@tparam n_oriented_tr_x is horizontal orientable component of the offset
	 *	@tparam n_oriented_tr_y is vertical orientable component of the offset
	 *	@tparam n_translate_x is horizontal fixed direction component of the offset
	 *	@tparam n_translate_y is vertical fixed direction component of the offset
	 *	@tparam orient is orientation of the offset
	 */
	template <int n_x_x, int n_x_y, int n_y_x, int n_y_y, int n_oriented_tr_x,
		int n_oriented_tr_y, int n_translate_x, int n_translate_y, bool b_orient>
	class CTransformT {
	protected:
		const Vector2i m_v_off;

		enum {
			n_translate_x_scale = (b_orient)? n_oriented_tr_x : n_oriented_tr_y,
			n_translate_y_scale = (b_orient)? n_oriented_tr_y : n_oriented_tr_x
		};

	public:
		inline CTransformT(int n_size)
			:m_v_off(n_translate_x_scale * n_size + n_translate_x,
			n_translate_y_scale * n_size + n_translate_y)
		{}

		inline Vector2i operator ()(Vector2i v_input) const
		{
			return Vector2i(v_input.x * n_x_x + v_input.y * n_y_x,
				v_input.x * n_x_y + v_input.y * n_y_y) + m_v_off;
		}
	};

	/**
	 *	@brief merge three transforms to reuse the same input and generate three outputs
	 */
	template <class Op1, class Op2, class Op3, class OutIt>
	class CMergeTransforms {
	protected:
		const Op1 m_op1;
		const Op2 m_op2;
		const Op3 m_op3; // those are actually const
		OutIt *__restrict m_p_dest1;
		OutIt *__restrict m_p_dest2;
		OutIt *__restrict m_p_dest3; // try to explain that these do not alias

	public:
		CMergeTransforms(Op1 op1, Op2 op2, Op3 op3, OutIt *p_dest1, OutIt *p_dest2, OutIt *p_dest3)
			:m_op1(op1), m_op2(op2), m_op3(op3), m_p_dest1(p_dest1),
			m_p_dest2(p_dest2), m_p_dest3(p_dest3)
		{}

		inline void operator ()(Vector2i v_input)
		{
			*m_p_dest1 = m_op1(v_input); ++ m_p_dest1;
			*m_p_dest2 = m_op2(v_input); ++ m_p_dest2;
			*m_p_dest3 = m_op3(v_input); ++ m_p_dest3;
		}
	};

public:
	/**
	 *	@brief generates Hilbert curve, starting at <tt>(0, 0)</tt>
	 *
	 *	@param[out] r_output is filled with the generated coordinates (erased at the beginning)
	 *	@param[in] n_order is base two logarithm of the side of the curve (the generated points
	 *		will cover \f$2^{\text{n\_order}} \times 2^{\text{n\_order}}\f$ area
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note This doesn't use explicitly recursive curve generation scheme.
	 */
	static bool Generate(std::vector<Vector2i> &r_output, unsigned int n_order)
	{
		if(n_order < 1) {
			r_output.clear();
			return true;
		}
		// handle empty curve

		r_output.clear();
		r_output.resize(4);
		r_output.front() = Vector2i(0, 0);
		{
			int a = n_order & 1, b = 1 - a;
			r_output[1] = Vector2i(a, b);
			r_output[2] = Vector2i(1, 1);
			r_output[3] = Vector2i(b, a);
		}
		// create 1st level hilbert with a specific rotation

		if(n_order > 1) {
			if(n_order * 2 > sizeof(size_t) * 8)
				return false; // would run out of bits
			size_t n_sequqnce_size = (size_t(1) << (n_order * 2));
			// calculate required size (4^n_order)

			if(n_sequqnce_size > r_output.max_size())
				return false;
			r_output.reserve(n_sequqnce_size);
			if(r_output.capacity() < n_sequqnce_size)
				return false;
			r_output.resize(n_sequqnce_size);
			// resize output

			for(size_t n_size = 2, n_area = 4; n_order > 1; n_size *= 2, n_area *= 4) {
				-- n_order; // here
#if 0
				const int p_transform[] = {
					 0, 1,		1,  0,		1, 0,		 0,  0,
					 0, 1,		1,  0,		1, 1,		 0,  0,
					-1, 0,		0, -1,		1, 2,		-1, -1
					//  x			y	  orient	 translate
				};
				// transformation matrices

				const int *p_mat = p_transform;
				for(int i = 1; i < 4; ++ i, p_mat += 8) {
					std::transform(r_output.begin(), r_output.begin() + n_area,
						r_output.begin() + n_area * i, CTransform(p_mat, n_order & 1, n_size));
				}
				// create three mirrors in the original array
#else
				if(n_order & 1)
					Mirrors<1>(r_output.begin(), r_output.begin() + n_area, int(n_size), n_area);
				else
					Mirrors<0>(r_output.begin(), r_output.begin() + n_area, int(n_size), n_area);
				// create three mirrors in the original array, use templates
#endif
			}
			// increase level by mirroring the original
		}

		return true;
	}

protected:
	template <class Op1, class Op2, class Op3, class OutIt>
	static inline CMergeTransforms<Op1, Op2, Op3, OutIt> MergeTransforms(Op1 op1,
		Op2 op2, Op3 op3, OutIt *p_dest1, OutIt *p_dest2, OutIt *p_dest3)
	{
		return CMergeTransforms<Op1, Op2, Op3, OutIt>(op1, op2, op3, p_dest1, p_dest2, p_dest3);
	}

	template <const int n_orientation>
	static inline void Mirrors(std::vector<Vector2i>::const_iterator p_src_begin_it,
		std::vector<Vector2i>::iterator p_src_end_dest_begin_it,
		int n_size, size_t n_area)
	{
		std::vector<Vector2i>::const_iterator p_end_it = p_src_end_dest_begin_it; // cast to const
#if 1 // sadly, this is better, not sure why
		std::vector<Vector2i>::iterator p_dest_begin = std::transform(p_src_begin_it,
			p_end_it, p_src_end_dest_begin_it,
			CTransformT<0, 1, 1, 0, 1, 0, 0, 0, n_orientation>(n_size));
		p_dest_begin = std::transform(p_src_begin_it, p_end_it, p_dest_begin,
			CTransformT<0, 1, 1, 0, 1, 1, 0, 0, n_orientation>(n_size));
		std::transform(p_src_begin_it, p_end_it, p_dest_begin,
			CTransformT<-1, 0, 0, -1, 1, 2, -1, -1, n_orientation>(n_size));
#else // 1
		std::for_each(p_src_begin_it, p_end_it, MergeTransforms(
			CTransformT<0, 1, 1, 0, 1, 0, 0, 0, n_orientation>(n_size),
			CTransformT<0, 1, 1, 0, 1, 1, 0, 0, n_orientation>(n_size),
			CTransformT<-1, 0, 0, -1, 1, 2, -1, -1, n_orientation>(n_size),
			&*(p_src_end_dest_begin_it), &*(p_src_end_dest_begin_it + n_area),
			&*(p_src_end_dest_begin_it + 2 * n_area)));
		// merge these loops so that they read only once and write three times
#endif // 1
	}
};

/**
 *	@brief an "alternative" Hilbert curve generator
 *
 *	This speeds up from ~362 msec for a 1024x1024 Hilbert curve to 139 msec
 *	by using C++ features (cca 20 msec by making the dimension compile-time
 *	constant; most of the improvement stems from choosing the integer types
 *	by required size).
 *
 *	@note This is based on a paper by A. R. Butz, Alternative Algorithm for
 *		Hilbert's Space-Filling Curve, IEEE Trans. Comp., April, 1971, that
 *		was reimplemented by Spencer W. Thomas (UMichigan) and subsequently
 *		also by Doug Moore (Rice). This code is based on porting their's to
 *		C++.
 *	@note The output matches that of \ref CFastHilbert.
 */
class CAltHilbert {
protected:
	template <class T, unsigned int n_size = sizeof(T)>
	class CSignedForSize {
	public:
		typedef T _TyResult;
	};

	template <class T>
	class CSignedForSize<T, 4> {
	public:
		typedef int32_t _TyResult;
	};

	template <class T>
	class CSignedForSize<T, 8> {
	public:
		typedef int64_t _TyResult;
	};

public:
	/**
	 *	@brief generates Hilbert curve, starting at <tt>(0, 0)</tt>
	 *
	 *	@param[out] r_output is filled with the generated coordinates (erased at the beginning)
	 *	@param[in] n_order is base two logarithm of the side of the curve (the generated points
	 *		will cover \f$2^{\text{n\_order}} \times 2^{\text{n\_order}}\f$ area
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note This uses a bit-shifting technique described in A. R. Butz, Alternative Algorithm
	 *		for Hilbert's Space-Filling Curve, IEEE Trans. Comp., April, 1971.
	 */
	static bool Generate(std::vector<Vector2i> &r_output, int n_order)
	{
		if(n_order < 1) {
			r_output.clear();
			return true;
		}
		// handle empty curve

		size_t n_length = size_t(1) << n_order;
		n_length *= n_length;
		r_output.resize(n_length);
		/*if(2 * n_order <= 16) { // this one leads to slowdown
			for(size_t i = 0; i < n_length; ++ i)
				Index_to_Coordinates<2, uint16_t, uint16_t>(n_order, i, &r_output[i].x);
		} else*/ if(2 * n_order <= 32) {
			for(uint32_t i = 0; i < n_length; ++ i) {
				Index_to_Coordinates<2, uint32_t, uint32_t>(n_order, i, &r_output[i].x); // not using uint16_t is faster
				_ASSERTE((n_Coordinates_to_Index<2, uint32_t,
					uint32_t>(n_order, &r_output[i].x) == i)); // make sure this works
			}
		} else {
			for(uint64_t i = 0; i < n_length; ++ i) {
				Index_to_Coordinates<2, uint64_t, uint32_t>(n_order, i, &r_output[i].x);
				_ASSERTE((n_Coordinates_to_Index<2, uint64_t,
					uint32_t>(n_order, &r_output[i].x) == i)); // make sure this works
			}
		}
		// generate

		return true;
	}

	/**
	 *	@brief generates 3D Hilbert curve, starting at <tt>(0, 0, 0)</tt>
	 *
	 *	@param[out] r_output is filled with the generated coordinates (erased at the beginning)
	 *	@param[in] n_order is base two logarithm of the side of the curve (the generated points
	 *		will cover \f$2^{\text{n\_order}} \times 2^{\text{n\_order}}\f$ area
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note This uses a bit-shifting technique described in A. R. Butz, Alternative Algorithm
	 *		for Hilbert's Space-Filling Curve, IEEE Trans. Comp., April, 1971.
	 */
	static bool Generate(std::vector<Vector3i> &r_output, int n_order)
	{
		if(n_order < 1) {
			r_output.clear();
			return true;
		}
		// handle empty curve

		size_t n_length = size_t(1) << n_order;
		n_length *= n_length;
		r_output.resize(n_length);
		/*if(3 * n_order <= 16) { // this one leads to slowdown
			for(size_t i = 0; i < n_length; ++ i)
				Index_to_Coordinates<3, uint16_t, uint16_t>(n_order, i, &r_output[i].x);
		} else*/ if(3 * n_order <= 32) {
			for(uint32_t i = 0; i < n_length; ++ i) {
				Index_to_Coordinates<3, uint32_t, uint32_t>(n_order, i, &r_output[i].x); // not using uint16_t is faster
				_ASSERTE((n_Coordinates_to_Index<3, uint32_t,
					uint32_t>(n_order, &r_output[i].x) == i)); // make sure this works
			}
		} else {
			for(uint64_t i = 0; i < n_length; ++ i) {
				Index_to_Coordinates<3, uint64_t, uint32_t>(n_order, i, &r_output[i].x);
				_ASSERTE((n_Coordinates_to_Index<3, uint64_t,
					uint32_t>(n_order, &r_output[i].x) == i)); // make sure this works
			}
		}
		// generate

		return true;
	}

	/**
	 *	@brief returns a 2D point for a given index and order
	 *
	 *	@param[in] n_index is zero-based index of the point on the curve
	 *	@param[in] n_order is base two logarithm of the curve side length
	 *
	 *	@return Returns the same 2D point as in
	 *		<tt>Generate(curve, n_order); return curve[n_index];</tt> but
	 *		without computing the positions of the other points.
	 */
	static Vector2i v_Point(size_t n_index, int n_order)
	{
		_ASSERTE(n_order < 8 * sizeof(size_t));
		size_t n_length = size_t(1) << n_order;
		_ASSERTE(n_index / n_length < n_length);
		Vector2i v_result;
		if(2 * n_order <= 32) {
			_ASSERTE(n_index <= UINT32_MAX);
			Index_to_Coordinates<2, uint32_t, uint32_t>(n_order, uint32_t(n_index), &v_result.x);
		} else {
			_ASSERTE(n_index <= UINT64_MAX);
			Index_to_Coordinates<2, uint64_t, uint64_t>(n_order, uint64_t(n_index), &v_result.x);
		}
		return v_result;
	}

	/**
	 *	@brief returns a 3D point for a given index and order
	 *
	 *	@param[in] n_index is zero-based index of the point on the curve
	 *	@param[in] n_order is base two logarithm of the curve side length
	 *
	 *	@return Returns the same 3D point as in
	 *		<tt>Generate(curve, n_order); return curve[n_index];</tt> but
	 *		without computing the positions of the other points.
	 */
	static Vector3i v_Point_3D(size_t n_index, int n_order)
	{
		_ASSERTE(n_order < 8 * sizeof(size_t));
		size_t n_length = size_t(1) << n_order;
		_ASSERTE(n_index / (n_length * n_length) < n_length);
		Vector3i v_result;
		if(2 * n_order <= 32) {
			_ASSERTE(n_index <= UINT32_MAX);
			Index_to_Coordinates<3, uint32_t, uint32_t>(n_order, uint32_t(n_index), &v_result.x);
		} else {
			_ASSERTE(n_index <= UINT64_MAX);
			Index_to_Coordinates<3, uint64_t, uint64_t>(n_order, uint64_t(n_index), &v_result.x);
		}
		return v_result;
	}

	/**
	 *	@brief returns an index for a given 2D point and order
	 *
	 *	@param[in] v_point is a point on the curve
	 *	@param[in] n_order is base two logarithm of the curve side length
	 *
	 *	@return Returns index of the point, the same as in
	 *		<tt>Generate(curve, n_order); return std::find(curve.begin(), curve.end(), v_point) - curve.begin();</tt>
	 *		but without computing the positions of the other points or searching.
	 */
	static size_t n_Index(Vector2i v_point, int n_order)
	{
		_ASSERTE(n_order < 8 * sizeof(size_t));
		size_t n_length = size_t(1) << n_order;
		_ASSERTE(v_point.x >= 0 && size_t(v_point.x) < n_length);
		_ASSERTE(v_point.y >= 0 && size_t(v_point.y) < n_length);
		if(2 * n_order <= 32)
			return n_Coordinates_to_Index<2, uint32_t, uint32_t>(n_order, &v_point.x);
		else
			return n_Coordinates_to_Index<2, uint64_t, uint64_t>(n_order, &v_point.x);
	}

	/**
	 *	@brief returns an index for a given 3D point and order
	 *
	 *	@param[in] v_point is a point on the curve
	 *	@param[in] n_order is base two logarithm of the curve side length
	 *
	 *	@return Returns index of the point, the same as in
	 *		<tt>Generate(curve, n_order); return std::find(curve.begin(), curve.end(), v_point) - curve.begin();</tt>
	 *		but without computing the positions of the other points or searching.
	 */
	static size_t n_Index(Vector3i v_point, int n_order)
	{
		_ASSERTE(n_order < 8 * sizeof(size_t));
		size_t n_length = size_t(1) << n_order;
		_ASSERTE(v_point.x >= 0 && size_t(v_point.x) < n_length);
		_ASSERTE(v_point.y >= 0 && size_t(v_point.y) < n_length);
		_ASSERTE(v_point.z >= 0 && size_t(v_point.z) < n_length);
		if(3 * n_order <= 32)
			return n_Coordinates_to_Index<3, uint32_t, uint32_t>(n_order, &v_point.x);
		else
			return n_Coordinates_to_Index<3, uint64_t, uint64_t>(n_order, &v_point.x);
	}

protected:
	template <class T>
	static inline void AdjustRotation(unsigned int &r_n_rotation,
		T &r_n_bits, unsigned int n_dims, T n_mask_one_less)
	{
		typedef typename CSignedForSize<T>::_TyResult S; // avoid warning about negating an unsigned number
		r_n_bits &= T(-S(r_n_bits)) & n_mask_one_less;
		while(r_n_bits) {
			r_n_bits >>= 1;
			++ r_n_rotation;
		}
		if(++ r_n_rotation >= n_dims)
			r_n_rotation -= n_dims;
		// r_n_rotation = (r_n_rotation + 1 + ffs(r_n_bits)) % n_dims;
	}

	template <class T>
	static inline T n_RotateRight(T n_value, unsigned int n_rotation, unsigned int n_width, T n_mask)
	{
		return ((n_value >> n_rotation) | (n_value << (n_width - n_rotation))) & n_mask;
	}

	template <class T>
	static inline T n_RotateLeft(T n_value, unsigned int n_rotation, unsigned int n_width, T n_mask)
	{
		return ((n_value << n_rotation) | (n_value >> (n_width - n_rotation))) & n_mask;
	}

	template <class T>
	static inline T n_Mask_NZ(T n_bits)
	{
		_ASSERTE(n_bits > 0);
		return (T(1) << (n_bits - 1)) | ((T(1) << (n_bits - 1)) - 1);
	}

	template <class T>
	static T n_BitTranspose(unsigned int n_dims, unsigned int n_bits, T n_coords_in)
	{
#if 1 // this first branch is much faster
		const unsigned int n_dims_1 = n_dims - 1;
		unsigned int n_bits_in = n_bits;
		T n_field_ends_in = 1;
		T n_mask_in = n_Mask_NZ(T(n_bits_in));
		T n_packed_coords = 0;

		for(unsigned int n_bits_left; (n_bits_left = n_bits_in / 2);) {
			const unsigned int n_shift = n_dims_1 * n_bits_left;
			const T n_fields_ends = n_field_ends_in | (n_field_ends_in << (n_shift + n_bits_left));
			const T n_mask = (n_fields_ends << n_bits_left) - n_fields_ends;
			T utCoords = 0;
			if(n_bits_in & 1) {
				const T inFieldStarts = n_field_ends_in << (n_bits_in-1);
				unsigned int oddShift = 2 * n_shift;
				for(unsigned int d = 0; d < n_dims; ++ d) {
					T in = n_coords_in & n_mask_in;
					n_coords_in >>= n_bits_in;
					n_packed_coords |= (in & inFieldStarts) <<	oddShift++;
					in &= ~inFieldStarts;
					in = (in | (in << n_shift)) & n_mask;
					utCoords |= in << (d*n_bits_left);
				}
			} else {
				for(unsigned int d = 0; d < n_dims; ++ d) {
					T in = n_coords_in & n_mask_in;
					n_coords_in >>= n_bits_in;
					in = (in | (in << n_shift)) & n_mask;
					utCoords |= in << (d * n_bits_left);
				}
			}
			n_coords_in = utCoords;
			n_bits_in = n_bits_left;
			n_field_ends_in = n_fields_ends;
			n_mask_in = n_mask;
		}
		n_packed_coords |= n_coords_in;
		return n_packed_coords;
#else // 1
		T n_packed_coords = 0;
		for(unsigned int d = 0; d < n_dims; ++ d) {
			T in = n_coords_in & n_Mask_NZ(T(n_bits));
			T out = 0;
			n_coords_in >>= n_bits;
			for(unsigned int b = n_bits; b --;) {
				out <<= n_dims;
				out |= (in >> b) & 1;
			}
			n_packed_coords |= out << d;
		}
		return n_packed_coords;
#endif // 1
	}

	template <unsigned int n_dims, class T>
	static T n_BitTranspose(unsigned int n_bits, T n_coords_in)
	{
#if 1 // this first branch is much faster
		const unsigned int n_dims_1 = n_dims - 1;
		unsigned int n_bits_in = n_bits;
		
		T n_field_ends_in = 1;
		T n_mask_in = n_Mask_NZ(T(n_bits_in));
		T n_packed_coords = 0;

		for(unsigned int n_bits_left; (n_bits_left = n_bits_in / 2);) {
			const unsigned int n_shift = n_dims_1 * n_bits_left;
			const T n_fields_ends = n_field_ends_in | (n_field_ends_in << (n_shift + n_bits_left));
			const T n_mask = (n_fields_ends << n_bits_left) - n_fields_ends;
			T utCoords = 0;
			if(n_bits_in & 1) {
				const T inFieldStarts = n_field_ends_in << (n_bits_in - 1);
				unsigned int oddShift = 2 * n_shift;
				for(unsigned int d = 0; d < n_dims; ++ d) {
					T in = n_coords_in & n_mask_in;
					n_coords_in >>= n_bits_in;
					n_packed_coords |= (in & inFieldStarts) << oddShift++;
					in &= ~inFieldStarts;
					in = (in | (in << n_shift)) & n_mask;
					utCoords |= in << (d*n_bits_left);
				}
			} else {
				for(unsigned int d = 0; d < n_dims; ++ d) {
					T in = n_coords_in & n_mask_in;
					n_coords_in >>= n_bits_in;
					in = (in | (in << n_shift)) & n_mask;
					utCoords |= in << (d * n_bits_left);
				}
			}
			n_coords_in = utCoords;
			n_bits_in = n_bits_left;
			n_field_ends_in = n_fields_ends;
			n_mask_in = n_mask;
		}
		n_packed_coords |= n_coords_in;
		return n_packed_coords;
#else // 1
		T n_packed_coords = 0;
		for(unsigned int d = 0; d < n_dims; ++ d) {
			T in = n_coords_in & n_Mask_NZ(T(n_bits));
			T out = 0;
			n_coords_in >>= n_bits;
			for(unsigned int b = n_bits; b --;) {
				out <<= n_dims;
				out |= (in >> b) & 1;
			}
			n_packed_coords |= out << d;
		}
		return n_packed_coords;
#endif // 1
	}

	/**
	 *	@brief convert coordinates of a point on a Hilbert curve to its index
	 *
	 *	@tparam n_dims is number of coordinates
	 *	@tparam bitmask_t is an unsigned type large enough to hold coordinates for all the dimensions
	 *	@tparam halfmask_t is an unsigned type large enough to hold a single coordinate
	 *	@tparam coord_t is integer type of the coordinates (can be signed)
	 *
	 *	@param[in] n_bits is number of bits per coordinate
	 *	@param[in] p_coord is array of coordinates (each using up to <tt>n_bits</tt>)
	 *
	 *	@return Returns the value of the index.
	 */
	template <unsigned int n_dims, class bitmask_t, class halfmask_t, class coord_t>
	static void Index_to_Coordinates(unsigned int n_bits, bitmask_t n_index, coord_t *p_coord)
	{
		_ASSERTE(n_dims * n_bits <= 8 * sizeof(bitmask_t));
		_ASSERTE(n_bits <= 8 * sizeof(halfmask_t));
		_ASSERTE(n_bits <= 8 * sizeof(coord_t));

		if(n_dims > 1) {
			bitmask_t n_packed_coords;
			const halfmask_t n_coord_mask = n_Mask_NZ(halfmask_t(n_bits));

			if(n_bits > 1) {
				const unsigned int n_total_bits = n_dims * n_bits;
				const halfmask_t n_dims_mask = n_Mask_NZ(halfmask_t(n_dims));
				const halfmask_t n_dims_mask_one_less = n_dims_mask >> 1; // for AdjustRotation()
				
				unsigned int n_rotation = 0;
				halfmask_t n_flip_bit = 0;
				const bitmask_t nthbits = n_Mask_NZ(bitmask_t(n_total_bits)) / n_dims_mask;
				n_index ^= (n_index ^ nthbits) >> 1;
				n_packed_coords = 0;
				unsigned int b = n_total_bits;
				do {
					halfmask_t n_bits = (n_index >> (b -= n_dims)) & n_dims_mask;
					n_packed_coords <<= n_dims;
					n_packed_coords |= n_RotateLeft(n_bits, n_rotation, n_dims, n_dims_mask) ^ n_flip_bit;
					n_flip_bit = halfmask_t(1) << n_rotation;
					AdjustRotation(n_rotation, n_bits, n_dims, n_dims_mask_one_less);
				} while(b);
				for(unsigned int b = n_dims; b < n_total_bits; b *= 2)
					n_packed_coords ^= n_packed_coords >> b;
				n_packed_coords = n_BitTranspose(n_bits, n_dims, n_packed_coords);
			} else
				n_packed_coords = n_index ^ (n_index >> 1);

			for(unsigned int d = 0; d < n_dims; ++ d) {
				p_coord[d] = n_packed_coords & n_coord_mask;
				n_packed_coords >>= n_bits;
			}
		} else
			p_coord[0] = n_index;
	}

	/**
	 *	@brief convert coordinates of a point on a Hilbert curve to its index
	 *
	 *	@tparam n_dims is number of coordinates
	 *	@tparam bitmask_t is an unsigned type large enough to hold coordinates for all the dimensions
	 *	@tparam halfmask_t is an unsigned type large enough to hold a single coordinate
	 *	@tparam coord_t is integer type of the coordinates (can be signed)
	 *
	 *	@param[in] n_bits is number of bits per coordinate
	 *	@param[in] p_coord is array of coordinates (each using up to <tt>n_bits</tt>)
	 *
	 *	@return Returns the value of the index.
	 */
	template <unsigned int n_dims, class bitmask_t, class halfmask_t, class coord_t>
	static bitmask_t n_Coordinates_to_Index(unsigned int n_bits, const coord_t *p_coord)
	{
		_ASSERTE(n_dims * n_bits <= 8 * sizeof(bitmask_t));
		_ASSERTE(n_bits <= 8 * sizeof(halfmask_t));
		_ASSERTE(n_bits <= 8 * sizeof(coord_t));

		if(n_dims > 1) {
			const unsigned int n_total_bits = n_dims * n_bits;

			bitmask_t n_packed_coords = 0;
			for(unsigned int d = n_dims; d --;) {
				n_packed_coords <<= n_bits;
				n_packed_coords |= p_coord[d];
			}

			bitmask_t n_index;
			if(n_bits > 1) {
				const halfmask_t n_dims_mask = n_Mask_NZ(halfmask_t(n_dims));
				const halfmask_t n_dims_mask_one_less = n_dims_mask >> 1; // for AdjustRotation()
				const bitmask_t nthbits = n_Mask_NZ(bitmask_t(n_total_bits)) / n_dims_mask;

				n_packed_coords = n_BitTranspose<n_dims>(n_bits, n_packed_coords);
				n_packed_coords ^= n_packed_coords >> n_dims;

				n_index = 0;
				unsigned int rotation = 0;
				halfmask_t n_flip_bit = 0;
				unsigned int b = n_total_bits;
				do {
					halfmask_t bits = (n_packed_coords >> (b -= n_dims)) & n_dims_mask;
					bits = n_RotateRight(n_flip_bit ^ bits, rotation, n_dims, n_dims_mask);
					n_index <<= n_dims;
					n_index |= bits;
					n_flip_bit = halfmask_t(1) << rotation;
					AdjustRotation(rotation, bits, n_dims, n_dims_mask_one_less);
				} while(b);
				n_index ^= nthbits >> 1;
			} else
				n_index = n_packed_coords;
			for(unsigned int d = 1; d < n_total_bits; d *= 2)
				n_index ^= n_index >> d;
			return n_index;
		} else
			return p_coord[0];
	}
};

#endif // !__HILBERT_SPACE_FILLING_CURVES_INCLUDED
