/*
								+--------------------------------+
								|                                |
								|   ***  Exact arithmetic  ***   |
								|                                |
								|  Copyright  -tHE SWINe- 2014  |
								|                                |
								|           Exact.inl            |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __EXACT_MULTIPRECISION_FLOATING_POINT_ARITHMETIC_INTERNAL_INCLUDED
#define __EXACT_MULTIPRECISION_FLOATING_POINT_ARITHMETIC_INTERNAL_INCLUDED

/**
 *	@file Exact.inl
 *	@author -tHE SWINe-
 *	@date 2014
 *	@brief internals for exact multi-precision floating-point arithmetic
 */

/*
 *								=== CExactBase ===
 */

/**
 *	@brief prints the components of an expansion to stdout, on a single line
 *
 *	The output has the form "expansion(len = N, c0, c1, ...)" followed by a newline.
 *	Components of double or float expansions are formatted using the PRIxdouble or
 *	PRIxfloat macros, components of any other type are converted to double and
 *	printed using "%g".
 *
 *	@param[in] n_e_size is number of components of the expansion
 *	@param[in] p_e is pointer to the components of the expansion
 */
template <class T>
inline void CExactBase<T>::DumpExpansion(unsigned int n_e_size, const TType *p_e)
{
	printf("expansion(len = %u", n_e_size);
	if(expansion_internal::ex_msvc6::CIsSameType<TType, double>::b_result) { // compile-time constant
		for(unsigned int i = 0; i < n_e_size; ++ i)
			printf(", " PRIxdouble, PRIxdoubleparams(double(p_e[i]))); // use precise notation
	} else if(expansion_internal::ex_msvc6::CIsSameType<TType, float>::b_result) { // compile-time constant
		for(unsigned int i = 0; i < n_e_size; ++ i)
			printf(", " PRIxfloat, PRIxfloatparams(float(p_e[i]))); // use precise notation
	} else {
		for(unsigned int i = 0; i < n_e_size; ++ i)
			printf(", %g", double(p_e[i])); // "%g" reads a double from the varargs; convert explicitly so that other types (e.g. long double) do not cause undefined behavior
	}
	printf(")\n");
	// all three branches are compiled for every TType; the casts keep each of them well-formed
}

/**
 *	@brief prints the value of an expansion to stdout as a single long hexadecimal number
 *
 *	The mantissas of all the components are accumulated into one wide bit string, aligned
 *	to the exponent of the last nonzero component, and the result is printed in the form
 *	"[-]0x1.<hex digits>p<sign><exponent>". No newline is printed.
 *
 *	@param[in] n_e_size is number of components of the expansion
 *	@param[in] p_e is pointer to the components of the expansion (all must be finite;
 *		the first nonzero component is asserted not to be greater than the last one)
 *
 *	@note An empty or an all-zero expansion is printed as "0x0.0p+0".
 *	@note The exponent is printed using "%x", i.e. in hexadecimal.
 *		NOTE(review): the C99 "%a" notation uses a decimal exponent - confirm this is intended.
 *	@note This function throws std::bad_alloc (allocates a std::vector).
 */
template <class T>
void CExactBase<T>::DumpExpansion_Long(unsigned int n_e_size, const TType *p_e) // throw(std::bad_alloc)
{
	typedef typename CFloatUtils<TType>::TIntType TMantissaInt;
	// a dependent type name, standard C++ requires the typename keyword here

	if(!n_e_size) {
		printf("0x0.0p+0");
		return;
	}
	// an empty expansion has no component to read, print it as zero

	unsigned int n_first_nz = 0;
	while(n_first_nz + 1 < n_e_size && p_e[n_first_nz] == 0)
		++ n_first_nz;
	unsigned int n_last_nz = n_e_size - 1;
	while(n_last_nz > n_first_nz && p_e[n_last_nz] == 0)
		-- n_last_nz;
	// find the first and the last nonzero

	const TType f_first_nz = p_e[n_first_nz], f_last_nz = p_e[n_last_nz];
	if(f_last_nz == 0) {
		printf("0x0.0p+0");
		return;
	}
	// the scans above only stop at a zero if all the components are zero; the normalization
	// loops below would never terminate for a zero mantissa, so print the zero right away

	_ASSERTE(f_first_nz <= f_last_nz); // assumes ascending order (as all the other ops)

	const TMantissaInt n_mantissa_one = CFloatUtils<TType>::n_Mantissa_One();

	int n_leading_sign = CFloatUtils<TType>::n_Get_SignBit(f_last_nz);
	int n_leading_exp = CFloatUtils<TType>::n_Get_Exponent(f_last_nz) + 3; // add 3 to start as "1." with three leading zeros in the first nibble
	{
		TMantissaInt n_mantissa = CFloatUtils<TType>::n_Get_Mantissa(f_last_nz);
		_ASSERTE(n_mantissa); // the number is supposed to be nonzero, so ...
		while(n_mantissa < n_mantissa_one) { // while denormal
			n_mantissa <<= 1; // multiply mantissa by 2
			-- n_leading_exp; // compensate in exponent
		}
		// normalize the leading exponent / mantissa
	}
	int n_trailing_exp = CFloatUtils<TType>::n_Get_Exponent(f_first_nz);
	{
		TMantissaInt n_mantissa = CFloatUtils<TType>::n_Get_Mantissa(f_first_nz);
		_ASSERTE(n_mantissa); // the number is supposed to be nonzero, so ...
		while(n_mantissa < n_mantissa_one) { // while denormal
			n_mantissa <<= 1; // multiply mantissa by 2
			-- n_trailing_exp; // compensate in exponent
		}
		// normalize the trailing exponent / mantissa
	}
	int n_total_mantissa_bit_num = n_leading_exp + CFloatUtils<TType>::n_mantissa_bit_num - n_trailing_exp;

	std::vector<uint8_t> mantissa((n_total_mantissa_bit_num + 7) / 8, 0);
	// the wide mantissa, the most significant bit first (bit 7 of byte 0 has index 0)

	for(int i = n_e_size; i > 0;) {
		-- i; // here

		TType f = p_e[i];
		_ASSERTE(CFloatUtils<TType>::b_Is_Finite(f)); // make sure it is a number
		if(!f)
			continue; // ...
		int n_sign = CFloatUtils<TType>::n_Get_SignBit(f);
		int n_exponent = CFloatUtils<TType>::n_Get_Exponent(f);
		TMantissaInt n_mantissa = CFloatUtils<TType>::n_Get_Mantissa(f);

		_ASSERTE(n_mantissa); // the number is supposed to be nonzero, so ...
		while(n_mantissa < n_mantissa_one) { // while denormal
			n_mantissa <<= 1; // multiply mantissa by 2
			-- n_exponent; // compensate in exponent
		}
		// normalize the exponent / mantissa

		if(n_sign != n_leading_sign) {
			for(int n_shift = n_leading_exp - n_exponent; n_shift > 0;) {
				-- n_shift;
				_ASSERTE(n_shift > 0);
				if((mantissa[n_shift >> 3] & (1 << (7 - (n_shift & 7)))) != 0) {
					mantissa[n_shift >> 3] &= ~(1 << (7 - (n_shift & 7))); // zero the set bit
					break;
				}
				mantissa[n_shift >> 3] |= 1 << (7 - (n_shift & 7)); // flip the zero bit
			}
			// carry up

			n_mantissa = TMantissaInt(0) - n_mantissa;
			// get two's complement of the mantissa, in the full width of the mantissa
			// integer type (a 32-bit constant gives wrong bits for 64-bit mantissas)
		}
		// in case we are subtracting, flip the zero bits on the left of the mantissa
		// until a nonzero bit is found. clear that bit and stop. negate mantissa.

		for(int n_man_shift = CFloatUtils<TType>::n_mantissa_bit_num; n_man_shift > 0; -- n_exponent) {
			-- n_man_shift; // here

			int n_bit = int((n_mantissa >> n_man_shift) & 1);
			int n_shift = n_leading_exp - n_exponent;
			_ASSERTE(!(mantissa[n_shift >> 3] & (1 << (7 - (n_shift & 7))))); // make sure that the corresponding bit is not set
			mantissa[n_shift >> 3] |= n_bit << (7 - (n_shift & 7)); // filling msb to lsb
		}
		// put mantissa bits to the global mantissa
	}

	while(!mantissa.empty() && mantissa.back() == 0)
		mantissa.erase(mantissa.end() - 1);
	// drop tailing zeroes

	if(n_leading_sign)
		printf("-");
	if(mantissa.empty())
		printf("0x0.0p+0"); // sort of non-standard, should be 0x1.0p<min-exponent> but here the width is undefined so the min exponent could be anything
	else {
		_ASSERTE((mantissa.front() & 0xf0) == 0x10);
		printf("0x1.%x", mantissa.front() & 0xf);
		if(mantissa.size() > 1) { // if there is only a single byte, it was already printed as the front
			for(unsigned int i = 1, n = (unsigned int)(mantissa.size() - 1); i < n; ++ i)
				printf("%02x", mantissa[i]); // all the digits, except for the first and the last one
			if(mantissa.back() & 0xf)
				printf("%02x", mantissa.back()); // two digit tail
			else
				printf("%01x", mantissa.back() >> 4); // single digit tail
		}
		n_leading_exp -= 3; // subtract 3 back as we explicitly placed the decimal point
		printf("p%c%x", (n_leading_exp < 0)? '-' : '+', abs(n_leading_exp));
	}

	// t_odo - detect denormals and special numbers, assert them somehow
	// todo - test with denormals

	// could have output to string or to file to be able to output precise floats in a portable way
	// could have a scanf function to convert a long string back to an expansion to be able to read it
}

/**
 *	@brief calculates the number of mantissa bits spanned by an expansion
 *
 *	The width is measured between the (normalized) leading bit of the last nonzero
 *	component and the lowest set mantissa bit of the first nonzero component. If only
 *	a single component is nonzero, the result of n_Bit_Bandwidth() of its mantissa is returned.
 *
 *	@param[in] n_e_size is number of components of the expansion
 *	@param[in] p_e is pointer to the components of the expansion (the first nonzero
 *		component is asserted not to be greater than the last one)
 *
 *	@return Returns the number of bits, or 0 for an empty expansion.
 */
template <class T>
int CExactBase<T>::n_Mantissa_Width(unsigned int n_e_size, const TType *p_e)
{
	typedef typename CFloatUtils<TType>::TIntType TMantissaInt;
	// a dependent type name, standard C++ requires the typename keyword here

	if(!n_e_size)
		return 0; // ...
	unsigned int n_first_nz = 0;
	while(n_first_nz + 1 < n_e_size && p_e[n_first_nz] == 0)
		++ n_first_nz;
	unsigned int n_last_nz = n_e_size - 1;
	while(n_last_nz > n_first_nz && p_e[n_last_nz] == 0)
		-- n_last_nz;
	// find the first and the last nonzero

	const TType f_first_nz = p_e[n_first_nz], f_last_nz = p_e[n_last_nz];
	_ASSERTE(f_first_nz <= f_last_nz); // assumes ascending order (as all the other ops)

	if(n_first_nz == n_last_nz)
		return n_Bit_Bandwidth(CFloatUtils<TType>::n_Get_Mantissa(f_first_nz));
	// required for correct handling of denormals, otherwise
	// the below code would handle single digit expansions as well

	int n_leading_exp = CFloatUtils<TType>::n_Get_Exponent(f_last_nz);
	{
		TMantissaInt n_mantissa = CFloatUtils<TType>::n_Get_Mantissa(f_last_nz);
		_ASSERTE(n_mantissa); // the number is supposed to be nonzero, so ...
		const TMantissaInt n_mantissa_one = CFloatUtils<TType>::n_Mantissa_One();
		while(n_mantissa < n_mantissa_one) { // while denormal
			n_mantissa <<= 1; // multiply mantissa by 2
			-- n_leading_exp; // compensate in exponent
		}
		// normalize the leading exponent / mantissa
	}

	unsigned int n_bit_num = n_leading_exp -
		CFloatUtils<TType>::n_Get_Exponent(f_first_nz) + 1;
	// start with the difference of the exponents, plus the leading "1."

	{
		TMantissaInt n_mantissa = CFloatUtils<TType>::n_Get_Mantissa(f_first_nz);
		if((f_first_nz < 0) != (f_last_nz < 0)) {
			n_mantissa = (TMantissaInt(1) <<
				CFloatUtils<TType>::n_mantissa_bit_num) - n_mantissa;
			// negate the lower part of the mantissa without introducing bits in the upper part

			n_mantissa |= TMantissaInt(1) <<
				(CFloatUtils<TType>::n_mantissa_bit_num - 1);
			// make sure the high mantissa bit is set (to denote where to start counting the bits);
			// the same is done for denormal numbers: the exponent points to the left so probably
			// that is still correct (if there is only a single denormal number in the whole
			// expansion then this code is never reached)
		}
		n_bit_num += CFloatUtils<TType>::n_mantissa_bit_num - n_TrailingZero_Num(n_mantissa) - 1;
	}
	// add the number of bits of the fraction of the lowest number (not
	// sure what this does with denormals but it should generally work)

	return n_bit_num;
}

/**
 *	@brief adds two numbers, yielding the rounded sum and a correction term
 *		(the Two-Sum building block of Shewchuk-style expansion arithmetic)
 *
 *	@param[out] r_f_low is filled with the correction (roundoff) term
 *	@param[out] r_f_high is filled with the rounded sum f_a + f_b
 *	@param[in] f_a is the first addend
 *	@param[in] f_b is the second addend (no ordering of magnitudes is required)
 */
template <class T>
inline void CExactBase<T>::Two_Sum(TType &r_f_low, TType &r_f_high, const TType f_a, const TType f_b)
{
	r_f_high = TType(f_a + f_b);
	// the rounded sum

	TInexactType f_b_part = TType(r_f_high - f_a);
	TType f_a_part = r_f_high - f_b_part;
	// the portions of the rounded sum attributed to f_b and to f_a

	TType f_a_err = f_a - f_a_part;
	TType f_b_err = f_b - f_b_part;
	// what was lost from each of the operands

	r_f_low = f_a_err + f_b_err;
}

/**
 *	@brief subtracts two numbers, yielding the rounded difference and a correction term
 *		(the Two-Diff building block of Shewchuk-style expansion arithmetic)
 *
 *	@param[out] r_f_low is filled with the correction (roundoff) term
 *	@param[out] r_f_high is filled with the rounded difference f_a - f_b
 *	@param[in] f_a is the minuend
 *	@param[in] f_b is the subtrahend (no ordering of magnitudes is required)
 */
template <class T>
inline void CExactBase<T>::Two_Diff(TType &r_f_low, TType &r_f_high, const TType f_a, const TType f_b)
{
	r_f_high = TType(f_a - f_b);
	// the rounded difference

	TInexactType f_b_part = TType(f_a - r_f_high);
	TType f_a_part = r_f_high + f_b_part;
	// the portions of the rounded difference attributed to f_b and to f_a

	TType f_a_err = f_a - f_a_part;
	TType f_b_err = f_b_part - f_b;
	// what was lost from each of the operands

	r_f_low = f_a_err + f_b_err;
}

/**
 *	@brief adds two numbers of ordered magnitude, yielding the rounded sum and a correction term
 *
 *	This is a shorter variant of Two_Sum(), which requires that the magnitude of f_a
 *	is not smaller than the magnitude of f_b (asserted in debug; f_a being zero is tolerated).
 *	Use Two_Sum() if the order of magnitudes is not known.
 *
 *	@param[out] r_f_low is filled with the correction (roundoff) term
 *	@param[out] r_f_high is filled with the rounded sum f_a + f_b
 *	@param[in] f_a is the addend with the larger magnitude
 *	@param[in] f_b is the addend with the smaller magnitude
 */
template <class T>
inline void CExactBase<T>::Fast_Two_Sum(TType &r_f_low, TType &r_f_high, const TType f_a, const TType f_b)
{
	_ASSERTE(f_a == TType(0) || (f_a > f_b) == (f_a > -f_b));
	// either f_a is a spurious zero, or fabs(f_a) >= fabs(f_b)

	r_f_high = TType(f_a + f_b);
	TInexactType f_b_part = TType(r_f_high - f_a); // the portion of the sum attributed to f_b
	r_f_low = f_b - f_b_part;
}

/**
 *	@brief subtracts two numbers of ordered magnitude, yielding the rounded difference and a correction term
 *
 *	This is a shorter variant of Two_Diff(), which requires that the magnitude of f_a
 *	is not smaller than the magnitude of f_b (asserted in debug; f_a being zero is tolerated).
 *	Use Two_Diff() if the order of magnitudes is not known.
 *
 *	@param[out] r_f_low is filled with the correction (roundoff) term
 *	@param[out] r_f_high is filled with the rounded difference f_a - f_b
 *	@param[in] f_a is the minuend (the larger magnitude)
 *	@param[in] f_b is the subtrahend (the smaller magnitude)
 */
template <class T>
inline void CExactBase<T>::Fast_Two_Diff(TType &r_f_low, TType &r_f_high, const TType f_a, const TType f_b)
{
	_ASSERTE(f_a == TType(0) || (f_a > f_b) == (f_a > -f_b));
	// either f_a is a spurious zero, or fabs(f_a) >= fabs(f_b)

	r_f_high = TType(f_a - f_b);
	TInexactType f_b_part = TType(f_a - r_f_high); // the portion of the difference attributed to f_b
	r_f_low = f_b_part - f_b;
}

/**
 *	@brief gets the machine epsilon, as used by the exact arithmetic routines
 *
 *	The value is one half of the unit in the last place of 1 (as given by CFloatUtils).
 *	Note that matlab's epsilon is twice this value (EPS, with no arguments, is the
 *	distance from 1.0 to the next larger double), which means one ULP.
 *
 *	@return Returns the value of epsilon (computed once and cached in a function-local static).
 */
template <class T>
inline const T CExactBase<T>::f_Epsilon()
{
	static const TType f_epsilon = CFloatUtils<TType>::f_ULP(TType(1)) / 2;
	// Shewchuk says one digit smaller than ULP

#ifdef _DEBUG
	{
		TType f_eps_check = 1;
		TType f_sum = 1, f_prev_sum;
		for(;;) {
			f_prev_sum = f_sum;
			f_eps_check *= TType(0.5);
			f_sum = TType(1) + f_eps_check;
			if(f_sum == TType(1) || f_sum == f_prev_sum)
				break;
		}
		// another way of computing epsilon, based on exactinit() by Shewchuk: keep halving
		// until adding to one causes no change (the comparison with the previous sum is for
		// machines that round up instead of using exact rounding; not that this library
		// would work on such machines anyway)

		_ASSERTE(f_eps_check == f_epsilon);
		// debug check
	}
#endif // _DEBUG

	return f_epsilon;
}

/**
 *	@brief gets the splitter constant, used by Split() to break a number into two halves
 *
 *	The value is a power of two plus one, where the exponent of the power of two is derived
 *	from the exponent of the unit in the last place of 1 (described as 1 / sqrt(epsilon) + 1
 *	in the original notes).
 *
 *	@return Returns the value of the splitter (computed once and cached in a function-local static).
 */
template <class T>
inline const T CExactBase<T>::f_Splitter()
{
	static const TType f_splitter = CFloatUtils<TType>::f_MakeFloat(false,
		CFloatUtils<TType>::n_Mantissa_One(), -CFloatUtils<TType>::n_Get_Exponent(
		CFloatUtils<TType>::f_ULP(TType(1))) / 2 + 1) + TType(1);
	// 1 / sqrt(epsilon) + 1
	// the exponent is definitely / 2 + 1 and not ( + 1) / 2, that gives wrong results

#ifdef _DEBUG
	static const TType f_epsilon = CFloatUtils<TType>::f_ULP(TType(1)) / 2;
	// Shewchuk says one digit smaller than ULP

	{
		TType f_eps_check = 1;
		TType f_split_check = 1;
		TType f_sum = 1, f_prev_sum;
		bool b_double_now = true; // the splitter is doubled in every other iteration, starting with the first one
		for(;;) {
			f_prev_sum = f_sum;
			f_eps_check *= TType(0.5);
			if(b_double_now)
				f_split_check *= TType(2);
			b_double_now = !b_double_now;
			f_sum = TType(1) + f_eps_check;
			if(f_sum == TType(1) || f_sum == f_prev_sum)
				break;
		}
		f_split_check += TType(1);
		// another way of computing splitter, based on exactinit() by Shewchuk: keep halving
		// epsilon until adding it to one causes no change (the comparison with the previous
		// sum is for machines that round up instead of using exact rounding)

		_ASSERTE(f_eps_check == f_epsilon);
		_ASSERTE(f_split_check == f_splitter);
		// debug check
	}
#endif // _DEBUG

	return f_splitter;
}

/**
 *	@brief splits a number to a high and a low part, using the constant from f_Splitter()
 *
 *	@param[out] f_a_low is filled with the low part (f_a minus the high part)
 *	@param[out] f_a_high is filled with the high part
 *	@param[in] f_a is the number to be split
 */
template <class T>
inline void CExactBase<T>::Split(TType &f_a_low, TType &f_a_high, const TType f_a)
{
	static const TType f_splitter = f_Splitter();
	// should degenerate to a constant value on reasonable compilers
	// or initialize once - reuse then, on unreasonable compilers

	TInexactType f_scaled = TType(f_splitter * f_a);
	TInexactType f_excess = TType(f_scaled - f_a);
	f_a_high = f_scaled - f_excess;
	f_a_low = f_a - f_a_high;
}

/**
 *	@brief squares a number, yielding the rounded square and a correction term
 *
 *	@param[out] r_f_low is filled with the correction (roundoff) term
 *	@param[out] r_f_high is filled with the rounded square f_a * f_a
 *	@param[in] f_a is the number to be squared
 */
template <class T>
inline void CExactBase<T>::Square(TType &r_f_low, TType &r_f_high, const TType f_a)
{
	TType f_half_low, f_half_high;
	Split(f_half_low, f_half_high, f_a);
	// split the operand to two halves

	r_f_high = TType(f_a * f_a);
	// the rounded square

	TType f_err_hh = r_f_high - (f_half_high * f_half_high);
	TType f_err_cross = f_err_hh - ((f_half_high + f_half_high) * f_half_low);
	r_f_low = (f_half_low * f_half_low) - f_err_cross;
	// subtract the partial products of the halves, one at a time
}

/**
 *	@brief multiplies two numbers, yielding the rounded product and a correction term
 *
 *	@param[out] r_f_low is filled with the correction (roundoff) term
 *	@param[out] r_f_high is filled with the rounded product f_a * f_b
 *	@param[in] f_a is the first factor
 *	@param[in] f_b is the second factor
 *
 *	@todo This can be accelerated using FMA, see Hida et al., Algorithms
 *		for quad-double precision floating point arithmetic, 2000.
 */
template <class T>
inline void CExactBase<T>::Two_Product(TType &r_f_low, TType &r_f_high, const TType f_a, const TType f_b)
{
	TType f_a_lo, f_a_hi;
	Split(f_a_lo, f_a_hi, f_a);
	TType f_b_lo, f_b_hi;
	Split(f_b_lo, f_b_hi, f_b);
	// split both of the operands to halves

	r_f_high = TType(f_a * f_b);
	// the rounded product

	TType f_err_hh = r_f_high - (f_a_hi * f_b_hi);
	TType f_err_lh = f_err_hh - (f_a_lo * f_b_hi);
	TType f_err_hl = f_err_lh - (f_a_hi * f_b_lo);
	r_f_low = (f_a_lo * f_b_lo) - f_err_hl;
	// subtract the partial products of the halves, one at a time
}

/**
 *	@brief multiplies two numbers, the first of which was already split using Split()
 *
 *	@param[out] r_f_low is filled with the correction (roundoff) term
 *	@param[out] r_f_high is filled with the rounded product f_a * f_b
 *	@param[in] f_a is the first factor
 *	@param[in] f_a_low is the low part of the first factor
 *	@param[in] f_a_high is the high part of the first factor
 *	@param[in] f_b is the second factor (split inside)
 */
template <class T>
inline void CExactBase<T>::Two_Product_Presplit(TType &r_f_low, TType &r_f_high, const TType f_a,
	const TType f_a_low, const TType f_a_high, const TType f_b)
{
	TType f_b_lo, f_b_hi;
	Split(f_b_lo, f_b_hi, f_b);
	// only the second operand needs to be split

	r_f_high = TType(f_a * f_b);
	// the rounded product

	TType f_err_hh = r_f_high - (f_a_high * f_b_hi);
	TType f_err_lh = f_err_hh - (f_a_low * f_b_hi);
	TType f_err_hl = f_err_lh - (f_a_high * f_b_lo);
	r_f_low = (f_a_low * f_b_lo) - f_err_hl;
	// subtract the partial products of the halves, one at a time
}

/**
 *	@brief multiplies two numbers, both of which were already split using Split()
 *
 *	@param[out] r_f_low is filled with the correction (roundoff) term
 *	@param[out] r_f_high is filled with the rounded product f_a * f_b
 *	@param[in] f_a is the first factor
 *	@param[in] f_a_low is the low part of the first factor
 *	@param[in] f_a_high is the high part of the first factor
 *	@param[in] f_b is the second factor
 *	@param[in] f_b_low is the low part of the second factor
 *	@param[in] f_b_high is the high part of the second factor
 */
template <class T>
inline void CExactBase<T>::Two_Product_Presplit2(TType &r_f_low, TType &r_f_high, const TType f_a,
	const TType f_a_low, const TType f_a_high, const TType f_b, const TType f_b_low, const TType f_b_high)
{
	r_f_high = TType(f_a * f_b);
	// the rounded product

	TType f_err_hh = r_f_high - (f_a_high * f_b_high);
	TType f_err_lh = f_err_hh - (f_a_low * f_b_high);
	TType f_err_hl = f_err_lh - (f_a_high * f_b_low);
	// subtract the partial products of the halves, one at a time

	r_f_low = (f_a_low * f_b_low) - f_err_hl;
}

/**
 *	@brief adds a single number to an expansion
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + 1)
 *	@param[out] p_result is pointer to the output expansion (can be the same as p_e)
 *	@param[in] n_e_size is number of components of the input expansion
 *	@param[in] p_e is pointer to the components of the input expansion
 *	@param[in] f_b is the number to be added
 *
 *	@return Returns the number of components written to p_result (always n_e_size + 1).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Grow_Expansion(unsigned int UNUSED(n_max_result_size), TType *p_result,
	unsigned int n_e_size, const TType *p_e, TType f_b) // p_e and p_result can be the same
{
	_ASSERTE(n_max_result_size >= n_e_size + 1);

	TType f_carry = f_b;
	unsigned int n_pos = 0;
	while(n_pos < n_e_size) {
		const TType f_component = p_e[n_pos]; // read before the write below, the arrays may be the same
		Two_Sum(p_result[n_pos], f_carry, f_carry, f_component);
		++ n_pos;
	}
	// pass the running sum through all the components, leaving the roundoffs behind

	p_result[n_pos] = f_carry;
	// the running sum becomes the last component

	return n_pos + 1;
}

/**
 *	@brief adds a single number to an expansion, leaving out the zero components of the result
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + 1)
 *	@param[out] p_result is pointer to the output expansion (can be the same as p_e)
 *	@param[in] n_e_size is number of components of the input expansion
 *	@param[in] p_e is pointer to the components of the input expansion
 *	@param[in] f_b is the number to be added
 *
 *	@return Returns the number of components written to p_result (at least one;
 *		if the result is zero, a single zero component is written).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Grow_Expansion_EliminateZeroes(unsigned int UNUSED(n_max_result_size), TType *p_result,
	unsigned int n_e_size, const TType *p_e, TType f_b) // p_e and p_result can be the same
{
	_ASSERTE(n_max_result_size >= n_e_size + 1);

	TType f_carry = f_b;
	unsigned int n_written = 0;
	for(const TType *p_src = p_e, *p_end = p_e + n_e_size; p_src != p_end; ++ p_src) {
		const TType f_component = *p_src; // read before the write below, the arrays may be the same
		TType f_roundoff;
		Two_Sum(f_roundoff, f_carry, f_carry, f_component);
		if(f_roundoff != TType(0))
			p_result[n_written ++] = f_roundoff; // only keep the nonzero roundoffs
	}
	if(!n_written || f_carry != TType(0))
		p_result[n_written ++] = f_carry;
	// don't allow zero-length expansions, that could cause troubles

	return n_written;
}

/**
 *	@brief adds two expansions, one component of the second expansion at a time
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (can be the same as p_e, must not be p_f)
 *	@param[in] n_e_size is number of components of the first expansion
 *	@param[in] p_e is pointer to the components of the first expansion
 *	@param[in] n_f_size is number of components of the second expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the second expansion
 *
 *	@return Returns the number of components written to p_result (always n_e_size + n_f_size).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Sum(unsigned int UNUSED(n_max_result_size), TType *p_result,
	unsigned int n_e_size, const TType *p_e, unsigned int n_f_size, const TType *p_f) // p_e and p_result can be the same expansion
{
	_ASSERTE(p_result != p_f); // p_f and p_result may not be the same
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_f_size > 0);

	TType f_carry = p_f[0];
	unsigned int n_top = 0; // index of the last component of the result
	for(; n_top < n_e_size; ++ n_top) {
		const TType f_cur = p_e[n_top]; // read before the write below, the arrays may be the same
		Two_Sum(p_result[n_top], f_carry, f_carry, f_cur);
	}
	p_result[n_top] = f_carry;
	// grow the first expansion by the first component of the second one

	for(unsigned int n_f_idx = 1; n_f_idx < n_f_size; ++ n_f_idx) {
		f_carry = p_f[n_f_idx];
		for(unsigned int k = n_f_idx; k <= n_top; ++ k) {
			const TType f_cur = p_result[k];
			Two_Sum(p_result[k], f_carry, f_carry, f_cur);
		}
		++ n_top;
		p_result[n_top] = f_carry;
	}
	// grow the result by each of the remaining components, starting one position higher every time

	return n_top + 1;
}

/**
 *	@brief subtracts expansion e from expansion f
 *
 *	The components of e are subtracted from the first component of f, the remaining
 *	components of f are then added, one at a time.
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (can be the same as p_e, must not be p_f)
 *	@param[in] n_f_size is number of components of the minuend expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the minuend expansion
 *	@param[in] n_e_size is number of components of the subtrahend expansion
 *	@param[in] p_e is pointer to the components of the subtrahend expansion
 *
 *	@return Returns the number of components written to p_result (always n_e_size + n_f_size).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Difference(unsigned int UNUSED(n_max_result_size), TType *p_result,
	unsigned int n_f_size, const TType *p_f, unsigned int n_e_size, const TType *p_e) // p_e and p_result can be the same expansion
{
	_ASSERTE(p_result != p_f); // p_f and p_result may not be the same
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_f_size > 0);

	TType f_carry = p_f[0];
	unsigned int n_top = 0; // index of the last component of the result
	for(; n_top < n_e_size; ++ n_top) {
		const TType f_cur = p_e[n_top]; // read before the write below, the arrays may be the same
		Two_Diff(p_result[n_top], f_carry, f_carry, f_cur);
	}
	p_result[n_top] = f_carry;
	// subtract all the components of e from the first component of f

	for(unsigned int n_f_idx = 1; n_f_idx < n_f_size; ++ n_f_idx) {
		f_carry = p_f[n_f_idx];
		for(unsigned int k = n_f_idx; k <= n_top; ++ k) {
			const TType f_cur = p_result[k];
			Two_Sum(p_result[k], f_carry, f_carry, f_cur);
		}
		++ n_top;
		p_result[n_top] = f_carry;
	}
	// add each of the remaining components of f, starting one position higher every time

	return n_top + 1;
}

/**
 *	@brief adds two expansions and removes the zero components from the result in a final pass
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (can be the same as p_e, must not be p_f)
 *	@param[in] n_e_size is number of components of the first expansion
 *	@param[in] p_e is pointer to the components of the first expansion
 *	@param[in] n_f_size is number of components of the second expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the second expansion
 *
 *	@return Returns the number of components left in p_result (at least one;
 *		if the result is zero, a single zero component is left).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Sum_EliminateZeroes1(unsigned int UNUSED(n_max_result_size), TType *p_result,
	unsigned int n_e_size, const TType *p_e, unsigned int n_f_size, const TType *p_f) // p_e and p_result can be the same expansion
{
	_ASSERTE(p_result != p_f); // p_f and p_result may not be the same
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_f_size > 0);

	TType f_carry = p_f[0];
	unsigned int n_top = 0; // index of the last component of the result
	while(n_top < n_e_size) {
		const TType f_cur = p_e[n_top]; // read before the write below, the arrays may be the same
		Two_Sum(p_result[n_top], f_carry, f_carry, f_cur);
		++ n_top;
	}
	p_result[n_top] = f_carry;
	// grow the first expansion by the first component of the second one

	for(unsigned int n_f_idx = 1; n_f_idx < n_f_size; ++ n_f_idx) {
		f_carry = p_f[n_f_idx];
		for(unsigned int k = n_f_idx; k <= n_top; ++ k) {
			const TType f_cur = p_result[k];
			Two_Sum(p_result[k], f_carry, f_carry, f_cur);
		}
		p_result[++ n_top] = f_carry;
	}
	// grow the result by each of the remaining components

	unsigned int n_kept = 0;
	for(unsigned int n_src = 0; n_src <= n_top; ++ n_src) {
		const TType f_cur = p_result[n_src];
		if(f_cur != TType(0))
			p_result[n_kept ++] = f_cur;
	}
	// compact the nonzero components to the beginning of the array

	if(!n_kept) {
		_ASSERTE(p_result[0] == TType(0)); // nothing was moved, the first component is still a zero
		return 1;
	}
	return n_kept;
}

/**
 *	@brief subtracts expansion e from expansion f and removes the zero components
 *		from the result in a final pass
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (can be the same as p_e, must not be p_f)
 *	@param[in] n_f_size is number of components of the minuend expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the minuend expansion
 *	@param[in] n_e_size is number of components of the subtrahend expansion
 *	@param[in] p_e is pointer to the components of the subtrahend expansion
 *
 *	@return Returns the number of components left in p_result (at least one;
 *		if the result is zero, a single zero component is left).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Difference_EliminateZeroes1(unsigned int UNUSED(n_max_result_size), TType *p_result,
	unsigned int n_f_size, const TType *p_f, unsigned int n_e_size, const TType *p_e) // p_e and p_result can be the same expansion
{
	_ASSERTE(p_result != p_f); // p_f and p_result may not be the same
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_f_size > 0);

	TType f_carry = p_f[0];
	unsigned int n_top = 0; // index of the last component of the result
	while(n_top < n_e_size) {
		const TType f_cur = p_e[n_top]; // read before the write below, the arrays may be the same
		Two_Diff(p_result[n_top], f_carry, f_carry, f_cur);
		++ n_top;
	}
	p_result[n_top] = f_carry;
	// subtract all the components of e from the first component of f

	for(unsigned int n_f_idx = 1; n_f_idx < n_f_size; ++ n_f_idx) {
		f_carry = p_f[n_f_idx];
		for(unsigned int k = n_f_idx; k <= n_top; ++ k) {
			const TType f_cur = p_result[k];
			Two_Sum(p_result[k], f_carry, f_carry, f_cur);
		}
		p_result[++ n_top] = f_carry;
	}
	// add each of the remaining components of f

	unsigned int n_kept = 0;
	for(unsigned int n_src = 0; n_src <= n_top; ++ n_src) {
		const TType f_cur = p_result[n_src];
		if(f_cur != TType(0))
			p_result[n_kept ++] = f_cur;
	}
	// compact the nonzero components to the beginning of the array

	if(!n_kept) {
		_ASSERTE(p_result[0] == TType(0)); // nothing was moved, the first component is still a zero
		return 1;
	}
	return n_kept;
}

/**
 *	@brief adds two expansions, dropping the zero roundoff terms as they are produced
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (can be the same as p_e, must not be p_f)
 *	@param[in] n_e_size is number of components of the first expansion
 *	@param[in] p_e is pointer to the components of the first expansion
 *	@param[in] n_f_size is number of components of the second expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the second expansion
 *
 *	@return Returns the number of components written to p_result (at least one;
 *		the last component is the final running sum, which is stored even if it is zero).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Sum_EliminateZeroes2(unsigned int UNUSED(n_max_result_size), TType *p_result,
	unsigned int n_e_size, const TType *p_e, unsigned int n_f_size, const TType *p_f) // p_e and p_result can be the same expansion
{
	_ASSERTE(p_result != p_f); // p_f and p_result may not be the same
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_f_size > 0);

	TType f_carry = p_f[0];
	unsigned int n_fill = 0; // where the next nonzero roundoff goes
	for(unsigned int n_src = 0; n_src < n_e_size; ++ n_src) {
		TType f_roundoff;
		Two_Sum(f_roundoff, f_carry, f_carry, p_e[n_src]);
		if(f_roundoff != TType(0))
			p_result[n_fill ++] = f_roundoff;
	}
	p_result[n_fill] = f_carry;
	unsigned int n_top = n_fill; // index of the last component of the result
	// grow the first expansion by the first component of the second one

	for(unsigned int n_f_idx = 1; n_f_idx < n_f_size; ++ n_f_idx) {
		f_carry = p_f[n_f_idx];
		n_fill = 0;
		for(unsigned int n_src = 0; n_src <= n_top; ++ n_src) {
			TType f_roundoff;
			Two_Sum(f_roundoff, f_carry, f_carry, p_result[n_src]);
			if(f_roundoff != TType(0))
				p_result[n_fill ++] = f_roundoff;
		}
		p_result[n_fill] = f_carry;
		n_top = n_fill;
	}
	// grow the (compacted) result by each of the remaining components, always from the beginning

	return n_top + 1;
}

/**
 *	@brief subtracts expansion e from expansion f, dropping the zero roundoff terms
 *		as they are produced
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (can be the same as p_e, must not be p_f)
 *	@param[in] n_f_size is number of components of the minuend expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the minuend expansion
 *	@param[in] n_e_size is number of components of the subtrahend expansion
 *	@param[in] p_e is pointer to the components of the subtrahend expansion
 *
 *	@return Returns the number of components written to p_result (at least one;
 *		the last component is the final running sum, which is stored even if it is zero).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Difference_EliminateZeroes2(unsigned int UNUSED(n_max_result_size), TType *p_result,
	unsigned int n_f_size, const TType *p_f, unsigned int n_e_size, const TType *p_e) // p_e and p_result can be the same expansion
{
	_ASSERTE(p_result != p_f); // p_f and p_result may not be the same
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_f_size > 0);

	TType f_carry = p_f[0];
	unsigned int n_fill = 0; // where the next nonzero roundoff goes
	for(unsigned int n_src = 0; n_src < n_e_size; ++ n_src) {
		TType f_roundoff;
		Two_Diff(f_roundoff, f_carry, f_carry, p_e[n_src]);
		if(f_roundoff != TType(0))
			p_result[n_fill ++] = f_roundoff;
	}
	p_result[n_fill] = f_carry;
	unsigned int n_top = n_fill; // index of the last component of the result
	// subtract all the components of e from the first component of f

	for(unsigned int n_f_idx = 1; n_f_idx < n_f_size; ++ n_f_idx) {
		f_carry = p_f[n_f_idx];
		n_fill = 0;
		for(unsigned int n_src = 0; n_src <= n_top; ++ n_src) {
			TType f_roundoff;
			Two_Sum(f_roundoff, f_carry, f_carry, p_result[n_src]);
			if(f_roundoff != TType(0))
				p_result[n_fill ++] = f_roundoff;
		}
		p_result[n_fill] = f_carry;
		n_top = n_fill;
	}
	// add each of the remaining components of f to the (compacted) result, always from the beginning

	return n_top + 1;
}

/**
 *	@brief adds two expansions, consuming the components of both in the order of increasing magnitude
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (must not be p_e or p_f)
 *	@param[in] n_e_size is number of components of the first expansion (must be nonzero)
 *	@param[in] p_e is pointer to the components of the first expansion
 *	@param[in] n_f_size is number of components of the second expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the second expansion
 *
 *	@return Returns the number of components written to p_result (n_e_size + n_f_size).
 *
 *	@note NOTE(review): p_result is declared as a pointer to const, but it is written through
 *		below (it is passed to the non-const reference parameter of Fast_Two_Sum() and assigned
 *		to directly), so this function does not appear to compile once instantiated. Fixing it
 *		requires changing the signature here and in the class declaration.
 *	@note NOTE(review): n_f_size is a signed int here, while the other functions use unsigned int.
 *	@note NOTE(review): the "p_e[++ n_e_index]" / "p_f[++ n_f_index]" reads fetch the element
 *		one past the end of the array once its last component is consumed (the value read
 *		is not used, the index checks come first).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Linear_Expansion_Sum(unsigned int UNUSED(n_max_result_size),
	const TType *p_result, unsigned int n_e_size, const TType *p_e, int n_f_size, const TType *p_f) // should be used for long expansions, otherwise n_Expansion_Sum is faster
{
	_ASSERTE(p_result != p_e && p_result != p_f); // p_result cannot be p_e or p_f
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_e_size > 0);
	_ASSERTE(n_f_size > 0);

	TType f_e_cur = *p_e, f_f_cur = *p_f, g0;
	unsigned int n_e_index = 0, n_f_index = 0;
	if((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)) {
		g0 = f_e_cur;
		f_e_cur = p_e[++ n_e_index];
	} else {
		g0 = f_f_cur;
		f_f_cur = p_f[++ n_f_index];
	}
	// take the first component, the one with the smaller magnitude (the comparison
	// is true if the magnitude of f_f_cur is greater than the magnitude of f_e_cur)

	TType Q, q;
	if((n_e_index < n_e_size) && ((n_f_index >= n_f_size) || ((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)))) {
		Fast_Two_Sum(q, Q, f_e_cur, g0);
		f_e_cur = p_e[++ n_e_index];
	} else {
		Fast_Two_Sum(q, Q, f_f_cur, g0);
		f_f_cur = p_f[++ n_f_index];
	}
	// add the second component (from e if f is exhausted or if e has the smaller magnitude)

	unsigned int i = 0;
	for(; i < n_e_size + n_f_size - 2; ++ i) {
		TInexactType R;
		if((n_e_index < n_e_size) && ((n_f_index >= n_f_size) || ((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)))) {
			Fast_Two_Sum(p_result[i], R, f_e_cur, q);
			f_e_cur = p_e[++ n_e_index];
		} else {
			Fast_Two_Sum(p_result[i], R, f_f_cur, q);
			f_f_cur = p_f[++ n_f_index];
		}
		Two_Sum(q, Q, Q, R);
	}
	// each of the remaining components yields one output component; q and Q carry the running sum

	p_result[i] = q;
	p_result[i + 1] = Q;
	// the running sum forms the last two components
	return i + 2;
}

/**
 *	@brief adds two expansions, consuming the components of both in the order of increasing
 *		magnitude, and leaves out the zero components of the result
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (must not be p_e or p_f)
 *	@param[in] n_e_size is number of components of the first expansion (must be nonzero)
 *	@param[in] p_e is pointer to the components of the first expansion
 *	@param[in] n_f_size is number of components of the second expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the second expansion
 *
 *	@return Returns the number of (nonzero) components written to p_result.
 *
 *	@note NOTE(review): p_result is declared as a pointer to const, but it is assigned to
 *		below, so this function does not appear to compile once instantiated. Fixing it
 *		requires changing the signature here and in the class declaration.
 *	@note NOTE(review): unlike the other zero-eliminating functions in this file, a zero result
 *		produces no component at all here and only trips the assertion at the end.
 *	@note NOTE(review): n_f_size is a signed int here, while the other functions use unsigned int.
 *	@note NOTE(review): the "p_e[++ n_e_index]" / "p_f[++ n_f_index]" reads fetch the element
 *		one past the end of the array once its last component is consumed (the value read
 *		is not used, the index checks come first).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Linear_Expansion_Sum_EliminateZeroes(unsigned int UNUSED(n_max_result_size),
	const TType *p_result, unsigned int n_e_size, const TType *p_e, int n_f_size, const TType *p_f) // should be used for long expansions, otherwise n_Expansion_Sum is faster
{
	_ASSERTE(p_result != p_e && p_result != p_f); // p_result cannot be p_e or p_f
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_e_size > 0);
	_ASSERTE(n_f_size > 0);

	TType f_e_cur = *p_e, f_f_cur = *p_f, g0;
	unsigned int n_e_index = 0, n_f_index = 0;
	if((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)) {
		g0 = f_e_cur;
		f_e_cur = p_e[++ n_e_index];
	} else {
		g0 = f_f_cur;
		f_f_cur = p_f[++ n_f_index];
	}
	// take the first component, the one with the smaller magnitude (the comparison
	// is true if the magnitude of f_f_cur is greater than the magnitude of f_e_cur)

	TType Q, q;
	if((n_e_index < n_e_size) && ((n_f_index >= n_f_size) || ((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)))) {
		Fast_Two_Sum(q, Q, f_e_cur, g0);
		f_e_cur = p_e[++ n_e_index];
	} else {
		Fast_Two_Sum(q, Q, f_f_cur, g0);
		f_f_cur = p_f[++ n_f_index];
	}
	// add the second component (from e if f is exhausted or if e has the smaller magnitude)

	unsigned int i = 0;
	for(unsigned int j = 2; j < n_e_size + n_f_size; ++ j) {
		TInexactType R;
		TType hh;
		if((n_e_index < n_e_size) && ((n_f_index >= n_f_size) || ((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)))) {
			Fast_Two_Sum(hh, R, f_e_cur, q);
			f_e_cur = p_e[++ n_e_index];
		} else {
			Fast_Two_Sum(hh, R, f_f_cur, q);
			f_f_cur = p_f[++ n_f_index];
		}
		Two_Sum(q, Q, Q, R);
		if(hh != TType(0)) {
			p_result[i] = hh;
			++ i;
		}
		// only the nonzero components are stored, i counts them
	}
	if(q != TType(0)) {
		p_result[i] = q;
		++ i;
	}
	if(Q != TType(0)) {
		p_result[i] = Q;
		++ i;
	}
	// the running sum forms the last two components, again only if nonzero
	_ASSERTE(i); // make sure the resulting expansion is not empty
	return i;
}

/**
 *	@brief adds two expansions by merging their components in the order of increasing
 *		magnitude and accumulating them in a single pass
 *
 *	Should be used only at the end and not mixed with standard sum or multiplications,
 *	only when user requests it (not in operator +).
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (must not be p_e or p_f)
 *	@param[in] n_e_size is number of components of the first expansion (must be nonzero)
 *	@param[in] p_e is pointer to the components of the first expansion
 *	@param[in] n_f_size is number of components of the second expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the second expansion
 *
 *	@return Returns the number of components written to p_result (always n_e_size + n_f_size).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Fast_Expansion_Sum(unsigned int UNUSED(n_max_result_size),
	TType *p_result, unsigned int n_e_size, const TType *p_e, unsigned int n_f_size, const TType *p_f)
{
	_ASSERTE(p_result != p_e && p_result != p_f); // p_result cannot be p_e or p_f
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_e_size > 0);
	_ASSERTE(n_f_size > 0);

	TType f_e_cur = *p_e, f_f_cur = *p_f;
	unsigned int n_e_index = 0, n_f_index = 0;
	// the next component of an expansion is only fetched if there is one;
	// the arrays are never read past their last element

	TType Q;
	if((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)) {
		Q = f_e_cur;
		if(++ n_e_index < n_e_size)
			f_e_cur = p_e[n_e_index];
	} else {
		Q = f_f_cur;
		if(++ n_f_index < n_f_size)
			f_f_cur = p_f[n_f_index];
	}
	// start with the component of the smaller magnitude (the comparison is true
	// if the magnitude of f_f_cur is greater than the magnitude of f_e_cur)

	unsigned int i = 0;
	if((n_e_index < n_e_size) && (n_f_index < n_f_size)) {
		if((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)) {
			Fast_Two_Sum(p_result[0], Q, f_e_cur, Q);
			if(++ n_e_index < n_e_size)
				f_e_cur = p_e[n_e_index];
		} else {
			Fast_Two_Sum(p_result[0], Q, f_f_cur, Q);
			if(++ n_f_index < n_f_size)
				f_f_cur = p_f[n_f_index];
		}
		i = 1;
		// the second component is known to have at least the magnitude of the first one

		while((n_e_index < n_e_size) && (n_f_index < n_f_size)) {
			if((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)) {
				Two_Sum(p_result[i], Q, Q, f_e_cur);
				if(++ n_e_index < n_e_size)
					f_e_cur = p_e[n_e_index];
			} else {
				Two_Sum(p_result[i], Q, Q, f_f_cur);
				if(++ n_f_index < n_f_size)
					f_f_cur = p_f[n_f_index];
			}
			++ i;
		}
		// merge while both of the expansions have components left
	}
	while(n_e_index < n_e_size) {
		Two_Sum(p_result[i], Q, Q, f_e_cur);
		if(++ n_e_index < n_e_size)
			f_e_cur = p_e[n_e_index];
		++ i;
	}
	while(n_f_index < n_f_size) {
		Two_Sum(p_result[i], Q, Q, f_f_cur);
		if(++ n_f_index < n_f_size)
			f_f_cur = p_f[n_f_index];
		++ i;
	}
	// add the rest of whichever expansion was not exhausted

	p_result[i] = Q;
	return i + 1;
}

/**
 *	@brief adds two expansions by merging their components in the order of increasing
 *		magnitude and accumulating them in a single pass, leaving out the zero
 *		components of the result
 *
 *	Should be used only at the end and not mixed with standard sum or multiplications,
 *	only when user requests it (not in operator +).
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least n_e_size + n_f_size)
 *	@param[out] p_result is pointer to the output expansion (must not be p_e or p_f)
 *	@param[in] n_e_size is number of components of the first expansion (must be nonzero)
 *	@param[in] p_e is pointer to the components of the first expansion
 *	@param[in] n_f_size is number of components of the second expansion (must be nonzero)
 *	@param[in] p_f is pointer to the components of the second expansion
 *
 *	@return Returns the number of components written to p_result (at least one;
 *		if the result is zero, a single zero component is written).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Fast_Expansion_Sum_EliminateZeroes(unsigned int UNUSED(n_max_result_size),
	TType *p_result, unsigned int n_e_size, const TType *p_e, unsigned int n_f_size, const TType *p_f)
{
	_ASSERTE(p_result != p_e && p_result != p_f); // p_result cannot be p_e or p_f
	_ASSERTE(n_max_result_size > 0 && n_max_result_size >= n_e_size + n_f_size);
	_ASSERTE(n_e_size > 0);
	_ASSERTE(n_f_size > 0);

	TType f_e_cur = *p_e, f_f_cur = *p_f;
	unsigned int n_e_index = 0, n_f_index = 0;
	// the next component of an expansion is only fetched if there is one;
	// the arrays are never read past their last element

	TType Q;
	if((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)) {
		Q = f_e_cur;
		if(++ n_e_index < n_e_size)
			f_e_cur = p_e[n_e_index];
	} else {
		Q = f_f_cur;
		if(++ n_f_index < n_f_size)
			f_f_cur = p_f[n_f_index];
	}
	// start with the component of the smaller magnitude (the comparison is true
	// if the magnitude of f_f_cur is greater than the magnitude of f_e_cur)

	unsigned int i = 0; // number of the nonzero components written so far
	if((n_e_index < n_e_size) && (n_f_index < n_f_size)) {
		TType hh;
		if((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)) {
			Fast_Two_Sum(hh, Q, f_e_cur, Q);
			if(++ n_e_index < n_e_size)
				f_e_cur = p_e[n_e_index];
		} else {
			Fast_Two_Sum(hh, Q, f_f_cur, Q);
			if(++ n_f_index < n_f_size)
				f_f_cur = p_f[n_f_index];
		}
		if(hh != TType(0)) {
			p_result[0] = hh;
			i = 1;
		}
		// the second component is known to have at least the magnitude of the first one

		while((n_e_index < n_e_size) && (n_f_index < n_f_size)) {
			TType hh;
			if((f_f_cur > f_e_cur) == (f_f_cur > -f_e_cur)) {
				Two_Sum(hh, Q, Q, f_e_cur);
				if(++ n_e_index < n_e_size)
					f_e_cur = p_e[n_e_index];
			} else {
				Two_Sum(hh, Q, Q, f_f_cur);
				if(++ n_f_index < n_f_size)
					f_f_cur = p_f[n_f_index];
			}
			if(hh != TType(0)) {
				p_result[i] = hh;
				++ i;
			}
		}
		// merge while both of the expansions have components left
	}
	while(n_e_index < n_e_size) {
		TType hh;
		Two_Sum(hh, Q, Q, f_e_cur);
		if(++ n_e_index < n_e_size)
			f_e_cur = p_e[n_e_index];
		if(hh != TType(0)) {
			p_result[i] = hh;
			++ i;
		}
	}
	while(n_f_index < n_f_size) {
		TType hh;
		Two_Sum(hh, Q, Q, f_f_cur);
		if(++ n_f_index < n_f_size)
			f_f_cur = p_f[n_f_index];
		if(hh != TType(0)) {
			p_result[i] = hh;
			++ i;
		}
	}
	// add the rest of whichever expansion was not exhausted

	if(Q != TType(0) || !i) {
		p_result[i] = Q;
		++ i;
	}
	// don't produce a zero-length expansion
	return i;
}

/**
 *	@brief multiplies an expansion by a single number
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least 2 * n_e_size)
 *	@param[out] p_result is pointer to the output expansion (must not be p_e)
 *	@param[in] n_e_size is number of components of the input expansion (must be nonzero)
 *	@param[in] p_e is pointer to the components of the input expansion
 *	@param[in] f_scale is the multiplier
 *
 *	@return Returns the number of components written to p_result (always 2 * n_e_size).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Scale_Expansion(unsigned int UNUSED(n_max_result_size),
	TType *p_result, unsigned int n_e_size, const TType *p_e, TType f_scale)
{
	_ASSERTE(p_result != p_e); // p_e and p_result cannot be the same
	_ASSERTE(n_max_result_size >= n_e_size * 2);
	_ASSERTE(n_e_size > 0);

	TType f_scale_lo, f_scale_hi;
	Split(f_scale_lo, f_scale_hi, f_scale);
	// split the multiplier only once, reuse it for all the products

	TInexactType f_carry;
	Two_Product_Presplit(p_result[0], f_carry, f_scale, f_scale_lo, f_scale_hi, p_e[0]);
	// the first product yields the first component and the initial carry

	TType *p_dest = p_result + 1;
	for(unsigned int n_src = 1; n_src < n_e_size; ++ n_src, p_dest += 2) {
		TType f_prod_low;
		TInexactType f_prod_high;
		Two_Product_Presplit(f_prod_low, f_prod_high, f_scale, f_scale_lo, f_scale_hi, p_e[n_src]);
		TInexactType f_partial;
		Two_Sum(p_dest[0], f_partial, f_carry, f_prod_low);
		Two_Sum(p_dest[1], f_carry, f_prod_high, f_partial); // this could be fast two sum, based on the paper? seems only in the last iteration.
	}
	// every other product yields two more components

	*p_dest = f_carry;
	// the carry becomes the last component

	return n_e_size + n_e_size;
}

/**
 *	@brief multiplies an expansion by a single number, leaving out the zero components of the result
 *
 *	@param[in] n_max_result_size is the allocated size of p_result (only checked in debug,
 *		must be at least 2 * n_e_size)
 *	@param[out] p_result is pointer to the output expansion (must not be p_e)
 *	@param[in] n_e_size is number of components of the input expansion (must be nonzero)
 *	@param[in] p_e is pointer to the components of the input expansion
 *	@param[in] f_scale is the multiplier
 *
 *	@return Returns the number of components written to p_result (at least one;
 *		if the result is zero, a single zero component is written).
 */
template <class T>
inline unsigned int CExactBase<T>::n_Scale_Expansion_EliminateZeroes(unsigned int UNUSED(n_max_result_size),
	TType *p_result, unsigned int n_e_size, const TType *p_e, TType f_scale)
{
	_ASSERTE(p_result != p_e); // p_e and p_result cannot be the same
	_ASSERTE(n_max_result_size >= n_e_size * 2);
	_ASSERTE(n_e_size > 0);

	TType f_scale_lo, f_scale_hi;
	Split(f_scale_lo, f_scale_hi, f_scale);
	// split the multiplier only once, reuse it for all the products

	unsigned int n_written = 0;
	TInexactType f_carry;
	{
		TType f_first_low;
		Two_Product_Presplit(f_first_low, f_carry, f_scale, f_scale_lo, f_scale_hi, p_e[0]);
		if(f_first_low != TType(0))
			p_result[n_written ++] = f_first_low;
	}
	// the first product yields (at most) one component and the initial carry

	for(unsigned int n_src = 1; n_src < n_e_size; ++ n_src) {
		TType f_prod_low;
		TInexactType f_prod_high;
		Two_Product_Presplit(f_prod_low, f_prod_high, f_scale, f_scale_lo, f_scale_hi, p_e[n_src]);

		TInexactType f_partial;
		TType f_roundoff;
		Two_Sum(f_roundoff, f_partial, f_carry, f_prod_low);
		if(f_roundoff != TType(0))
			p_result[n_written ++] = f_roundoff;

		Two_Sum(f_roundoff, f_carry, f_prod_high, f_partial); // this could be fast two sum, based on the paper? seems only in the last iteration.
		if(f_roundoff != TType(0))
			p_result[n_written ++] = f_roundoff;
	}
	// every other product yields up to two more components

	if(!n_written || f_carry != TType(0))
		p_result[n_written ++] = f_carry;
	// don't produce a zero-length expansion

	return n_written;
}

// Scales the expansion p_e (n_e_size coefficients) by f_scale, which comes already
// split to f_scale_low and f_scale_high, and writes the result to p_result.
// As a by-product, the split of each coefficient of p_e is stored in p_e_low and p_e_high
// (n_e_size elements each), so that it can be reused by n_Scale_Expansion_Presplit().
//
// All five arrays must be distinct, n_e_size must be nonzero and n_max_result_size
// must be at least 2 * n_e_size (checked by assertions only).
//
// Returns the size of the result, which is always 2 * n_e_size (zeroes are not eliminated).
template <class T>
inline unsigned int CExactBase<T>::n_Scale_Expansion_Split(unsigned int UNUSED(n_max_result_size),
	TType *p_result, unsigned int n_e_size, TType *p_e_low, TType *p_e_high, const TType *p_e,
	TType f_scale, TType f_scale_low, TType f_scale_high)
{
	_ASSERTE(p_result != p_e && p_e_low != p_result && p_e_high != p_result); // p_e and p_result cannot be the same
	_ASSERTE(p_e_low != p_e_high && p_e != p_e_low && p_e != p_e_high); // must be different arrays
	_ASSERTE(n_max_result_size >= n_e_size * 2);
	_ASSERTE(n_e_size > 0);

	TInexactType Q; // running carry
	Split(p_e_low[0], p_e_high[0], p_e[0]);
	Two_Product_Presplit2(p_result[0], Q, f_scale, f_scale_low, f_scale_high, p_e[0], p_e_low[0], p_e_high[0]);
	// the first coefficient: split it, multiply it, there is no carry to add yet

	TType *p_dest = p_result + 1; // the next coefficient of the result to be written
	for(unsigned int j = 1; j < n_e_size; ++ j, p_dest += 2) {
		const TType f_e_j = p_e[j];
		Split(p_e_low[j], p_e_high[j], f_e_j);
		// split the current coefficient and remember the halves for the caller

		TInexactType product1;
		TType product0;
		Two_Product_Presplit2(product0, product1, f_scale, f_scale_low, f_scale_high, f_e_j, p_e_low[j], p_e_high[j]);
		// product of the scale with the current coefficient, as two terms

		TInexactType sum;
		Two_Sum(p_dest[0], sum, Q, product0);
		Two_Sum(p_dest[1], Q, product1, sum); // this could be fast two sum, based on the paper? seems only in the last iteration.
		// every coefficient except the first one yields two coefficients of the result
	}
	*p_dest = Q;
	// the last carry becomes the last coefficient

	return 2 * n_e_size;
}

// Scales the expansion p_e (n_e_size coefficients) by f_scale and writes the result
// to p_result. Both operands come already split: p_e_low / p_e_high hold the split
// of the corresponding coefficients of p_e (e.g. as filled by n_Scale_Expansion_Split())
// and f_scale_low / f_scale_high hold the split of f_scale.
//
// All the arrays must be distinct, n_e_size must be nonzero and n_max_result_size
// must be at least 2 * n_e_size (checked by assertions only).
//
// Returns the size of the result, which is always 2 * n_e_size (zeroes are not eliminated).
template <class T>
inline unsigned int CExactBase<T>::n_Scale_Expansion_Presplit(unsigned int UNUSED(n_max_result_size),
	TType *p_result, unsigned int n_e_size, const TType *p_e, const TType *p_e_low, const TType *p_e_high,
	TType f_scale, TType f_scale_low, TType f_scale_high)
{
	_ASSERTE(p_result != p_e && p_e_low != p_result && p_e_high != p_result); // p_e and p_result cannot be the same
	_ASSERTE(p_e_low != p_e_high && p_e != p_e_low && p_e != p_e_high); // must be different arrays
	_ASSERTE(n_max_result_size >= n_e_size * 2);
	_ASSERTE(n_e_size > 0);

	TInexactType Q; // running carry
	Two_Product_Presplit2(p_result[0], Q, f_scale, f_scale_low, f_scale_high, p_e[0], p_e_low[0], p_e_high[0]);
	// the first coefficient, there is no carry to add yet

	TType *p_dest = p_result + 1; // the next coefficient of the result to be written
	for(unsigned int j = 1; j < n_e_size; ++ j, p_dest += 2) {
		const TType f_e_j = p_e[j], f_e_j_low = p_e_low[j], f_e_j_high = p_e_high[j];
		TInexactType product1;
		TType product0;
		Two_Product_Presplit2(product0, product1, f_scale, f_scale_low, f_scale_high, f_e_j, f_e_j_low, f_e_j_high);
		// product of the scale with the current coefficient, as two terms

		TInexactType sum;
		Two_Sum(p_dest[0], sum, Q, product0);
		Two_Sum(p_dest[1], Q, product1, sum); // this could be fast two sum, based on the paper? seems only in the last iteration.
		// every coefficient except the first one yields two coefficients of the result
	}
	*p_dest = Q;
	// the last carry becomes the last coefficient

	return 2 * n_e_size;
}

// a product of two expansions is like the left expansion scaled by the first component
// of the right expansion, summed with the left expansion scaled by the next component
// of the right expansion, using expansion_sum
// there is opportunity to split the expansions, but some temp memory is needed for that
// this should be probably allocated by the static part, where the dimensions are known.

// expansion scale: doubles the length of the expansion
// expansion sum: sums the lengths of the expansions
// expansion product:
//		* in each step, the scaled left expansion is twice the length of the left expansion
//			* to presplit the left expansion, twice the left expansion size of temp storage is needed
//			* the operations can't be reordered, to avoid this temp storage, but can be minimized by putting the smaller factor on the left
//		* the sum of all products is twice the length of the left expansion times the length of the right expansion
//			* seems like the sums can be done in-place
// size is 2ab = 2ba, where a is size of expansion of A, and b is size of expansion of B (the order does not matter)

// t_odo - revert the order, CExpansion algorithms make sure that the right operand is the smaller one, to make
// handling of special cases easier (and because that likely requires fewer swaps, as the operators are left
// associative, making the left side of an expression (if without parentheses) potentially longer)
// just remember to multiply with the operands swapped, one needs to get all those temp arrays, should be obvious

// t_odo - n_Scale_Expansion_Split() - generates split of the left expansion, the right side already comes split
// t_odo - n_Scale_Expansion_Presplit() - uses a previously generated split of the left expansion, the right side already comes split
// t_odo - n_Expansion_Product() - use the two above and n_Expansion_Sum() to build the final product inplace
// t_odo - think about n_Expansion_Product_EliminateZeroes(); could a) filter out the zeros at the end,
//		b) filter out the zeros using n_Expansion_Sum_EliminateZeros() or
//		c) filter out the zeros also in n_Scale_Expansion_Split_EliminateZeros() and
//		n_Scale_Expansion_Presplit_EliminateZeros()

// Calculates the product of the expansions p_e (n_e_size coefficients) and p_f
// (n_f_size coefficients) and stores it in p_result.
//
// p_e is scaled by each coefficient of p_f in turn and the scaled expansions are
// accumulated in p_result using n_Expansion_Sum(). The caller supplies the temporary
// storage: p_e_low and p_e_high receive the split of p_e (n_e_size elements each)
// and p_e_scaled (n_max_e_scaled_size >= 2 * n_e_size elements) holds one scaled copy
// of p_e at a time.
//
// p_e and p_f may be the same array (squaring), all the other arrays must be distinct.
// n_max_result_size must be at least 2 * n_e_size * n_f_size and n_f_size must be nonzero
// (all of the above is checked by assertions only).
//
// Returns the number of coefficients of the product, as reported by the last operation.
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Product(unsigned int n_max_result_size, TType *p_result,
	unsigned int n_e_size, const TType *p_e, TType *p_e_low, TType *p_e_high, unsigned int n_max_e_scaled_size, TType *p_e_scaled,
	unsigned int n_f_size, const TType *p_f)
{
	_ASSERTE(p_e != p_f || n_e_size == n_f_size); // either they are different expansions or they are the same
	_ASSERTE(/*p_f != p_e &&*/ p_e_low != p_f && p_e_high != p_f && p_e_scaled != p_f); // temp arrays for p_e cannot be the same as p_f, but p_e and p_f themselves can be the same when squaring an expansion
	_ASSERTE(p_result != p_e && p_e_low != p_result && p_e_high != p_result && p_e_scaled != p_result); // p_e and p_result cannot be the same
	_ASSERTE(p_f != p_result); // p_f and p_result cannot be the same
	_ASSERTE(p_e != p_e_low && p_e != p_e_high && p_e != p_e_scaled &&
		p_e_low != p_e_high && p_e_low != p_e_scaled && p_e_high != p_e_scaled); // p_e must be in four different arrays
	_ASSERTE(n_max_result_size >= 2 * n_e_size * n_f_size);
	_ASSERTE(n_max_e_scaled_size >= 2 * n_e_size);
	_ASSERTE(n_f_size > 0);

	// note that calculating square of an expansion would save the splits of p_f, which are already calculated for p_e

	TType f_digit = p_f[0], f_digit_low, f_digit_high; // the current coefficient of p_f and its split
	Split(f_digit_low, f_digit_high, f_digit);
	unsigned int n_product_size = n_Scale_Expansion_Split(n_max_result_size, p_result, n_e_size,
		p_e_low, p_e_high, p_e, f_digit, f_digit_low, f_digit_high); // unnecessarily slow if n_f_size == 1, then a simple scale expansion would be faster
	// the first partial product goes directly to the result, p_e gets split in the process

	//DumpExpansion(n_product_size, p_result); // debug

	for(unsigned int n_digit = 1; n_digit < n_f_size; ++ n_digit) {
		f_digit = p_f[n_digit];
		Split(f_digit_low, f_digit_high, f_digit);
		const unsigned int n_scaled_size = n_Scale_Expansion_Presplit(n_max_e_scaled_size, p_e_scaled,
			n_e_size, p_e, p_e_low, p_e_high, f_digit, f_digit_low, f_digit_high);
		// scale p_e by the next coefficient of p_f, reuse the split of p_e

		//DumpExpansion(n_scaled_size, p_e_scaled); // debug

		n_product_size = n_Expansion_Sum(n_max_result_size, p_result,
			n_product_size, p_result, n_scaled_size, p_e_scaled);
		// accumulate the partial product in the result (the sum runs inplace)
	}

	return n_product_size;
}

// Calculates the product of the expansions p_e (n_e_size coefficients) and p_f
// (n_f_size coefficients) and stores it in p_result, with the zero coefficients
// eliminated.
//
// p_e is scaled by each coefficient of p_f in turn and the scaled expansions are
// accumulated in p_result using n_Expansion_Sum_EliminateZeroes2(). The caller supplies
// the temporary storage: p_e_low and p_e_high receive the split of p_e (n_e_size
// elements each) and p_e_scaled (n_max_e_scaled_size >= 2 * n_e_size elements) holds
// one scaled copy of p_e at a time.
//
// p_e and p_f may be the same array (squaring), all the other arrays must be distinct.
// n_max_result_size must be at least 2 * n_e_size * n_f_size and n_f_size must be nonzero
// (all of the above is checked by assertions only).
//
// Returns the number of coefficients of the product, as reported by the last operation.
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Product_EliminateZeroes(unsigned int n_max_result_size, TType *p_result,
	unsigned int n_e_size, const TType *p_e, TType *p_e_low, TType *p_e_high, unsigned int n_max_e_scaled_size, TType *p_e_scaled,
	unsigned int n_f_size, const TType *p_f)
{
	_ASSERTE(p_e != p_f || n_e_size == n_f_size); // either they are different expansions or they are the same
	_ASSERTE(/*p_f != p_e &&*/ p_e_low != p_f && p_e_high != p_f && p_e_scaled != p_f); // temp arrays for p_e cannot be the same as p_f, but p_e and p_f themselves can be the same when squaring an expansion
	_ASSERTE(p_result != p_e && p_e_low != p_result && p_e_high != p_result && p_e_scaled != p_result); // p_e and p_result cannot be the same
	_ASSERTE(p_f != p_result); // p_f and p_result cannot be the same
	_ASSERTE(p_e != p_e_low && p_e != p_e_high && p_e != p_e_scaled &&
		p_e_low != p_e_high && p_e_low != p_e_scaled && p_e_high != p_e_scaled); // p_e must be in four different arrays
	_ASSERTE(n_max_result_size >= 2 * n_e_size * n_f_size);
	_ASSERTE(n_max_e_scaled_size >= 2 * n_e_size);
	_ASSERTE(n_f_size > 0);

	// note that calculating square of an expansion would save the splits of p_f, which are already calculated for p_e

	TType f_digit = p_f[0], f_digit_low, f_digit_high; // the current coefficient of p_f and its split
	Split(f_digit_low, f_digit_high, f_digit);
	unsigned int n_product_size = n_Scale_Expansion_Split(n_max_result_size, p_result,
		n_e_size, p_e_low, p_e_high, p_e, f_digit, f_digit_low, f_digit_high); // unnecessarily slow if n_f_size == 1, then a simple scale expansion would be faster
	// the first partial product goes directly to the result, p_e gets split in the process

	if(n_f_size == 1)
		return n_EliminateZeroes(n_product_size, p_result); // saves me writing n_Scale_Expansion_Split_EliminateZeroes()
	// with a single coefficient in p_f there is no addition below which would
	// eliminate the zeros, so they need to be eliminated explicitly

	for(unsigned int n_digit = 1; n_digit < n_f_size; ++ n_digit) {
		f_digit = p_f[n_digit];
		Split(f_digit_low, f_digit_high, f_digit);
		const unsigned int n_scaled_size = n_Scale_Expansion_Presplit(n_max_e_scaled_size, p_e_scaled,
			n_e_size, p_e, p_e_low, p_e_high, f_digit, f_digit_low, f_digit_high);
		// scale p_e by the next coefficient of p_f, reuse the split of p_e

		n_product_size = n_Expansion_Sum_EliminateZeroes2(n_max_result_size, p_result,
			n_product_size, p_result, n_scaled_size, p_e_scaled);
		// accumulate the partial product in the result (the sum runs inplace)
	}

	return n_product_size;
}

// Calculates the square of the expansion p_e (n_e_size coefficients) and stores
// it in p_result.
//
// All of p_e is split first (into p_e_low and p_e_high, n_e_size elements each),
// then p_e is scaled by each of its own coefficients in turn and the scaled expansions
// are accumulated in p_result using n_Expansion_Sum(). p_e_scaled
// (n_max_e_scaled_size >= 2 * n_e_size elements) holds one scaled copy of p_e at a time.
//
// All the arrays must be distinct, n_e_size must be nonzero and n_max_result_size
// must be at least 2 * n_e_size * n_e_size (checked by assertions only).
//
// Returns the number of coefficients of the square, as reported by the last operation.
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Square(unsigned int n_max_result_size, TType *p_result,
	unsigned int n_e_size, const TType *p_e, TType *p_e_low, TType *p_e_high, unsigned int n_max_e_scaled_size, TType *p_e_scaled)
{
	_ASSERTE(p_result != p_e && p_e_low != p_result &&
		p_e_high != p_result && p_e_scaled != p_result); // p_e and p_result cannot be the same
	_ASSERTE(p_e != p_e_low && p_e != p_e_high && p_e != p_e_scaled &&
		p_e_low != p_e_high && p_e_low != p_e_scaled && p_e_high != p_e_scaled); // p_e must be in four different arrays
	_ASSERTE(n_max_result_size >= 2 * n_e_size * n_e_size);
	_ASSERTE(n_max_e_scaled_size >= 2 * n_e_size);
	_ASSERTE(n_e_size > 0);

	for(unsigned int n_coeff = 0; n_coeff < n_e_size; ++ n_coeff)
		Split(p_e_low[n_coeff], p_e_high[n_coeff], p_e[n_coeff]);
	// presplit all of p_e (not perfect for cache, but solves splitting the first element twice)

	unsigned int n_square_size = n_Scale_Expansion_Presplit(n_max_result_size, p_result, n_e_size,
		p_e, p_e_low, p_e_high, p_e[0], p_e_low[0], p_e_high[0]); // unnecessarily slow if n_e_size == 1, then a simple scale expansion would be faster
	// the first partial product goes directly to the result; the scale is
	// the first coefficient of p_e, its split is already available

	//DumpExpansion(n_square_size, p_result);

	for(unsigned int n_digit = 1; n_digit < n_e_size; ++ n_digit) {
		const unsigned int n_scaled_size = n_Scale_Expansion_Presplit(n_max_e_scaled_size, p_e_scaled,
			n_e_size, p_e, p_e_low, p_e_high, p_e[n_digit], p_e_low[n_digit], p_e_high[n_digit]);
		// scale p_e by its next coefficient, no splitting is needed anymore

		//DumpExpansion(n_scaled_size, p_e_scaled);

		n_square_size = n_Expansion_Sum(n_max_result_size, p_result, n_square_size, p_result, n_scaled_size, p_e_scaled);
		// accumulate the partial product in the result (the sum runs inplace)
	}

	return n_square_size;
}

// Calculates the square of the expansion p_e (n_e_size coefficients) and stores
// it in p_result, with the zero coefficients eliminated.
//
// All of p_e is split first (into p_e_low and p_e_high, n_e_size elements each),
// then p_e is scaled by each of its own coefficients in turn and the scaled expansions
// are accumulated in p_result using n_Expansion_Sum_EliminateZeroes2(). p_e_scaled
// (n_max_e_scaled_size >= 2 * n_e_size elements) holds one scaled copy of p_e at a time.
//
// All the arrays must be distinct, n_e_size must be nonzero and n_max_result_size
// must be at least 2 * n_e_size * n_e_size (checked by assertions only).
//
// Returns the number of coefficients of the square, as reported by the last operation.
template <class T>
inline unsigned int CExactBase<T>::n_Expansion_Square_EliminateZeroes(unsigned int n_max_result_size, TType *p_result,
	unsigned int n_e_size, const TType *p_e, TType *p_e_low, TType *p_e_high, unsigned int n_max_e_scaled_size, TType *p_e_scaled)
{
	_ASSERTE(p_result != p_e && p_e_low != p_result &&
		p_e_high != p_result && p_e_scaled != p_result); // p_e and p_result cannot be the same
	_ASSERTE(p_e != p_e_low && p_e != p_e_high && p_e != p_e_scaled &&
		p_e_low != p_e_high && p_e_low != p_e_scaled && p_e_high != p_e_scaled); // p_e must be in four different arrays
	_ASSERTE(n_max_result_size >= 2 * n_e_size * n_e_size);
	_ASSERTE(n_max_e_scaled_size >= 2 * n_e_size);
	_ASSERTE(n_e_size > 0);

	for(unsigned int n_coeff = 0; n_coeff < n_e_size; ++ n_coeff)
		Split(p_e_low[n_coeff], p_e_high[n_coeff], p_e[n_coeff]);
	// presplit all of p_e (not perfect for cache, but solves splitting the first element twice)

	unsigned int n_square_size = n_Scale_Expansion_Presplit(n_max_result_size, p_result,
		n_e_size, p_e, p_e_low, p_e_high, p_e[0], p_e_low[0], p_e_high[0]); // unnecessarily slow if n_e_size == 1, then a simple scale expansion would be faster
	// the first partial product goes directly to the result; the scale is
	// the first coefficient of p_e, its split is already available

	if(n_e_size == 1)
		return n_EliminateZeroes(n_square_size, p_result); // overkill
	// with a single coefficient there is no eliminating addition below, the zeros
	// need to be eliminated explicitly
	// todo - optimize this sometime; the result only has two coefficients here,
	// a specialized elimination would be cheaper than the generic one

	for(unsigned int n_digit = 1; n_digit < n_e_size; ++ n_digit) {
		const unsigned int n_scaled_size = n_Scale_Expansion_Presplit(n_max_e_scaled_size, p_e_scaled,
			n_e_size, p_e, p_e_low, p_e_high, p_e[n_digit], p_e_low[n_digit], p_e_high[n_digit]);
		// scale p_e by its next coefficient, no splitting is needed anymore

		n_square_size = n_Expansion_Sum_EliminateZeroes2(n_max_result_size, p_result,
			n_square_size, p_result, n_scaled_size, p_e_scaled);
		// accumulate the partial product in the result (the sum runs inplace)
	}

	return n_square_size;
}

// Removes the zero coefficients from the expansion p_e (n_e_size coefficients), inplace.
// The order of the nonzero coefficients is preserved.
//
// Returns the new number of coefficients. This is never less than one: if there
// are no nonzero coefficients then 1 is returned (and the first coefficient,
// if there is any, is a zero that was left in place).
template <class T>
inline unsigned int CExactBase<T>::n_EliminateZeroes(unsigned int n_e_size, TType *p_e)
{
	unsigned int n_dest = 0; // position to write the next nonzero
	for(; n_dest < n_e_size; ++ n_dest) {
		if(p_e[n_dest] == T(0))
			break;
	}
	// skip the leading nonzeroes, those are already in place

	for(unsigned int n_src = n_dest + 1; n_src < n_e_size; ++ n_src) { // the element at n_dest is known to be zero, start one past it
		const TType f_coeff = p_e[n_src];
		if(f_coeff == T(0))
			continue;
		p_e[n_dest ++] = f_coeff;
	}
	// move the remaining nonzeroes towards the beginning

	return (n_dest > 0)? n_dest : 1U; // never return an empty expansion
}

// t_odo - n_Compress()
// t_odo - f_EstimateValue() // or approximate? see the dictionary // did, they are synonyms
// t_odo - start plugging it all in the templates

// t_odo - see how much approximate is the division
// t_odo - fix the (lo, hi) order of arguments
// t_odo - see if these functions can go in the first .inl

// Compresses the expansion p_e (n_e_size coefficients) to p_result and returns
// the number of coefficients written.
//
// The coefficients are first summed from the last one towards the first one, the partial
// results being stored at the end of p_result (growing downwards from n_max_result_size).
// Then those are summed in the opposite direction and the nonzero terms are stored
// from the beginning of p_result. If a NaN is encountered, a single NaN coefficient
// is written and 1 is returned.
//
// n_max_result_size is the number of elements that can be written to p_result; it is asserted
// to be at least the smaller of n_e_size and CMaxExpansionSize<T>::n_longest_expansion_size.
// n_e_size must be nonzero (asserted).
template <class T>
inline unsigned int CExactBase<T>::n_Compress(unsigned int UNUSED(n_max_result_size), TType *p_result, unsigned int n_e_size, const TType *p_e) // p_e and p_result may be the same
{
	enum {
		n_max_compressed_size = CMaxExpansionSize<T>::n_longest_expansion_size /**< @brief maximum size, given the type */
	};
	//const bool b_could_overrun = n_e_size > n_max_compressed_size; // unused
	// if the expansion is invalid (overlapping) or if n_max_compressed_size is not calculated correctly

	_ASSERTE(n_max_result_size >= min((unsigned int)n_max_compressed_size, n_e_size));
	_ASSERTE(n_e_size > 0);

#if 1
	// a slightly modified algorithm, designed not to overrun the destination array
	// in case it is shorter than n_e_size but sufficient for n_max_compressed_size

	unsigned int n_bottom = n_max_result_size; // index of the lowest element written by the first pass (one past the end if none)
	TType Q = p_e[n_e_size - 1]; // the last one
	if(n_max_result_size < n_e_size) {
		for(unsigned int i = n_e_size - 1; i > 0;) {
			-- i; // here
			TType f_e_cur = p_e[i], q;
			TInexactType Qnew;
			Fast_Two_Sum(q, Qnew, Q, f_e_cur); // can use the fast sum, the operands are ordered
			if(q != TType(0)) {
				if(Qnew != Qnew) { // detect NaN; if n_max_result_size < n_e_size and p_e contains a NaN, it could fill up to n_e_size elements in p_result, causing an out of bounds access
					p_result[0] = Qnew;
					return 1;
				}
				_ASSERTE(n_bottom > 0); // make sure we won't do out-of-bounds access in case n_e_size exceeds n_max_compressed_size
				p_result[-- n_bottom] = Qnew;
				Q = q;
			} else
				Q = Qnew; // q is zero, only Qnew remains
		}
		// the branch with NaN detection, taken when p_result is shorter than p_e
	} else {
		for(unsigned int i = n_e_size - 1; i > 0;) {
			-- i; // here
			TType f_e_cur = p_e[i], q;
			TInexactType Qnew;
			Fast_Two_Sum(q, Qnew, Q, f_e_cur); // can use the fast sum, the operands are ordered
			if(q != TType(0)) {
				_ASSERTE(n_bottom > 0); // make sure we won't do out-of-bounds access in case n_e_size exceeds n_max_compressed_size
				p_result[-- n_bottom] = Qnew;
				Q = q;
			} else
				Q = Qnew; // q is zero, only Qnew remains
		}
		// a slightly faster branch without NaN detection when all of p_e always fits in p_result
	}
	// sum e backwards, leave the most significant coefficient in Q (unwritten to p_result),
	// n_bottom points to the last written coefficient in p_result (the one with the lowest index)

	if(Q != Q) {
		p_result[0] = Q;
		return 1;
	}
	// detect NaN

	unsigned int n_top = 0;
	for(unsigned int i = n_bottom; i < n_max_result_size; ++ i) {
		_ASSERTE(i < n_max_result_size); // we can exceed n_max_compressed_size here, as we were writing p_result from the end, but we still can't exceed n_max_result_size
		TType hnow = p_result[i], q;
		Fast_Two_Sum(q, Q, hnow, Q); // can use the fast sum, the operands are ordered
		if(q != TType(0)) {
			_ASSERTE(n_top < n_max_compressed_size); // make sure we won't do out-of-bounds access in case n_e_size exceeds n_max_compressed_size
			p_result[n_top] = q;
			++ n_top;
		}
	}
	// sum result forward, shift out any zeros left at the beginning by the previous loop,
	// leave the least significant coefficient in Q (again unwritten to p_result),
	// n_top points one past the last element that was written to p_result

	_ASSERTE(n_top < n_max_compressed_size); // make sure we won't do out-of-bounds access in case n_e_size exceeds n_max_compressed_size
	p_result[n_top] = Q;
#else // 1
	// the original algorithm by Shewchuk: won't work if n_e_size > n_max_compressed_size >= n_max_result_size
	// (this branch is disabled by the preprocessor and is kept for reference)

	unsigned int n_bottom = n_e_size;
	TType Q = p_e[-- n_bottom];
	for(unsigned int i = n_e_size - 1; i > 0;) {
		-- i; // here
		TType f_e_cur = p_e[i], q;
		TInexactType Qnew;
		Fast_Two_Sum(q, Qnew, Q, f_e_cur);
		if(q != TType(0)) {
			_ASSERTE(n_bottom < n_max_compressed_size); // make sure we won't do out-of-bounds access in case n_e_size exceeds n_max_compressed_size
			p_result[n_bottom] = Qnew;
			-- n_bottom;
			Q = q;
		} else
			Q = Qnew;
	}
	unsigned int n_top = 0;
	for(unsigned int i = n_bottom + 1; i < n_e_size; ++ i) {
		_ASSERTE(i < n_max_compressed_size); // make sure we won't do out-of-bounds access in case n_e_size exceeds n_max_compressed_size
		TType hnow = p_result[i], q;
		Fast_Two_Sum(q, Q, hnow, Q);
		if(q != TType(0)) {
			_ASSERTE(n_top < n_max_compressed_size); // make sure we won't do out-of-bounds access in case n_e_size exceeds n_max_compressed_size
			p_result[n_top] = q;
			++ n_top;
		}
	}
	_ASSERTE(n_top < n_max_compressed_size); // make sure we won't do out-of-bounds access in case n_e_size exceeds n_max_compressed_size
	p_result[n_top] = Q;
#endif // 1

	//_ASSERTE(n_top + 1 <= n_max_compressed_size); // not needed anymore, because of the assertion above
	// make sure that the expansion does not exceed the maximum projected size

	return n_top + 1; // n_top counts the coefficients written by the forward pass, plus one for the final Q
}

// Calculates an approximate value of the expansion p_e (n_e_size coefficients)
// by adding the coefficients up in a single scalar, starting with the first one.
// n_e_size must be nonzero (asserted).
template <class T>
inline typename CExactBase<T>::TInexactType CExactBase<T>::f_ApproximateValue(unsigned int n_e_size, const TType *p_e)
{
	_ASSERTE(n_e_size > 0);

	const TType *p_coeff = p_e;
	const TType *const p_end = p_e + n_e_size;
	TType f_sum = *p_coeff; // start with the first coefficient
	while(++ p_coeff < p_end)
		f_sum += *p_coeff; // add the rest, in the order of increasing index
	return f_sum;
}

// Checks whether p_e (n_e_size coefficients) is a well-formed expansion and returns
// the result (true if all the checks pass). The checks are the same as the assertions
// in ExpansionCheck(), but this function runs in all builds and does not assert.
//
// The checks are:
//	* n_e_size is nonzero
//	* there are no NaN coefficients, unless b_allow_NaNs is set
//	* the nonzero, non-NaN coefficients have strictly increasing absolute values
//	* mantissas of consecutive nonzero, non-NaN coefficients do not overlap
//	  (with an exception for denormals, see below)
//
// Zero coefficients (and NaNs, if allowed) are skipped and do not take part
// in the ordering and overlap checks.
template <class T>
bool CExactBase<T>::b_ExpansionCheck(unsigned int n_e_size, const TType *p_e, bool b_allow_NaNs /*= false*/) // unlike ExpansionCheck(), this performs the check in all builds
{
	if(!(n_e_size > 0))
		return false;
	// the size should always be nonzero

	unsigned int n_prev = 0;
	for(; n_prev < n_e_size; ++ n_prev) {
		if(!(p_e[n_prev] == p_e[n_prev] || b_allow_NaNs))
			return false; // check for NaNs
		if(p_e[n_prev] != TType(0) && p_e[n_prev] == p_e[n_prev])
			break;
		// a NaN compares as not equal to zero, it needs to be excluded explicitly; otherwise it would
		// become the reference coefficient and the ordering test below would fail for any finite
		// coefficient which follows it (while the loop below skips the NaNs which are not leading)
	}
	// find the first nonzero and non-NaN

	for(unsigned int i = n_prev + 1; i < n_e_size; ++ i) {
		if(!(p_e[i] == p_e[i] || b_allow_NaNs))
			return false; // check for NaNs
		if(p_e[i] == TType(0) || p_e[i] != p_e[i])
			continue;
		// skip spurious zeroes and NaNs

		if(!(fabs(p_e[i]) > fabs(p_e[n_prev])))
			return false; // absolute value
		// ascending order

		//_ASSERTE(fabs(CFloatUtils<TType>::f_ULP(p_e[i])) > fabs(p_e[n_prev])); // this is not really the test, the exponents can be similar but the one bits must not overlap, so either it is smaller than ulp and it is ok, or if shifted to the same exponenet range, there will be no bits at the positions of the lowest bit of the next or above // t_odo - write that test
		if(fabs(CFloatUtils<TType>::f_ULP(p_e[i])) <= fabs(p_e[n_prev])) { // the numbers overlap
			typename CFloatUtils<TType>::TIntType n_mantissa_i, n_mantissa_prev; // dependent type, the typename keyword is required here
			n_mantissa_i = CFloatUtils<TType>::n_Get_Mantissa(p_e[i]);
			n_mantissa_prev = CFloatUtils<TType>::n_Get_Mantissa(p_e[n_prev]);
			// get mantissas

			int n_prev_exponent = CFloatUtils<TType>::n_Get_Exponent(p_e[n_prev]);
			int n_cur_exponent = CFloatUtils<TType>::n_Get_Exponent(p_e[i]);
			int n_exponent_difference = n_cur_exponent - n_prev_exponent;
			if(!(n_exponent_difference >= 0))
				return false;
			// calculate exponent difference

			bool b_prev_is_denormal = n_prev_exponent == CFloatUtils<TType>::n_exponent_special_low;
			bool b_cur_is_denormal = n_cur_exponent == CFloatUtils<TType>::n_exponent_special_low;
			if(!(!b_cur_is_denormal || b_prev_is_denormal))
				return false; // if the current is denormal, then the previous must be as well (they are sorted by size)
			// detect denormals

			n_mantissa_prev >>= n_exponent_difference;
			if(!(n_mantissa_prev || b_prev_is_denormal))
				return false; // should be nonzero, as they are in the same exponent range
			n_mantissa_prev = n_RightFill_Ones(n_mantissa_prev);
			if(!(!(n_mantissa_i & n_mantissa_prev) || // make sure that the mantissas do not overlap
			   (b_cur_is_denormal && n_mantissa_prev == 1)))
				return false; // except if maybe both of the numbers are denormals and the mantissa of the previous is only a single 1 bit, probably set by the rounding (happens on AMD)
			// test mantissas for overlap
		}
		// (weak) non-overlapping property

		n_prev = i;
	}
	// make sure that the expansion is sorted and nonoverlapping

	return true;
}

// Asserts that p_e (n_e_size coefficients) is a well-formed expansion. The body is
// only compiled if _DEBUG is defined, otherwise this function does nothing.
//
// The assertions are:
//	* n_e_size is nonzero
//	* there are no NaN coefficients, unless b_allow_NaNs is set (the default noted
//	  in the signature below is false, i.e. NaNs are not allowed)
//	* the nonzero, non-NaN coefficients have strictly increasing absolute values
//	* mantissas of consecutive nonzero, non-NaN coefficients do not overlap
//	  (with an exception for denormals, see below)
//
// Zero coefficients (and NaNs, if allowed) are skipped and do not take part
// in the ordering and overlap checks.
template <class T>
void CExactBase<T>::ExpansionCheck(unsigned int n_e_size, const TType *p_e, bool b_allow_NaNs /*= false*/) // this only performs the check in debug
{
#ifdef _DEBUG
	_ASSERTE(n_e_size > 0);
	// the size should always be nonzero

	unsigned int n_prev = 0;
	for(; n_prev < n_e_size; ++ n_prev) {
		_ASSERTE(p_e[n_prev] == p_e[n_prev] || b_allow_NaNs); // check for NaNs
		if(p_e[n_prev] != TType(0) && p_e[n_prev] == p_e[n_prev])
			break;
		// a NaN compares as not equal to zero, it needs to be excluded explicitly; otherwise it would
		// become the reference coefficient and the ordering assertion below would fail for any finite
		// coefficient which follows it (while the loop below skips the NaNs which are not leading)
	}
	// find the first nonzero and non-NaN

	for(unsigned int i = n_prev + 1; i < n_e_size; ++ i) {
		_ASSERTE(p_e[i] == p_e[i] || b_allow_NaNs); // check for NaNs
		if(p_e[i] == TType(0) || p_e[i] != p_e[i])
			continue;
		// skip spurious zeroes and NaNs

		_ASSERTE(fabs(p_e[i]) > fabs(p_e[n_prev])); // absolute value
		// ascending order

		//_ASSERTE(fabs(CFloatUtils<TType>::f_ULP(p_e[i])) > fabs(p_e[n_prev])); // this is not really the test, the exponents can be similar but the one bits must not overlap, so either it is smaller than ulp and it is ok, or if shifted to the same exponenet range, there will be no bits at the positions of the lowest bit of the next or above // t_odo - write that test
		if(fabs(CFloatUtils<TType>::f_ULP(p_e[i])) <= fabs(p_e[n_prev])) { // the numbers overlap
			typename CFloatUtils<TType>::TIntType n_mantissa_i, n_mantissa_prev; // dependent type, the typename keyword is required here
			n_mantissa_i = CFloatUtils<TType>::n_Get_Mantissa(p_e[i]);
			n_mantissa_prev = CFloatUtils<TType>::n_Get_Mantissa(p_e[n_prev]);
			// get mantissas

			int n_prev_exponent = CFloatUtils<TType>::n_Get_Exponent(p_e[n_prev]);
			int n_cur_exponent = CFloatUtils<TType>::n_Get_Exponent(p_e[i]);
			int n_exponent_difference = n_cur_exponent - n_prev_exponent;
			_ASSERTE(n_exponent_difference >= 0);
			// calculate exponent difference

			bool b_prev_is_denormal = n_prev_exponent == CFloatUtils<TType>::n_exponent_special_low;
			bool b_cur_is_denormal = n_cur_exponent == CFloatUtils<TType>::n_exponent_special_low;
			_ASSERTE(!b_cur_is_denormal || b_prev_is_denormal); // if the current is denormal, then the previous must be as well (they are sorted by size)
			// detect denormals

			n_mantissa_prev >>= n_exponent_difference;
			_ASSERTE(n_mantissa_prev || b_prev_is_denormal); // should be nonzero, as they are in the same exponent range
			n_mantissa_prev = n_RightFill_Ones(n_mantissa_prev);
			_ASSERTE(!(n_mantissa_i & n_mantissa_prev) || // make sure that the mantissas do not overlap
				(b_cur_is_denormal && n_mantissa_prev == 1)); // except if maybe both of the numbers are denormals and the mantissa of the previous is only a single 1 bit, probably set by the rounding (happens on AMD)
			// test mantissas for overlap
		}
		// (weak) non-overlapping property

		n_prev = i;
	}
	// make sure that the expansion is sorted and nonoverlapping
#endif // _DEBUG
}

/*
 *								=== ~CExactBase ===
 */

// the expansion class template is only declared here, so that the helpers below can refer
// to it; note that this declaration specifies the default of the last template argument
template <class T, const unsigned int _n_max_coeff_num, const bool _b_allow_dynamic = false>
class CExpansion; // forward declaration

/**
 *	@brief internal and helper object implementation
 */
namespace expansion_internal {

// empty tag types, they carry no data and only differ by their type
// (presumably used to select between overloads, e.g. of CExpansion constructors - confirm at the use sites)
struct TFromScalar_Tag {};
struct TFromScalar_ZeroPad_Tag {};
struct TFromArray_Tag {};
struct TFromArray_ZeroPad_Tag {};
struct TFromTwoSum_Tag {};
struct TFromTwoDiff_Tag {};
struct TFromTwoProd_Tag {};
struct TFromScalarSquare_Tag {};
struct TUninitializedConstruct_Tag {};
// keep those in expansion_internal to have expansion as empty as possible

extern const TUninitializedConstruct_Tag uninitialized_construct; // only declared here, the definition is not in this part of the file

// Size of an expansion which is known at compile time (n_max_coeff_num).
// Objects of this class store nothing: the conversion to unsigned int always yields
// n_max_coeff_num, and constructing from or assigning a number only asserts that
// the number equals n_max_coeff_num. The variable size is handled by
// the CExpansionSize<-1> specialization.
template <const int n_max_coeff_num>
class CExpansionSize {
public:
	// the only permitted size is the compile-time one (checked in debug)
	inline CExpansionSize(unsigned int n_size = n_max_coeff_num)
	{
		_ASSERTE(n_size == (unsigned int)n_max_coeff_num);
	}

	// copying is a no-op, both sizes are the same by definition
	inline CExpansionSize(const CExpansionSize<n_max_coeff_num> UNUSED(&r_t_size))
	{}

	// the only permitted size is the compile-time one (checked in debug)
	inline CExpansionSize<n_max_coeff_num> &operator =(unsigned int n_size)
	{
		_ASSERTE(n_size == (unsigned int)n_max_coeff_num);
		return *this;
	}

	// copying is a no-op, both sizes are the same by definition
	inline CExpansionSize<n_max_coeff_num> &operator =(const CExpansionSize<n_max_coeff_num> UNUSED(&r_t_size))
	{
		return *this;
	}

	// reads the size, which is the compile-time constant
	inline operator unsigned int() const
	{
		return (unsigned int)n_max_coeff_num;
	}
};

// Specialization of the expansion size for -1: the size is not a compile-time
// constant, it is stored in the object and can be changed by assignment.
template <>
class CExpansionSize<-1> {
protected:
	int m_n_size; // the current size (stored as signed, converted on access)

public:
	// the size defaults to zero
	inline CExpansionSize(unsigned int n_size = 0)
		:m_n_size(int(n_size))
	{}

	// stores a new size
	inline CExpansionSize<-1> &operator =(unsigned int n_size)
	{
		m_n_size = int(n_size);
		return *this;
	}

	// reads the stored size
	inline operator unsigned int() const
	{
		return (unsigned int)m_n_size;
	}
};

// Compile-time assertion helper. The primary template (used if the expression is true)
// declares a type for each of the error conditions; the specialization for false
// below does not declare them. Code which names one of these types through
// CStaticAssert<condition> therefore only compiles if the condition holds, and the name
// of the missing type shows up in the compiler error message.
template <const bool b_expression>
class CStaticAssert {
public:
	// expansions of length 0 are not permitted
	typedef void ILLEGAL_EMPTY_EXPANSION;

	// the result does not fit in the expansion
	typedef void EXPANSION_TOO_SHORT;

	// the result does fits in the expansion but the expansion is longer and memory / computation will be wasted
	//typedef void EXPANSION_TOO_LONG;

	// this often happens when using scalars: all the scalars need to be of the same type
	// as the expansion (there is risk of losing precision / creating overlapping
	// expansions, that is why everything is typed so strongly)
	typedef void MIXING_EXPANSIONS_OF_DIFFERENT_DATA_TYPES;

	// when using ::t_Trimmed() with larger size than the maximum size of the source
	typedef void USELESS_TRIM_PERFORMED;

	// conditions which indicate an error in the implementation rather than in the user code
	class CInternalError {
	public:
		// if this triggers, then the implementation of the operation, from which this is compiled, is wrong
		typedef void COMMUTATIVE_OPERANDS_NOT_SORTED_BY_SIZE;

		// if this triggers, then the formula in CSquareCoeffNum is wrong
		typedef void SQUARE_EXPANSION_SIZE_NOT_CONSISTENT;
	};
};

// the failing case: none of the error types are declared here (CInternalError exists
// but it is empty), so referring to any of them is a compile error
template <>
class CStaticAssert<false> {
public:
	class CInternalError {};
};

// Selects the algorithm for adding two expansions, based on their (maximum) sizes
// n_left_coeff_num and n_right_coeff_num. The result has at most
// n_left_coeff_num + n_right_coeff_num coefficients (n_max_coeff_num).
//
// If b_left_or_right_allows_dynamic is set, the actual lengths are passed at runtime
// and the zero-eliminating variants of the CExactBase functions are called; otherwise
// the lengths must equal the compile-time sizes and the plain variants are called.
// If the right operand has a single coefficient, the grow-expansion functions are used
// instead of the expansion sum.
//
// The left operand must not be shorter than the right one, otherwise this does not compile.
template <const int n_left_coeff_num, const int n_right_coeff_num,
	const bool b_left_or_right_allows_dynamic/*, const bool b_null_elimination*/> // no explicit null elimination; if dynamic then eliminate
class CAdditionAlgorithm {
public:
	// the general case: the right operand is an expansion, passed as an array
	// (NOTE(review): MSVC_OMMIT_ARG / MSVC_OMMIT are defined elsewhere; they apparently supply
	// a dummy template argument on some compilers, confirm against their definition)
	template <const int _n_right_coeff_num/*,
		const bool _b_null_elimination*/ MSVC_OMMIT_ARG(class GppDummy)> // no explicit null elimination; if dynamic then eliminate
	class CRightSpecialization {
	public:
		enum {
			n_max_coeff_num = n_left_coeff_num + _n_right_coeff_num,
			b_allow_dynamic = b_left_or_right_allows_dynamic //|| _b_null_elimination
		};

		// adds p_right to p_left and stores the sum in p_dest, returns what the called
		// CExactBase function returns (the length of the result)
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T *p_right)
		{
			if(b_left_or_right_allows_dynamic) { // compile-time constant
				_ASSERTE(n_left_length <= n_left_coeff_num && n_right_length <= _n_right_coeff_num);
				return CExactBase<T>::n_Expansion_Sum_EliminateZeroes1(n_max_coeff_num, p_dest,
					n_left_length, p_left, n_right_length, p_right);
				// dynamic size, need loops
			} else {
				_ASSERTE(n_left_length == n_left_coeff_num && n_right_length == _n_right_coeff_num);
				return CExactBase<T>::n_Expansion_Sum(n_max_coeff_num, p_dest,
					n_left_coeff_num, p_left, _n_right_coeff_num, p_right);
				// can unroll loops here
			}
		}

		/*template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T f_right) // is this ever needed?
		{
			_ASSERTE(n_right_length == 1);
			return n_Run(p_dest, n_left_length, p_left, 1, &f_right); // handle scalars
		}*/
	};

	// the special case: the right operand has a single coefficient, it can be passed
	// either by value or as an array of length one
	template <MSVC_OMMIT(class GppDummy)>
	class CRightSpecialization<1/*, false*/ MSVC_OMMIT_ARG(GppDummy)> { // handle scalar addition
	public:
		enum {
			n_max_coeff_num = n_left_coeff_num + 1,
			b_allow_dynamic = b_left_or_right_allows_dynamic
		};

		// adds the scalar f_right to p_left and stores the sum in p_dest
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T f_right)
		{
			_ASSERTE(n_right_length == 1);
			if(b_left_or_right_allows_dynamic) // compile-time constant
				return CExactBase<T>::n_Grow_Expansion_EliminateZeroes(n_max_coeff_num, p_dest, n_left_length, p_left, f_right);
			else
				return CExactBase<T>::n_Grow_Expansion(n_max_coeff_num, p_dest, n_left_coeff_num, p_left, f_right);
		}

		// adds the only coefficient of p_right to p_left and stores the sum in p_dest
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T *p_right) // required in case someone uses expansions of length 1
		{
			_ASSERTE(n_right_length == 1);
			if(b_left_or_right_allows_dynamic) // compile-time constant
				return CExactBase<T>::n_Grow_Expansion_EliminateZeroes(n_max_coeff_num, p_dest, n_left_length, p_left, *p_right);
			else
				return CExactBase<T>::n_Grow_Expansion(n_max_coeff_num, p_dest, n_left_coeff_num, p_left, *p_right);
		}
	};

	/*template <MSVC_OMMIT(class GppDummy)>
	class CRightSpecialization<1, true MSVC_OMMIT_ARG(GppDummy)> { // handle scalar addition with zero elimination
	public:
		enum {
			n_max_coeff_num = n_left_coeff_num + 1,
			b_allow_dynamic = b_left_or_right_allows_dynamic
		};

		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T f_right)
		{
			if(b_left_or_right_allows_dynamic) { // compile-time constant
				return CExactBase<T>::n_Grow_Expansion_EliminateZeroes(n_max_coeff_num,
					p_dest, n_left_length, p_left, f_right);
			} else {
				return CExactBase<T>::n_Grow_Expansion_EliminateZeroes(n_max_coeff_num,
					p_dest, n_left_coeff_num, p_left, f_right);
			}
		}
	};*/

protected:
	// the implementation selected by the size of the right operand
	typedef CRightSpecialization<n_right_coeff_num/*, b_null_elimination*/
		MSVC_OMMIT_ARG(void)> TRightSpecialized;

	enum {
		b_sorted = n_left_coeff_num >= n_right_coeff_num
	};

	// compile-time assertion: CStaticAssert<false>::CInternalError does not declare this type,
	// so the typedef fails to compile if the left operand is shorter than the right one
	typedef typename CStaticAssert<b_sorted>::CInternalError::COMMUTATIVE_OPERANDS_NOT_SORTED_BY_SIZE CAssert0;

public:
	enum {
		n_max_coeff_num = TRightSpecialized::n_max_coeff_num,
		b_allow_dynamic = TRightSpecialized::b_allow_dynamic
	};

	// adds the expansion p_right to p_left and stores the sum in p_dest; the lengths
	// must be nonzero and not greater than the compile-time sizes if dynamic,
	// or equal to the compile-time sizes otherwise (asserted)
	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
		unsigned int n_right_length, const T *p_right)
	{
		_ASSERTE(!b_left_or_right_allows_dynamic || (n_left_length > 0 && n_left_length <=
			n_left_coeff_num && n_right_length > 0 && n_right_length <= n_right_coeff_num));
		_ASSERTE(b_left_or_right_allows_dynamic || (n_left_length == n_left_coeff_num &&
			n_right_length == n_right_coeff_num));
		return TRightSpecialized::n_Run(p_dest, n_left_length, p_left, n_right_length, p_right);
	}

	// adds the scalar f_right to p_left and stores the sum in p_dest; n_right_length
	// must be 1 (asserted), the same conditions as above apply to n_left_length
	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
		unsigned int n_right_length, const T f_right) // handle scalars
	{
		_ASSERTE(!b_left_or_right_allows_dynamic || (n_left_length > 0 && n_left_length <=
			n_left_coeff_num && n_right_length > 0 && n_right_length <= n_right_coeff_num));
		_ASSERTE(b_left_or_right_allows_dynamic || (n_left_length == n_left_coeff_num &&
			n_right_length == n_right_coeff_num));
		_ASSERTE(n_right_length == 1);
		return TRightSpecialized::n_Run(p_dest, n_left_length, p_left, 1, f_right); // handle scalars
	} // is this even required? it is, but the traits for scalars could be easily rewritten to return a pointer, and all this would be easier, inlining would potentially optimize the pointer away // todo - benchmark, test
};

/*template <>
class CAdditionAlgorithm<2, 2, false/ *, false* /> { // concept test only
public:
	enum {
		n_left_coeff_num = 2,
		n_right_coeff_num = 2,

		n_max_coeff_num = n_left_coeff_num + n_right_coeff_num,
		b_allow_dynamic = false
	};

	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
		unsigned int n_right_length, const T *p_right)
	{
		printf("using optimized addition\n");
		if(b_allow_dynamic) { // compile-time constant
			return CExactBase<T>::n_Expansion_Sum(n_max_coeff_num, p_dest,
				n_left_length, p_left, n_right_length, p_right);
			// dynamic size, need loops
		} else {
			_ASSERTE(n_left_length == n_left_coeff_num && n_right_length == n_right_coeff_num);
			return CExactBase<T>::n_Expansion_Sum(n_max_coeff_num, p_dest,
				n_left_coeff_num, p_left, n_right_coeff_num, p_right);
			// can unroll loops here
		}
	}
};*/

/**
 *	@brief compile-time selection of the subtraction routine for a pair of operands
 *
 *	@tparam n_left_coeff_num is maximum number of coefficients of the left operand
 *	@tparam n_right_coeff_num is maximum number of coefficients of the right operand
 *	@tparam b_left_or_right_allows_dynamic is dynamic size flag (if set, the operands
 *		may be shorter than their maxima and the zero-eliminating routines are used)
 *
 *	@note Unlike in the commutative operations, the operands are not required to be
 *		sorted by size here; the operands of a subtraction can not be swapped.
 */
template <const int n_left_coeff_num, const int n_right_coeff_num,
	const bool b_left_or_right_allows_dynamic>
class CSubtractionAlgorithm {
public:
	/**
	 *	@brief general case; subtracts an expansion from an expansion
	 *
	 *	@tparam _n_right_coeff_num is maximum number of coefficients of the right operand
	 *	@tparam GppDummy is compatibility workaround for g++ (full specialization inside a template)
	 */
	template <const int _n_right_coeff_num MSVC_OMMIT_ARG(class GppDummy)>
	class CRightSpecialization {
	public:
		enum {
			n_max_coeff_num = n_left_coeff_num + _n_right_coeff_num, /**< @brief maximum size of the result */
			b_allow_dynamic = b_left_or_right_allows_dynamic /**< @brief dynamic size flag of the result */
		};

		/**
		 *	@brief subtracts the right expansion from the left one
		 *	@return Returns the result of the CExactBase difference routine.
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T *p_right)
		{
			if(!b_left_or_right_allows_dynamic) { // compile-time constant
				_ASSERTE(n_left_length == n_left_coeff_num && n_right_length == _n_right_coeff_num);
				return CExactBase<T>::n_Expansion_Difference(n_max_coeff_num, p_dest,
					n_left_coeff_num, p_left, _n_right_coeff_num, p_right);
				// the sizes are compile-time constants, loops can be unrolled
			}

			_ASSERTE(n_left_length <= n_left_coeff_num && n_right_length <= _n_right_coeff_num);
			return CExactBase<T>::n_Expansion_Difference_EliminateZeroes1(n_max_coeff_num, p_dest,
				n_left_length, p_left, n_right_length, p_right);
			// the sizes are only known at runtime, loops are needed
		}
	};

	/**
	 *	@brief specialization for a right operand with a single coefficient
	 *		(scalar subtraction); implemented as growing the left expansion
	 *		by the negated right operand
	 *
	 *	@tparam GppDummy is compatibility workaround for g++ (full specialization inside a template)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	class CRightSpecialization<1/*, false*/ MSVC_OMMIT_ARG(GppDummy)> {
	public:
		enum {
			n_max_coeff_num = n_left_coeff_num + 1, /**< @brief maximum size of the result */
			b_allow_dynamic = b_left_or_right_allows_dynamic /**< @brief dynamic size flag of the result */
		};

		/**
		 *	@brief subtracts a scalar, passed by value, from the left expansion
		 *	@return Returns the result of the CExactBase grow-expansion routine.
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T f_right)
		{
			_ASSERTE(n_right_length == 1);

			const T f_negated = -f_right;
			// subtraction is addition of the negated value

			if(b_left_or_right_allows_dynamic) { // compile-time constant
				return CExactBase<T>::n_Grow_Expansion_EliminateZeroes(n_max_coeff_num,
					p_dest, n_left_length, p_left, f_negated);
			}
			return CExactBase<T>::n_Grow_Expansion(n_max_coeff_num,
				p_dest, n_left_coeff_num, p_left, f_negated);
		}

		/**
		 *	@brief subtracts an expansion of length one from the left expansion
		 *	@note This is required in case someone uses expansions of length 1.
		 *	@return Returns the result of the CExactBase grow-expansion routine.
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T *p_right)
		{
			_ASSERTE(n_right_length == 1);

			const T f_negated = -*p_right;
			// subtraction is addition of the negated value

			if(b_left_or_right_allows_dynamic) { // compile-time constant
				return CExactBase<T>::n_Grow_Expansion_EliminateZeroes(n_max_coeff_num,
					p_dest, n_left_length, p_left, f_negated);
			}
			return CExactBase<T>::n_Grow_Expansion(n_max_coeff_num,
				p_dest, n_left_coeff_num, p_left, f_negated);
		}
	};

protected:
	typedef CRightSpecialization<n_right_coeff_num MSVC_OMMIT_ARG(void)> TRightSpecialized; /**< @brief implementation chosen by the size of the right operand */

	// note that there is no static assertion on the operands being
	// sorted by size here; can't sort operands in subtraction!

public:
	enum {
		n_max_coeff_num = TRightSpecialized::n_max_coeff_num, /**< @brief maximum size of the result */
		b_allow_dynamic = TRightSpecialized::b_allow_dynamic /**< @brief dynamic size flag of the result */
	};

	/**
	 *	@brief subtracts the right expansion from the left one
	 *
	 *	@tparam T is scalar type of the coefficients
	 *
	 *	@param[out] p_dest is pointer to the destination coefficient array
	 *	@param[in] n_left_length is number of coefficients of the left operand
	 *	@param[in] p_left is pointer to the coefficients of the left operand
	 *	@param[in] n_right_length is number of coefficients of the right operand
	 *	@param[in] p_right is pointer to the coefficients of the right operand
	 *
	 *	@return Returns the value returned by the selected implementation.
	 */
	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
		unsigned int n_right_length, const T *p_right)
	{
		_ASSERTE(!b_left_or_right_allows_dynamic || (n_left_length > 0 && n_left_length <=
			n_left_coeff_num && n_right_length > 0 && n_right_length <= n_right_coeff_num));
		_ASSERTE(b_left_or_right_allows_dynamic || (n_left_length == n_left_coeff_num &&
			n_right_length == n_right_coeff_num));
		// dynamic operands must fit the maxima, static operands must match them exactly

		return TRightSpecialized::n_Run(p_dest, n_left_length, p_left, n_right_length, p_right);
	}

	/**
	 *	@brief subtracts a scalar, passed by value, from the left expansion
	 *
	 *	@tparam T is scalar type of the coefficients
	 *
	 *	@param[out] p_dest is pointer to the destination coefficient array
	 *	@param[in] n_left_length is number of coefficients of the left operand
	 *	@param[in] p_left is pointer to the coefficients of the left operand
	 *	@param[in] n_right_length is number of coefficients of the right operand (must be 1)
	 *	@param[in] f_right is value of the right operand
	 *
	 *	@return Returns the value returned by the selected implementation.
	 *
	 *	@note This overload is needed because the traits return scalars by value; the
	 *		traits could be rewritten to return a pointer instead, which would make
	 *		this overload unnecessary (todo - benchmark, test).
	 */
	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
		unsigned int n_right_length, const T f_right)
	{
		_ASSERTE(!b_left_or_right_allows_dynamic || (n_left_length > 0 && n_left_length <=
			n_left_coeff_num && n_right_length > 0 && n_right_length <= n_right_coeff_num));
		_ASSERTE(b_left_or_right_allows_dynamic || (n_left_length == n_left_coeff_num &&
			n_right_length == n_right_coeff_num));
		// dynamic operands must fit the maxima, static operands must match them exactly

		_ASSERTE(n_right_length == 1);
		// a scalar is a single coefficient

		return TRightSpecialized::n_Run(p_dest, n_left_length, p_left, 1, f_right);
	}
};

/**
 *	@brief compile-time selection of the multiplication routine for a pair of operands
 *
 *	@tparam n_left_coeff_num is maximum number of coefficients of the left operand
 *	@tparam n_right_coeff_num is maximum number of coefficients of the right operand
 *	@tparam b_left_or_right_allows_dynamic is dynamic size flag (if set, the operands
 *		may be shorter than their maxima and the zero-eliminating routines are used)
 *
 *	@note The operands must be sorted by size (the left one at least as large
 *		as the right one); this is enforced by a static assertion.
 */
template <const int n_left_coeff_num, const int n_right_coeff_num,
	const bool b_left_or_right_allows_dynamic>
class CMultiplicationAlgorithm {
public:
	/**
	 *	@brief general case; multiplies an expansion by an expansion
	 *
	 *	@tparam _n_right_coeff_num is maximum number of coefficients of the right operand
	 *	@tparam GppDummy is compatibility workaround for g++ (full specialization inside a template)
	 */
	template <const int _n_right_coeff_num MSVC_OMMIT_ARG(class GppDummy)>
	class CRightSpecialization {
	public:
		enum {
			n_max_coeff_num = 2 * n_left_coeff_num * _n_right_coeff_num, /**< @brief maximum size of the result */
			b_allow_dynamic = b_left_or_right_allows_dynamic /**< @brief dynamic size flag of the result */
		};

		/**
		 *	@brief multiplies the left expansion by the right expansion
		 *	@return Returns the result of the CExactBase product routine.
		 *	@note There is no overload taking the right operand by value here; a scalar
		 *		has a single coefficient and is handled by the specialization below.
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T *p_right)
		{
			T p_right_hi[_n_right_coeff_num], p_right_lo[_n_right_coeff_num], p_right_sc[2 * _n_right_coeff_num];
			// temporary arrays required by the product routines, sized by the right operand

			if(b_left_or_right_allows_dynamic) { // compile-time constant
				_ASSERTE(n_left_length <= n_left_coeff_num && n_right_length <= _n_right_coeff_num);
				return CExactBase<T>::n_Expansion_Product_EliminateZeroes(n_max_coeff_num, p_dest, n_right_length, p_right,
					p_right_lo, p_right_hi, 2 * _n_right_coeff_num, p_right_sc, n_left_length, p_left);
				// dynamic size, need loops
			} else {
				_ASSERTE(n_left_length == n_left_coeff_num && n_right_length == _n_right_coeff_num);
				return CExactBase<T>::n_Expansion_Product(n_max_coeff_num, p_dest, _n_right_coeff_num, p_right,
					p_right_lo, p_right_hi, 2 * _n_right_coeff_num, p_right_sc, n_left_coeff_num, p_left);
				// can unroll loops here
			}
			// note that right and left is swapped in the calls, as right is ordered
			// to be smaller, and then we can do away with smaller temp arrays
		}
	};

	/**
	 *	@brief specialization for a right operand with a single coefficient
	 *		(scalar multiplication); implemented as scaling the left expansion
	 *
	 *	@tparam GppDummy is compatibility workaround for g++ (full specialization inside a template)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	class CRightSpecialization<1/*, false*/ MSVC_OMMIT_ARG(GppDummy)> {
	public:
		enum {
			n_max_coeff_num = n_left_coeff_num * 2, /**< @brief maximum size of the result */
			b_allow_dynamic = b_left_or_right_allows_dynamic /**< @brief dynamic size flag of the result */
		};

		/**
		 *	@brief multiplies the left expansion by a scalar passed by value
		 *	@return Returns the result of the CExactBase scale-expansion routine.
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T f_right)
		{
			_ASSERTE(n_right_length == 1); // same check as in the overload below
			if(b_left_or_right_allows_dynamic) // compile-time constant
				return CExactBase<T>::n_Scale_Expansion_EliminateZeroes(n_max_coeff_num, p_dest, n_left_length, p_left, f_right);
			else
				return CExactBase<T>::n_Scale_Expansion(n_max_coeff_num, p_dest, n_left_coeff_num, p_left, f_right);
		}

		/**
		 *	@brief multiplies the left expansion by an expansion of length one
		 *	@note This is required in case someone uses expansions of length 1.
		 *	@return Returns the result of the CExactBase scale-expansion routine.
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
			unsigned int n_right_length, const T *p_right)
		{
			_ASSERTE(n_right_length == 1);
			if(b_left_or_right_allows_dynamic) // compile-time constant
				return CExactBase<T>::n_Scale_Expansion_EliminateZeroes(n_max_coeff_num, p_dest, n_left_length, p_left, *p_right);
			else
				return CExactBase<T>::n_Scale_Expansion(n_max_coeff_num, p_dest, n_left_coeff_num, p_left, *p_right);
		}
	};

protected:
	typedef CRightSpecialization<n_right_coeff_num MSVC_OMMIT_ARG(void)> TRightSpecialized; /**< @brief implementation chosen by the size of the right operand */

	enum {
		b_sorted = n_left_coeff_num >= n_right_coeff_num /**< @brief operand order check result */
	};

	typedef typename CStaticAssert<b_sorted>::CInternalError::COMMUTATIVE_OPERANDS_NOT_SORTED_BY_SIZE CAssert0; /**< @brief compile-time check of the operand order */

public:
	enum {
		n_max_coeff_num = TRightSpecialized::n_max_coeff_num, /**< @brief maximum size of the result */
		b_allow_dynamic = TRightSpecialized::b_allow_dynamic /**< @brief dynamic size flag of the result */
	};

	/**
	 *	@brief multiplies the left expansion by the right expansion
	 *
	 *	@tparam T is scalar type of the coefficients
	 *
	 *	@param[out] p_dest is pointer to the destination coefficient array
	 *	@param[in] n_left_length is number of coefficients of the left operand
	 *	@param[in] p_left is pointer to the coefficients of the left operand
	 *	@param[in] n_right_length is number of coefficients of the right operand
	 *	@param[in] p_right is pointer to the coefficients of the right operand
	 *
	 *	@return Returns the value returned by the selected implementation.
	 */
	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
		unsigned int n_right_length, const T *p_right)
	{
		_ASSERTE(!b_left_or_right_allows_dynamic || (n_left_length > 0 && n_left_length <=
			n_left_coeff_num && n_right_length > 0 && n_right_length <= n_right_coeff_num));
		_ASSERTE(b_left_or_right_allows_dynamic || (n_left_length == n_left_coeff_num &&
			n_right_length == n_right_coeff_num));
		// dynamic operands must fit the maxima, static operands must match them exactly
		return TRightSpecialized::n_Run(p_dest, n_left_length, p_left, n_right_length, p_right);
	}

	/**
	 *	@brief multiplies the left expansion by a scalar passed by value
	 *
	 *	@tparam T is scalar type of the coefficients
	 *
	 *	@param[out] p_dest is pointer to the destination coefficient array
	 *	@param[in] n_left_length is number of coefficients of the left operand
	 *	@param[in] p_left is pointer to the coefficients of the left operand
	 *	@param[in] n_right_length is number of coefficients of the right operand (must be 1)
	 *	@param[in] f_right is value of the right operand
	 *
	 *	@return Returns the value returned by the selected implementation.
	 *
	 *	@note This overload is needed because the traits return scalars by value; the
	 *		traits could be rewritten to return a pointer instead, which would make
	 *		this overload unnecessary (todo - benchmark, test).
	 */
	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
		unsigned int n_right_length, const T f_right) // handle scalars
	{
		_ASSERTE(!b_left_or_right_allows_dynamic || (n_left_length > 0 && n_left_length <=
			n_left_coeff_num && n_right_length > 0 && n_right_length <= n_right_coeff_num));
		_ASSERTE(b_left_or_right_allows_dynamic || (n_left_length == n_left_coeff_num &&
			n_right_length == n_right_coeff_num));
		// dynamic operands must fit the maxima, static operands must match them exactly
		_ASSERTE(n_right_length == 1);
		return TRightSpecialized::n_Run(p_dest, n_left_length, p_left, 1, f_right); // handle scalars
	}
};

/**
 *	@brief calculates the maximum number of coefficients of a squared expansion
 *
 *	An expansion of two coefficients squares to six coefficients (there is
 *	an optimized routine for that), any other size squares to twice the
 *	square of the size.
 *
 *	@tparam n_coeff_num is maximum number of coefficients of the expansion being squared
 */
template <const int n_coeff_num>
class CSquareCoeffNum {
public:
	enum {
		n_result = (n_coeff_num != 2)? n_coeff_num * n_coeff_num * 2 : 6 /**< @brief maximum number of coefficients of the square */
	};
};

/**
 *	@brief compile-time selection of the squaring routine for an expansion
 *
 *	@tparam n_left_coeff_num is maximum number of coefficients of the operand
 *	@tparam b_left_allows_dynamic is dynamic size flag (if set, the operand may be
 *		shorter than its maximum and zeroes are eliminated from the result)
 *
 *	@note The maximum result size must agree with CSquareCoeffNum,
 *		this is enforced by a static assertion.
 */
template <const int n_left_coeff_num, const bool b_left_allows_dynamic>
class CSquareAlgorithm {
public:
	/**
	 *	@brief general case; squares an expansion using the generic routine
	 *
	 *	@tparam _n_left_coeff_num is maximum number of coefficients of the operand
	 *	@tparam GppDummy is compatibility workaround for g++ (full specialization inside a template)
	 */
	template <const int _n_left_coeff_num MSVC_OMMIT_ARG(class GppDummy)>
	class CLeftSpecialization {
	public:
		enum {
			n_max_coeff_num = 2 * _n_left_coeff_num * _n_left_coeff_num, /**< @brief maximum size of the result */
			b_allow_dynamic = b_left_allows_dynamic /**< @brief dynamic size flag of the result */
		};

		/**
		 *	@brief squares the expansion
		 *	@return Returns the result of the CExactBase square routine.
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left)
		{
			T p_left_hi[_n_left_coeff_num], p_left_lo[_n_left_coeff_num], p_left_sc[2 * _n_left_coeff_num];
			// temporary arrays required by the square routines

			if(b_left_allows_dynamic) { // compile-time constant
				_ASSERTE(n_left_length <= _n_left_coeff_num);
				return CExactBase<T>::n_Expansion_Square_EliminateZeroes(n_max_coeff_num, p_dest, n_left_length, p_left,
					p_left_lo, p_left_hi, 2 * _n_left_coeff_num, p_left_sc);
				// dynamic size, need loops
			} else {
				_ASSERTE(n_left_length == _n_left_coeff_num);
				return CExactBase<T>::n_Expansion_Square(n_max_coeff_num, p_dest, _n_left_coeff_num, p_left,
					p_left_lo, p_left_hi, 2 * _n_left_coeff_num, p_left_sc);
				// the size is passed as a compile-time constant, can unroll loops here
			}
		}
	};

	/**
	 *	@brief specialization for an expansion of (up to) two coefficients
	 *	@tparam GppDummy is compatibility workaround for g++ (full specialization inside a template)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	class CLeftSpecialization<2 MSVC_OMMIT_ARG(GppDummy)> {
	public:
		enum {
			n_max_coeff_num = 6, /**< @brief maximum size of the result (optimized, the generic bound would be eight) */
			b_allow_dynamic = b_left_allows_dynamic || false /**< @brief dynamic size flag of the result */
		};

		/**
		 *	@brief squares the expansion
		 *	@return Returns the number of coefficients stored in p_dest (always 6
		 *		for static operands, zeroes are eliminated for dynamic ones).
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left)
		{
			if(b_left_allows_dynamic && n_left_length < 2) {
				_ASSERTE(n_left_length == 1);
				CExactBase<T>::Square(p_dest[0], p_dest[1], *p_left); // ascending order
				if(p_dest[0] == T(0)) { // in case the multiplication was precise
					p_dest[0] = p_dest[1]; // keep only the nonzero component
					return 1;
				}
				return 2;
			}
			// a dynamic expansion which currently holds only a single coefficient

			/* An expansion of length two can be squared more quickly than finding the   */
			/*   product of two different expansions of length two, and the result is    */
			/*   guaranteed to have no more than six (rather than eight) components.     */

			/*
			#define Two_One_Sum(A, B, C, D, E, F) do { \
				CExactBase<T>::Two_Sum(F, _i, B, C); \
				CExactBase<T>::Two_Sum(E, D, A, _i); } while(0)

			#define Two_Two_Sum(A, B, C, D, E, F, G, H) do { \
				CExactBase<T>::Two_Sum(H, _i, B, D); \
				CExactBase<T>::Two_Sum(_0, _j, A, _i); \
				CExactBase<T>::Two_Sum(G, _i, _0, C); \
				CExactBase<T>::Two_Sum(F, E, _j, _i); } while(0)

			#define Two_Square(a1, a0, x5, x4, x3, x2, x1, x0) do { \
				CExactBase<T>::Square(x0, _j, a0); \
				_0 = a0 + a0; \
				CExactBase<T>::Two_Product(_1, _k, a1, _0); \
				Two_One_Sum(_k, _1, _j, _l, _2, x1); \
				CExactBase<T>::Square(_1, _j, a1); \
				Two_Two_Sum(_j, _1, _l, _2, x5, x4, x3, x2); } while(0)
			*/
			// the reference formulation of the algorithm; the code
			// below is its expansion with the temporaries renamed

			const T a1 = p_left[1], a0 = p_left[0]; // source
			T &x0 = p_dest[0], &x1 = p_dest[1], &x2 = p_dest[2],
				&x3 = p_dest[3], &x4 = p_dest[4], &x5 = p_dest[5]; // destination

			T t9, tb;
			{
				T t4, ta;
				{
					T t6, t8;
					{
						T t5;
						{
							T t1, t3;
							{
								T t0;
								CExactBase<T>::Square(x0, t0, a0);
								T t2;
								CExactBase<T>::Two_Product(t2, t1, a1, a0 + a0);
								CExactBase<T>::Two_Sum(x1, t3, t2, t0); // t0, t2 dead
							}
							CExactBase<T>::Two_Sum(t5, t4, t1, t3); // t1, t3 dead
						}
						T t7;
						CExactBase<T>::Square(t7, t6, a1);
						CExactBase<T>::Two_Sum(x2, t8, t7, t5); // t5, t7 dead
					}
					CExactBase<T>::Two_Sum(ta, t9, t6, t8); // t6, t8 dead
				}
				CExactBase<T>::Two_Sum(x3, tb, ta, t4); // t4, ta dead
			}
			CExactBase<T>::Two_Sum(x4, x5, t9, tb); // t9, tb dead
			// algorithm with explicit scoping; todo - test if it is any faster

			if(b_left_allows_dynamic) // compile-time constant
				return CExactBase<T>::n_EliminateZeroes(6, p_dest); // drop zeroes
			return 6;
		}
	};

	/**
	 *	@brief specialization for an expansion of a single coefficient (squaring a scalar)
	 *	@tparam GppDummy is compatibility workaround for g++ (full specialization inside a template)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	class CLeftSpecialization<1 MSVC_OMMIT_ARG(GppDummy)> {
	public:
		enum {
			n_max_coeff_num = 2, /**< @brief maximum size of the result */
			b_allow_dynamic = b_left_allows_dynamic || false /**< @brief dynamic size flag of the result */
		};

		/**
		 *	@brief squares the single coefficient
		 *	@return Returns the number of coefficients stored in p_dest (always 2
		 *		for static operands, 1 or 2 for dynamic ones).
		 */
		template <class T>
		static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left)
		{
			CExactBase<T>::Square(p_dest[0], p_dest[1], *p_left); // ascending order
			if(b_left_allows_dynamic && p_dest[0] == T(0)) { // the flag is a compile-time constant; the zero test catches the case the multiplication was precise
				p_dest[0] = p_dest[1]; // keep only the nonzero component
				return 1;
			}
			return 2;
		}
	};

protected:
	typedef CLeftSpecialization<n_left_coeff_num MSVC_OMMIT_ARG(void)> TSpecialized; /**< @brief implementation chosen by the size of the operand */

public:
	enum {
		n_max_coeff_num = TSpecialized::n_max_coeff_num, /**< @brief maximum size of the result */
		b_allow_dynamic = TSpecialized::b_allow_dynamic /**< @brief dynamic size flag of the result */
	};

	typedef typename CStaticAssert<CSquareCoeffNum<n_left_coeff_num>::n_result ==
		n_max_coeff_num>::CInternalError::SQUARE_EXPANSION_SIZE_NOT_CONSISTENT CAssert0; // make sure that the number of coefficients calculated by the implementation matches the one calculated by CSquareCoeffNum, which is required by msvc 6.0 for the result expansion type calculation

	/**
	 *	@brief squares the expansion
	 *
	 *	@tparam T is scalar type of the coefficients
	 *
	 *	@param[out] p_dest is pointer to the destination coefficient array
	 *	@param[in] n_left_length is number of coefficients of the operand
	 *	@param[in] p_left is pointer to the coefficients of the operand
	 *
	 *	@return Returns the value returned by the selected implementation.
	 */
	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left)
	{
		_ASSERTE(!b_left_allows_dynamic || (n_left_length > 0 && n_left_length <= n_left_coeff_num));
		_ASSERTE(b_left_allows_dynamic || n_left_length == n_left_coeff_num);
		// a dynamic operand must fit the maximum, a static operand must match it exactly
		return TSpecialized::n_Run(p_dest, n_left_length, p_left);
	}
};

namespace ex_msvc6 {

/**
 *	@brief type comparison environment; fixes the first of the compared types
 *		(workaround for the msvc partial specialization issue)
 *	@tparam A is the first of the types to be compared
 */
template <class A>
struct CCompareType {
	/**
	 *	@brief type comparison template (general case, the types being different)
	 *
	 *	@tparam TOther is the second of the types to be compared
	 *	@tparam GppDummy is compatibility workaround for g++ (full independent specialization inside a template)
	 */
	template <class TOther MSVC_OMMIT_ARG(class GppDummy)>
	struct CCompare {
		/**
		 *	@brief result, stored as enum
		 */
		enum {
			b_result = false /**< @brief comparison result (the types differ) */
		};
	};

	/**
	 *	@brief type comparison template (specialization for the types being equal)
	 *	@tparam GppDummy is compatibility workaround for g++ (full independent specialization inside a template)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	struct CCompare<A MSVC_OMMIT_ARG(GppDummy)> {
		/**
		 *	@brief result, stored as enum
		 */
		enum {
			b_result = true /**< @brief comparison result (the types are the same) */
		};
	};
};

/**
 *	@brief template for comparing two data types
 *
 *	The comparison is delegated to CCompareType<TA>::CCompare, which is
 *	specialized for the second type being the same as the first one.
 *
 *	@tparam TA is the first type
 *	@tparam TB is the second type
 */
template <class TA, class TB>
struct CIsSameType {
	/**
	 *	@brief result, stored as enum
	 */
	enum {
		b_result = CCompareType<TA>::IS_OMMIT(template)
			CCompare<TB MSVC_OMMIT_ARG(void)>::b_result /**< @brief result of comparison (set if TA and TB are the same type, cleared if they differ) */
	};
};

#if 0 // solved differently, not needed anymore
/**
 *	@brief no-op unary specialization wrapper (workarround for msvc
 *		which requires model of the functor for syntax analysis)
 */
class CNoOpSpecializerWrapper {
private:
	/**
	 *	@brief no-op unary transformation template
	 *
	 *	@tparam T is dummy parameter
	 *	@tparam _n_max_coeff_num is dummy parameter
	 *	@tparam _b_allow_dynamic is dummy parameter
	 *
	 *	@note This is inaccessible btw, will cause error if instantiated.
	 */
	template <class T, const unsigned int _n_max_coeff_num, const bool _b_allow_dynamic>
	struct CSpecialize {
		typedef void CResult; /**< @brief result */
	};
};
#endif // 0

#if 0
/**
 *	@brief this is used to defer template instantiation
 *	@tparam CTypeWithResult is type of the function template with member type CResult
 */
template <class CTypeWithResult>
struct CDeferResultEval { // defers compilation, makes compiler happy, all is good.
	typedef typename CTypeWithResult::CResult CResult; /**< @brief result type */
}; // does not seem to be required
#endif // 0

} // ~ex_msvc6


/**
 *	@brief compile-time selection of one of two types
 *
 *	@tparam CFirst is the type selected if the flag is set
 *	@tparam CSecond is the type selected if the flag is cleared
 *	@tparam b_choose_first is the selection flag
 */
template <class CFirst, class CSecond, bool b_choose_first>
class CChooseType {
protected:
	/**
	 *	@brief selector (general case, yields the second type); this is nested
	 *		in order to work around the msvc partial specialization issue
	 *
	 *	@tparam b_pick_first is the selection flag
	 *	@tparam GppDummy is compatibility workaround for g++ (full independent specialization inside a template)
	 */
	template <const bool b_pick_first MSVC_OMMIT_ARG(class GppDummy)>
	struct CChoose {
		typedef CSecond CResult; /**< @brief result type */
	};

	/**
	 *	@brief selector (specialization for the flag being set, yields the first type)
	 *	@tparam GppDummy is compatibility workaround for g++ (full independent specialization inside a template)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	struct CChoose<true MSVC_OMMIT_ARG(GppDummy)> {
		typedef CFirst CResult; /**< @brief result type */
	};

public:
	typedef typename CChoose<b_choose_first MSVC_OMMIT_ARG(void)>::CResult CResult; /**< @brief the selected type */
};

#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER <= 1200

/**
 *	@brief expansion traits (msvc 6.0 version, general case for expansion types)
 *
 *	Reads the scalar type, the maximum size and the dynamic size flag from
 *	the expansion type itself and provides uniform access to its size and data.
 *
 *	@tparam TyExpansion is expansion type; must have member type TType, constants
 *		n_max_coeff_num and b_allow_dynamic and functions n_Size() and f_At()
 */
template <class TyExpansion>
class CExpansionTraits {
public:
	typedef TyExpansion TExpansion; /**< @brief the expansion type */
	typedef typename TyExpansion::TType TType; /**< @brief scalar type of the coefficients */
	enum {
		n_max_coeff_num = TyExpansion::n_max_coeff_num, /**< @brief maximum number of coefficients */
		b_allow_dynamic = TyExpansion::b_allow_dynamic /**< @brief dynamic size flag */
	};

	static inline unsigned int n_Size(const TyExpansion &r_exp);
	static const inline TType *p_Data(const TyExpansion &r_exp);
	static inline TType *p_Data(TyExpansion &r_exp);
	// in msvc 6.0 these functions need to be outside, otherwise they cause internal compiler error
};

/**
 *	@brief gets the number of coefficients of an expansion
 *	@param[in] r_exp is the expansion
 *	@return Returns r_exp.n_Size() for dynamic expansion types
 *		or the compile-time maximum size for the static ones.
 */
template <class TyExpansion>
inline unsigned int CExpansionTraits<TyExpansion>::n_Size(const TyExpansion &r_exp)
{
	return (b_allow_dynamic)? r_exp.n_Size() : n_max_coeff_num;
}

/**
 *	@brief gets pointer to the coefficients of a constant expansion
 *	@param[in] r_exp is the expansion
 *	@return Returns const pointer to the first coefficient.
 *	@note The constness is cast away only to call f_At(), the returned pointer is const again.
 */
template <class TyExpansion>
inline const CExpansionTraits<TyExpansion>::TType *CExpansionTraits<TyExpansion>::p_Data(const TyExpansion &r_exp)
{
	return &const_cast<TyExpansion&>(r_exp).f_At(0)/*[0]*/;
}

/**
 *	@brief gets pointer to the coefficients of an expansion
 *	@param[in] r_exp is the expansion
 *	@return Returns pointer to the first coefficient.
 */
template <class TyExpansion>
inline CExpansionTraits<TyExpansion>::TType *CExpansionTraits<TyExpansion>::p_Data(TyExpansion &r_exp)
{
	return &r_exp.f_At(0)/*r_exp[0]*/;
}

/**
 *	@def DECLARE_INTEGRAL_EXPANSION_TRAITS
 *	@brief declares a full specialization of CExpansionTraits for a scalar type
 *
 *	The specialization describes a scalar as a static expansion of a single
 *	coefficient. Note that p_Data() returns the scalar by value, not a pointer.
 *
 *	@param[in] integral_type is name of the scalar type (despite the name of the
 *		macro, it is used with the floating-point types below)
 */
#define DECLARE_INTEGRAL_EXPANSION_TRAITS(integral_type) \
	template <> \
	class CExpansionTraits<integral_type> { \
	public: \
		typedef integral_type TType; \
		enum { \
			n_max_coeff_num = 1, \
			b_allow_dynamic = false \
		}; \
		static inline unsigned int n_Size(TType UNUSED(f_scalar)) \
		{ \
			return 1; \
		} \
		static inline TType p_Data(TType f_scalar) \
		{ \
			return f_scalar; \
		} \
	}

DECLARE_INTEGRAL_EXPANSION_TRAITS(long double);
DECLARE_INTEGRAL_EXPANSION_TRAITS(double);
DECLARE_INTEGRAL_EXPANSION_TRAITS(float);
// traits for the scalar types which can be mixed with expansions

#else // _MSC_VER && !__MWERKS__ && _MSC_VER <= 1200

/**
 *	@brief expansion traits (general case; the type is treated as a scalar,
 *		which is a static expansion of a single coefficient)
 *	@tparam TyExpansion is scalar type
 */
template <class TyExpansion>
class CExpansionTraits {
public:
	typedef TyExpansion TType; /**< @brief scalar type */

	enum {
		n_max_coeff_num = 1, /**< @brief maximum number of coefficients (a scalar is a single coefficient) */
		b_allow_dynamic = false /**< @brief dynamic size flag (a scalar always has the same size) */
	};

	/**
	 *	@brief gets the value of a scalar
	 *	@param[in] f_scalar is the scalar
	 *	@return Returns the value of the scalar.
	 *	@note Unlike in the specialization for expansions, this returns a value, not a pointer.
	 */
	static inline TType p_Data(TType f_scalar)
	{
		return f_scalar;
	}

	/**
	 *	@brief gets the number of coefficients of a scalar
	 *	@param[in] f_scalar is the scalar (unused)
	 *	@return Returns 1.
	 */
	static inline unsigned int n_Size(TType UNUSED(f_scalar))
	{
		return n_max_coeff_num;
	}
};

/**
 *	@brief expansion traits (specialization for expansions)
 *
 *	@tparam T is scalar type of the coefficients
 *	@tparam _n_max_coeff_num is maximum number of coefficients
 *	@tparam _b_allow_dynamic is dynamic size flag
 */
template <class T, const unsigned int _n_max_coeff_num, const bool _b_allow_dynamic>
class CExpansionTraits<CExpansion<T, _n_max_coeff_num, _b_allow_dynamic> > {
public:
	typedef T TType; /**< @brief scalar type of the coefficients */
	typedef CExpansion<T, _n_max_coeff_num, _b_allow_dynamic> TExpansion; /**< @brief the expansion type */

	enum {
		n_max_coeff_num = _n_max_coeff_num, /**< @brief maximum number of coefficients */
		b_allow_dynamic = _b_allow_dynamic /**< @brief dynamic size flag */
	};

	/**
	 *	@brief gets pointer to the coefficients of a constant expansion
	 *	@param[in] r_exp is the expansion
	 *	@return Returns const pointer to the coefficient storage of the expansion.
	 */
	static inline const TType *p_Data(const TExpansion &r_exp)
	{
		return r_exp.m_p_value;
	}

	/**
	 *	@brief gets pointer to the coefficients of an expansion
	 *	@param[in] r_exp is the expansion
	 *	@return Returns pointer to the coefficient storage of the expansion.
	 */
	static inline TType *p_Data(TExpansion &r_exp)
	{
		return r_exp.m_p_value;
	}

	/**
	 *	@brief gets the number of coefficients of an expansion
	 *	@param[in] r_exp is the expansion
	 *	@return Returns r_exp.n_Size() for dynamic expansions
	 *		or the compile-time maximum size for the static ones.
	 */
	static inline unsigned int n_Size(const TExpansion &r_exp)
	{
		if(b_allow_dynamic) // compile-time constant
			return r_exp.n_Size();
		return n_max_coeff_num;
	}
};

#endif // _MSC_VER && !__MWERKS__ && _MSC_VER <= 1200

/**
 *	@brief compile-time check that two operands use the same scalar type
 *
 *	The scalar types are obtained using CExpansionTraits, so the operands
 *	can be expansions as well as scalars. The check only takes place when
 *	the CAssert member type is referenced.
 *
 *	@tparam CLeftExpansion is type of the left operand
 *	@tparam CRightExpansion is type of the right operand
 */
template <class CLeftExpansion, class CRightExpansion>
class CCompatibleTypeCheck {
public:
	typedef typename CExpansionTraits<CLeftExpansion>::TType TType; /**< @brief scalar type of the left operand */
	typedef typename CExpansionTraits<CRightExpansion>::TType TTypeR; /**< @brief scalar type of the right operand */

	enum {
		b_compatible_types = ex_msvc6::CIsSameType<TType, TTypeR>::b_result /**< @brief set if both operands have the same scalar type */
	};

	typedef typename CStaticAssert<b_compatible_types>::MIXING_EXPANSIONS_OF_DIFFERENT_DATA_TYPES CAssert; /**< @brief static assertion; presumably fails to compile if the scalar types differ (depends on CStaticAssert, defined elsewhere) */
};

/**
 *	@brief adaptor which swaps the operands of a binary operation
 *
 *	This has the same interface as the wrapped implementation, but passes
 *	the left operand as the right one and vice versa.
 *
 *	@tparam CBaseOperationImpl is the wrapped binary operation implementation
 *
 *	@note Only the overload taking both operands as arrays is provided here.
 */
template <class CBaseOperationImpl>
class CBinaryOperation_SwapOperands {
public:
	enum {
		n_max_coeff_num = CBaseOperationImpl::n_max_coeff_num, /**< @brief maximum size of the result (same as in the wrapped implementation) */
		b_allow_dynamic = CBaseOperationImpl::b_allow_dynamic /**< @brief dynamic size flag of the result (same as in the wrapped implementation) */
	};

	/**
	 *	@brief runs the wrapped operation with the operands swapped
	 *
	 *	@tparam T is scalar type of the coefficients
	 *
	 *	@param[out] p_dest is pointer to the destination coefficient array
	 *	@param[in] n_left_length is number of coefficients of the left operand
	 *	@param[in] p_left is pointer to the coefficients of the left operand
	 *	@param[in] n_right_length is number of coefficients of the right operand
	 *	@param[in] p_right is pointer to the coefficients of the right operand
	 *
	 *	@return Returns the value returned by the wrapped implementation.
	 */
	template <class T>
	static inline unsigned int n_Run(T *p_dest, unsigned int n_left_length, const T *p_left,
		unsigned int n_right_length, const T *p_right)
	{
		return CBaseOperationImpl::n_Run(p_dest, n_right_length, p_right, n_left_length, p_left); // right goes first
	}
};

// this is slightly different from the addition and subtraction operations, which
// only compile when called. this one compiles whenever any expansion is instantiated
// and that gives msvc 6.0 more trouble: at the time this is needed, CExpansion is not
// a complete type yet and it is impossible to get CLeftExpansion::anything; that is why
// CTargetExpansion is passed explicitly

/**
 *	@brief calculates result type of a cast between expansion types
 *		(can cast from dynamic to static or vice versa)
 *
 *	@tparam CLeftExpansion is type of the expansion being cast
 *	@tparam CTargetExpansion is type of the expansion to cast to
 */
template <class CLeftExpansion, class CTargetExpansion>
class CDynamicCastResult {
public:
	typedef CLeftExpansion TExpansion; /**< @brief type of the expansion being cast */
	typedef CTargetExpansion TTargetExpansion; /**< @brief type of the expansion to cast to */

	enum {
		b_type_changes = ex_msvc6::CIsSameType<CLeftExpansion, CTargetExpansion>::b_result /**< @brief type comparison result (note that despite the name, this is set if the source and the target types are the same) */
	};

protected:
	/**
	 *	@brief cast result type (general case, used for the types being
	 *		the same; the result is a const reference, no copy is needed)
	 *
	 *	@tparam b_same_type is type comparison result
	 *	@tparam GppDummy is compatibility workaround for g++ (full independent specialization inside a template)
	 */
	template <const bool b_same_type MSVC_OMMIT_ARG(class GppDummy)>
	class CCastImplementation {
	public:
		typedef const TTargetExpansion &CResult; /**< @brief result type; returns const reference */
	};

	/**
	 *	@brief cast result type (specialization for the types being different;
	 *		the result is a new object, the conversion itself is implemented
	 *		in the expansion constructor)
	 *
	 *	@tparam GppDummy is compatibility workaround for g++ (full independent specialization inside a template)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	class CCastImplementation<false MSVC_OMMIT_ARG(GppDummy)> {
	public:
		typedef TTargetExpansion CResult; /**< @brief result type; returns a copy */
	};

public:
	typedef CCastImplementation<b_type_changes MSVC_OMMIT_ARG(void)> CImplementation; /**< @brief the selected implementation */
	typedef typename CImplementation::CResult CResult; /**< @brief result type of the cast */
};

/**
 *	@brief calculates result type of a scalar type cast of an expansion
 *		and implements the cast
 *
 *	@tparam CLeftExpansion is type of the expansion being cast
 *	@tparam COldScalarType is the current scalar type of the expansion
 *	@tparam CTargetScalarType is the scalar type to cast to
 */
template <class CLeftExpansion, class COldScalarType, class CTargetScalarType>
class CTypeCastResult {
public:
	typedef CLeftExpansion TExpansion; /**< @brief type of the expansion being cast */
	typedef CExpansionTraits<TExpansion> TTraits; /**< @brief traits of the expansion being cast */
	typedef CTargetScalarType TNewScalar; /**< @brief the scalar type to cast to */

protected:
	enum {
		n_coeff_num = TTraits::n_max_coeff_num, /**< @brief maximum number of coefficients (not changed by the cast) */
		b_allow_dynamic = TTraits::b_allow_dynamic /**< @brief dynamic size flag (not changed by the cast) */
	};

public:
	typedef ::CExpansion<TNewScalar, n_coeff_num, b_allow_dynamic> CResult; /**< @brief result type of the cast */
	typedef CExpansionTraits<CResult> TResultTraits; /**< @brief traits of the result */

protected:
	/**
	 *	@brief cast implementation (general case, the scalar types being different)
	 *
	 *	@tparam TSourceScalar is the current scalar type of the expansion
	 *	@tparam GppDummy is compatibility workaround for g++ (full independent specialization inside a template)
	 */
	template <class TSourceScalar MSVC_OMMIT_ARG(class GppDummy)>
	class CCastImplementation {
	public:
		/**
		 *	@brief converts the coefficients one by one to the new scalar type
		 *	@param[in] r_t_expansion is the expansion to be converted
		 *	@return Returns expansion of the same size with the converted coefficients.
		 */
		static inline CResult t_Run(const TExpansion &r_t_expansion)
		{
			const unsigned int n_coeff_count = TTraits::n_Size(r_t_expansion);
			const TSourceScalar *p_src = TTraits::p_Data(r_t_expansion);
			// get the source coefficients

			CResult t_converted(uninitialized_construct);
			TNewScalar *p_dst = TResultTraits::p_Data(t_converted);
			// get the destination storage

			for(const TSourceScalar *p_end = p_src + n_coeff_count; p_src != p_end; ++ p_src, ++ p_dst)
				*p_dst = TNewScalar(*p_src);
			// convert the coefficients

			t_converted.Uninitialized_Contract(n_coeff_count);
			// set the size of the result

			return t_converted;
		}
	};

	/**
	 *	@brief cast implementation (specialization for the scalar types being the same)
	 *	@tparam GppDummy is compatibility workaround for g++ (full independent specialization inside a template)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	class CCastImplementation<TNewScalar MSVC_OMMIT_ARG(GppDummy)> {
	public:
		/**
		 *	@brief passes the expansion through, no conversion is needed
		 *	@param[in] r_t_expansion is the expansion to be converted
		 *	@return Returns the value of r_t_expansion.
		 */
		static inline CResult t_Run(const TExpansion &r_t_expansion)
		{
			return r_t_expansion; // very simple, it is already the requested type
		}
	};

public:
	typedef CCastImplementation<COldScalarType MSVC_OMMIT_ARG(void)> CImplementation; /**< @brief the selected implementation */
};

template <const bool _b_needs_compact, class CResultType>
class CCompactionAlgorithm {
protected:
	template <const bool b_needs_compact MSVC_OMMIT_ARG(class GppDummy)>
	class CCompact {
	public:
		template <class CInputType>
		static CResultType Run(const CInputType &r_t_exp)
		{
			typedef typename CExpansionTraits<CInputType>::TType TScalar; // scalar tyoe
			enum {
				n_max_coeff_num = CExpansionTraits<CInputType>::n_max_coeff_num,
				/*n_max_fully_significant_coeff_num = (n_max_coeff_num <
					CMaxExpansionSize<TScalar>::n_longest_expansion_size)? n_max_coeff_num :
					CMaxExpansionSize<TScalar>::n_longest_expansion_size*/ // maximum size of an expansion where all the coefficients' mantissas are fully significant
				n_max_fully_significant_coeff_num = CExpansionTraits<CResultType>::n_max_coeff_num // ut should be this, really
			};

			unsigned int n_input_size = CExpansionTraits<CInputType>::n_Size(r_t_exp);
			if(n_input_size > n_max_fully_significant_coeff_num) {
				CResultType result(uninitialized_construct);
				result.Uninitialized_Contract(CExactBase<TScalar>::n_Compress(CResultType::n_max_coeff_num,
					expansion_internal::CExpansionTraits<CResultType>::p_Data(result), n_input_size,
					expansion_internal::CExpansionTraits<CInputType>::p_Data(r_t_exp))); // msvc 6.0 needs to use Uninitialized_Contract() and expansion traits here
				return result;
				// implement compression to a potentially static type

				//return r_t_exp.t_Compressed();
				// incurs an extra conversion from a dynamic type back to static (in case CResultType is static)
			} else {
				return CResultType(CExpansionTraits<CInputType>::n_Size(r_t_exp),
					CExpansionTraits<CInputType>::p_Data(r_t_exp), expansion::from_array_zero_pad);
				// efficient
			}
		}
	};

	template <MSVC_OMMIT(class GppDummy)>
	class CCompact<false MSVC_OMMIT_ARG(GppDummy)> {
	public:
		template <class CInputType>
		static CResultType Run(const CInputType &r_t_exp)
		{
			return r_t_exp; // let the copy-constructors take care of things
		}
	};

public:
	typedef CCompact<_b_needs_compact MSVC_OMMIT_ARG(void)> CAlgorithm;

	template <class CInputType>
	inline CResultType operator ()(const CInputType &r_t_exp) const
	{
		return CAlgorithm::Run(r_t_exp);
	}
};

#if 0 // unused

template <class COutputType, const bool _b_connect>
class CReturnConnector { // hack for simpler implementation where return value conversion may not be permitted
protected:
	template <const bool b_connect MSVC_OMMIT_ARG(class GppDummy)>
	class CConnect {
	public:
		template <class CInputType>
		inline COutputType operator ()(const CInputType &r_t_expansion) const
		{
			return r_t_expansion;
		}
	};

	template <MSVC_OMMIT(class GppDummy)>
	class CConnect<false MSVC_OMMIT_ARG(GppDummy)> {
	public:
		template <class CInputType>
		inline COutputType operator ()(const CInputType &UNUSED(r_t_expansion)) const
		{
			_ASSERTE(false); // this should never be called
			typedef typename CExpansionTraits<COutputType>::TType TScalar;
			return COutputType(TScalar(0), expansion::from_scalar_zero_pad); // return zero (but only because we must return something)
		}
	};

public:
	template <class CInputType>
	inline COutputType operator ()(const CInputType &r_t_expansion) const
	{
		return CConnect<_b_connect MSVC_OMMIT_ARG(void)>()(r_t_expansion);
	}
};

#endif // 0

/**
 *	@brief compile-time selection of the addition implementation and of the type of its result
 *
 *	The sizes and the dynamic flags of both operands are read from CExpansionTraits and
 *	passed to CAdditionAlgorithm. If the right operand has larger maximum number of
 *	coefficients than the left one, the algorithm is instantiated with the sizes exchanged
 *	and wrapped in CBinaryOperation_SwapOperands.
 *
 *	@tparam CLeftExpansion is type of the left operand
 *	@tparam CRightExpansion is type of the right operand
 */
template <class CLeftExpansion, class CRightExpansion/*, const bool _b_null_elimination*/>
class CAdditionResult {
public:
	typedef CLeftExpansion TLeft; /**< @brief left operand type */
	typedef CRightExpansion TRight; /**< @brief right operand type */
	typedef CExpansionTraits<CLeftExpansion> TLeftTraits; /**< @brief traits of the left operand */
	typedef CExpansionTraits<CRightExpansion> TRightTraits; /**< @brief traits of the right operand */
	typedef typename TLeftTraits::TType TType; /**< @brief scalar type (taken from the left operand) */
	typedef typename CCompatibleTypeCheck<TLeft, TRight>::CAssert CTypeCheck; // without mentioning CAssert0 directly it does not check (VS2008)

protected:
	/**
	 *	@brief operand properties, stored as enum
	 */
	enum {
		b_left_or_right_allows_dynamic = TLeftTraits::b_allow_dynamic | TRightTraits::b_allow_dynamic, // msvc 6.0 can't handle || here for some reason
		//b_null_elimination = b_left_or_right_allows_dynamic/*_b_null_elimination*/, // no explicit elimination, eliminate always if dynamic
		n_left_coeff_num = TLeftTraits::n_max_coeff_num, // maximum number of coefficients of the left operand
		n_right_coeff_num = TRightTraits::n_max_coeff_num, // maximum number of coefficients of the right operand
		b_need_swap = n_right_coeff_num > n_left_coeff_num // want right to have smaller dimension (for explicit specializations)
	};

	/**
	 *	@brief declares the implementation with the operands in the original order
	 *	@tparam b_swap is operand swap flag (this is the primary template, used unless it is true)
	 */
	template <bool b_swap MSVC_OMMIT_ARG(class GppDummy)>
	struct CDeclareImpl {
		typedef CAdditionAlgorithm<n_left_coeff_num, n_right_coeff_num,
			b_left_or_right_allows_dynamic/*, b_null_elimination*/> CImplementation;
	};

	/**
	 *	@brief declares the implementation with the operand sizes exchanged
	 *		(specialization for the swap flag being set)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	struct CDeclareImpl<true MSVC_OMMIT_ARG(GppDummy)> {
		typedef CAdditionAlgorithm<n_right_coeff_num, n_left_coeff_num,
			b_left_or_right_allows_dynamic/*, b_null_elimination*/> CBaseImplementation; // the algorithm, instantiated with right / left sizes
		typedef CBinaryOperation_SwapOperands<CBaseImplementation> CImplementation; // wrapper around the base implementation
	};

public:
	typedef typename CDeclareImpl<b_need_swap MSVC_OMMIT_ARG(void)>::CImplementation CImplementation; /**< @brief the selected addition implementation */

protected:
	/**
	 *	@brief properties of the result, as reported by the implementation
	 */
	enum {
		n_max_coeff_num = CImplementation::n_max_coeff_num,
		b_allow_dynamic = CImplementation::b_allow_dynamic
	};

public:
	typedef ::CExpansion<TType, n_max_coeff_num, b_allow_dynamic> CResult; /**< @brief type of the result of the addition */
//#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER <= 1200
//	struct CPrototypeCtr : public ex_msvc6::CNoOpSpecializerWrapper {}; /**< @brief provide the default transformation for semantic check when compiling this template */
//	struct CConstructWithDefault : public CLeftExpansion, CPrototypeCtr {}; /**< @brief supply specialization-time functionality (order matters!) */
//	// note that this does not work in G++, CConstructWithDefault::CSpecialize is ambiguous
//
//	typedef typename CConstructWithDefault::CSpecialize<TType, n_max_coeff_num, b_allow_dynamic>::CResult CResult;
//#else // _MSC_VER && !__MWERKS__ && _MSC_VER <= 1200
//	typedef typename TLeft::CSpecialize<TType, n_max_coeff_num, b_allow_dynamic>::CResult CResult;
//#endif // _MSC_VER && !__MWERKS__ && _MSC_VER <= 1200
};

/**
 *	@brief compile-time selection of the subtraction implementation and of the type of its result
 *
 *	The sizes and the dynamic flags of both operands are read from CExpansionTraits and
 *	passed to CSubtractionAlgorithm, always in the left / right order (there is no
 *	operand swapping here; presumably because subtraction is not commutative).
 *
 *	@tparam CLeftExpansion is type of the left operand
 *	@tparam CRightExpansion is type of the right operand
 */
template <class CLeftExpansion, class CRightExpansion>
class CSubtractionResult {
public:
	typedef CLeftExpansion TLeft; /**< @brief left operand type */
	typedef CRightExpansion TRight; /**< @brief right operand type */
	typedef CExpansionTraits<CLeftExpansion> TLeftTraits; /**< @brief traits of the left operand */
	typedef CExpansionTraits<CRightExpansion> TRightTraits; /**< @brief traits of the right operand */
	typedef typename TLeftTraits::TType TType; /**< @brief scalar type (taken from the left operand) */
	typedef typename CCompatibleTypeCheck<TLeft, TRight>::CAssert CTypeCheck; // without mentioning CAssert0 directly it does not check (VS2008)

protected:
	/**
	 *	@brief operand properties, stored as enum
	 */
	enum {
		b_left_or_right_allows_dynamic = TLeftTraits::b_allow_dynamic | TRightTraits::b_allow_dynamic, // msvc 6.0 can't handle || here for some reason
		n_left_coeff_num = TLeftTraits::n_max_coeff_num, // maximum number of coefficients of the left operand
		n_right_coeff_num = TRightTraits::n_max_coeff_num // maximum number of coefficients of the right operand
	};

public:
	typedef CSubtractionAlgorithm<n_left_coeff_num, n_right_coeff_num,
		b_left_or_right_allows_dynamic> CImplementation; /**< @brief the subtraction implementation */

protected:
	/**
	 *	@brief properties of the result, as reported by the implementation
	 */
	enum {
		n_max_coeff_num = CImplementation::n_max_coeff_num,
		b_allow_dynamic = CImplementation::b_allow_dynamic
	};

public:
	typedef ::CExpansion<TType, n_max_coeff_num, b_allow_dynamic> CResult; /**< @brief type of the result of the subtraction */
};

/**
 *	@brief compile-time selection of the multiplication implementation and of the type of its result
 *
 *	The sizes and the dynamic flags of both operands are read from CExpansionTraits and
 *	passed to CMultiplicationAlgorithm. If the right operand has larger maximum number of
 *	coefficients than the left one, the algorithm is instantiated with the sizes exchanged
 *	and wrapped in CBinaryOperation_SwapOperands.
 *
 *	@tparam CLeftExpansion is type of the left operand
 *	@tparam CRightExpansion is type of the right operand
 */
template <class CLeftExpansion, class CRightExpansion>
class CMultiplicationResult {
public:
	typedef CLeftExpansion TLeft; /**< @brief left operand type */
	typedef CRightExpansion TRight; /**< @brief right operand type */
	typedef CExpansionTraits<CLeftExpansion> TLeftTraits; /**< @brief traits of the left operand */
	typedef CExpansionTraits<CRightExpansion> TRightTraits; /**< @brief traits of the right operand */
	typedef typename TLeftTraits::TType TType; /**< @brief scalar type (taken from the left operand) */
	typedef typename CCompatibleTypeCheck<TLeft, TRight>::CAssert CTypeCheck; // without mentioning CAssert0 directly it does not check (VS2008)

	/**
	 *	@brief operand properties, stored as enum (publicly accessible in this class)
	 */
	enum {
		/*b_left_dynamic = TLeftTraits::b_allow_dynamic,
		b_right_dynamic = TRightTraits::b_allow_dynamic,*/
		b_left_or_right_allows_dynamic = TLeftTraits::b_allow_dynamic | TRightTraits::b_allow_dynamic, // msvc 6.0 can't handle || here for some reason
		n_left_coeff_num = TLeftTraits::n_max_coeff_num, // maximum number of coefficients of the left operand
		n_right_coeff_num = TRightTraits::n_max_coeff_num, // maximum number of coefficients of the right operand
		b_need_swap = n_right_coeff_num > n_left_coeff_num // want right to have smaller dimension (for explicit specializations)
	};

protected:
	/**
	 *	@brief declares the implementation with the operands in the original order
	 *	@tparam b_swap is operand swap flag (this is the primary template, used unless it is true)
	 */
	template <bool b_swap MSVC_OMMIT_ARG(class GppDummy)>
	struct CDeclareImpl {
		typedef CMultiplicationAlgorithm<n_left_coeff_num, n_right_coeff_num,
			b_left_or_right_allows_dynamic> CImplementation;
	};

	/**
	 *	@brief declares the implementation with the operand sizes exchanged
	 *		(specialization for the swap flag being set)
	 */
	template <MSVC_OMMIT(class GppDummy)>
	struct CDeclareImpl<true MSVC_OMMIT_ARG(GppDummy)> {
		typedef CMultiplicationAlgorithm<n_right_coeff_num, n_left_coeff_num,
			b_left_or_right_allows_dynamic> CBaseImplementation; // the algorithm, instantiated with right / left sizes
		typedef CBinaryOperation_SwapOperands<CBaseImplementation> CImplementation; // wrapper around the base implementation
	};

public:
	typedef typename CDeclareImpl<b_need_swap MSVC_OMMIT_ARG(void)>::CImplementation CImplementation; /**< @brief the selected multiplication implementation */

protected:
	/**
	 *	@brief properties of the result, as reported by the implementation
	 */
	enum {
		n_max_coeff_num = CImplementation::n_max_coeff_num,
		b_allow_dynamic = CImplementation::b_allow_dynamic
	};

public:
	typedef ::CExpansion<TType, n_max_coeff_num, b_allow_dynamic> CResult; /**< @brief type of the result of the multiplication */
};

/**
 *	@brief compile-time selection of the squaring implementation and of the type of its result
 *
 *	The size and the dynamic flag of the operand are read from CExpansionTraits
 *	and passed to CSquareAlgorithm.
 *
 *	@tparam CLeftExpansion is type of the (only) operand
 */
template <class CLeftExpansion>
class CSquareResult {
public:
	typedef CLeftExpansion TLeft; /**< @brief operand type */
	typedef CExpansionTraits<CLeftExpansion> TLeftTraits; /**< @brief traits of the operand */
	typedef typename TLeftTraits::TType TType; /**< @brief scalar type */

	/**
	 *	@brief operand properties, stored as enum
	 */
	enum {
		b_left_allows_dynamic = TLeftTraits::b_allow_dynamic, // dynamic flag of the operand
		n_left_coeff_num = TLeftTraits::n_max_coeff_num // maximum number of coefficients of the operand
	};

	typedef CSquareAlgorithm<n_left_coeff_num, b_left_allows_dynamic> CImplementation; /**< @brief the squaring implementation */

protected:
	/**
	 *	@brief properties of the result, as reported by the implementation
	 */
	enum {
		n_max_coeff_num = CImplementation::n_max_coeff_num,
		b_allow_dynamic = CImplementation::b_allow_dynamic
	};

public:
	typedef ::CExpansion<TType, n_max_coeff_num, b_allow_dynamic> CResult; /**< @brief type of the result of the squaring */
};

/**
 *	@brief storage for expansion coefficients; holds a size object and a fixed-capacity array
 *
 *	The constructor is protected, so objects of this type can only be created
 *	by derived classes (or by the friends declared below).
 *
 *	@tparam T is scalar type of the coefficients
 *	@tparam _n_max_coeff_num is capacity of the coefficient array (must not be zero)
 *	@tparam _b_allow_dynamic is dynamic size flag (selects the size representation, see TSize)
 */
template <class T, const unsigned int _n_max_coeff_num, const bool _b_allow_dynamic = false>
class CExpansionStorage {
#if !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER > 1200
	template <class T_, const unsigned int _n_max_coeff_num_, const bool _b_allow_dynamic_> friend class CExpansion;
	template <class TyExpansion> friend class CExpansionTraits;
	// friend template syntax not recognized by msvc 6.0
#endif // !_MSC_VER || __MWERKS__ || _MSC_VER > 1200

public:
	typedef T TType; /**< @brief scalar type of the coefficients */
	typedef T TInexactType; /**< @brief alias of the same scalar type */
	typedef CExpansionSize<(_b_allow_dynamic)? -1 : _n_max_coeff_num> TSize; /**< @brief size type; instantiated with -1 if dynamic, otherwise with the capacity */
	// NOTE(review): the two branches of the conditional are int and unsigned int, so -1 is
	// converted to unsigned before being passed on; confirm this is what CExpansionSize expects

	/**
	 *	@brief template parameters, stored as enum
	 */
	enum {
		n_max_coeff_num = _n_max_coeff_num, /**< @brief capacity of the coefficient array */
		b_allow_dynamic = _b_allow_dynamic /**< @brief dynamic size flag */
	};

protected:
	TSize m_n_size; /**< @brief size object (see TSize) */
	TType m_p_value[n_max_coeff_num]; // todo - use std::vector for expansions longer than some threshold (say, 1024), to avoid blowing up the stack

protected:
	/**
	 *	@brief compile-time check of this storage capacity against a required size
	 *	@tparam n_required_size is the number of coefficients that need to be stored
	 */
	template <const int n_required_size>
	class CCheckSize {
	public:
		/**
		 *	@brief results of the size comparison, stored as enum
		 */
		enum {
			b_sufficient_size = (n_max_coeff_num >= n_required_size), // the capacity is enough to hold the required size
			b_not_too_large_size = (b_allow_dynamic || n_max_coeff_num <= n_required_size) // dynamic expansions can be initialized with smaller expansions without performance penalties
		};

		typedef typename expansion_internal::CStaticAssert<b_sufficient_size>::EXPANSION_TOO_SHORT CAssert0; // compile-time assertion on b_sufficient_size
		//typedef typename expansion_internal::CStaticAssert<b_not_too_large_size>::EXPANSION_TOO_LONG CAssert1;

		/**
		 *	@brief assertion helper; this primary template is used when b_not_too_large_size is set
		 *	@tparam b_assert is value of the b_not_too_large_size flag
		 */
		template <bool b_assert MSVC_OMMIT_ARG(class GppDummy)>
		struct CAssertHelper {
			typedef CAssert0 a; // seems to still work
			//typedef CAssert1 b; // do not want an error, just a warning
		};

		/**
		 *	@brief assertion helper specialization, used when b_not_too_large_size is not set
		 *		(static expansion with capacity greater than the required size)
		 */
		template <MSVC_OMMIT(class GppDummy)>
		struct CAssertHelper<false MSVC_OMMIT_ARG(GppDummy)> {
			typedef CAssert0 a; // seems to still work
			//typedef CAssert1 b; // do not want an error, just a warning

#ifdef __EXPANSION_WARN_UNNECESSARILY_TOO_LONG_EXPANSIONS
			// only compiled if __EXPANSION_WARN_UNNECESSARILY_TOO_LONG_EXPANSIONS is defined;
			// the constructor has no effect other than declaring a local variable that is never used
			// NOTE(review): the constructor is named using a template-id, which newer language
			// standards may reject; confirm with the compilers in use
			inline CAssertHelper<false MSVC_OMMIT_ARG(GppDummy)>()
			{
				int EXPANSION_UNNECESSARILY_TOO_LONG; // intentionally unused, to print a warning (should work in most compilers)
				// note that this is not perfect, as the compiler will sometimes not point to the line of code where this happened (instead points to the first instantiation of CExpansion with the given T and length), requiring some backtracking and inconvenience
				// try to use the deprecation method, see if that works
			}
#endif // __EXPANSION_WARN_UNNECESSARILY_TOO_LONG_EXPANSIONS
		};

		typedef CAssertHelper<b_not_too_large_size MSVC_OMMIT_ARG(void)> CAssert; /**< @brief the assertion helper selected by b_not_too_large_size */
	};

	typedef typename expansion_internal::CStaticAssert<_n_max_coeff_num != 0>::ILLEGAL_EMPTY_EXPANSION CAssert0; // compile-time assertion: the capacity must not be zero

protected:
	/**
	 *	@brief default constructor; has an empty body and no initializer list
	 *	@note No member is explicitly initialized here; m_n_size is default-constructed
	 *		and the contents of m_p_value are left as they are.
	 */
	CExpansionStorage(/*TSize n_size = _n_max_coeff_num*/) // value not required, we need to initialize the size anyway and default constructor is inaccessible from the outside
		//:m_n_size(n_size)
	{}
};

} // ~expansion_internal

#endif // !__EXACT_MULTIPRECISION_FLOATING_POINT_ARITHMETIC_INTERNAL_INCLUDED
