/*
								+---------------------------------+
								|                                 |
								| *** Discrete cos. transform *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2006  |
								|                                 |
								|              DCT.h              |
								|                                 |
								+---------------------------------+
*/

#pragma once
#ifndef __DCT_INCLUDED
#define __DCT_INCLUDED

/**
 *	@file DCT.h
 *	@author -tHE SWINe-
 *	@date 2006
 *	@brief Discrete Cosine Transform implementation
 *
 *	@date 2006-08-22
 *
 *	fixed error in CDCT8_1D and CFastDCT8_2D::Forward
 *	everything is working well now
 *
 *	@date 2006-09-21
 *
 *	added MMX inverse DCT code path
 *
 *  @date 2006-12-27
 *
 *  finally rewritten floating-point FFTs for linux compatibility
 *  note CFastDCT8_2D should preferably be a template (todo)
 *
 *	@date 2008-03-04
 *
 *	now using Integer.h header
 *
 *	@date 2008-08-08
 *
 *	added \#ifdef for windows 64
 *
 *	@date 2009-05-04
 *
 *	fixed mixed windows / linux line endings
 *
 *	@date 2010-10-29
 *
 *	Unified windows detection macro to "\#if defined(_WIN32) || defined(_WIN64)".
 *
 *	@date 2012-06-19
 *
 *	Moved multiple inclusion guard before file documentation comment.
 *
 */

/**
 *	@def DCT_ENABLE_MMX
 *
 *	@brief enable integer (16-bit) IDCT acceleration using MMX
 */
//#define DCT_ENABLE_MMX

#include "Integer.h"

/**
 *	@brief Pi constant
 *
 *	This is defined in Vector.cpp (with value
 *		3.1415926535897932384626433832795028841971693993751).
 *
 *	@note The constant is declared as float, so it can only carry single-precision
 *		accuracy, regardless of the number of digits written in the initializer.
 *	@note This is a declaration only (extern, no initializer); nothing in the visible
 *		part of this header uses it.
 */
extern const float f_pi;

#if defined(_WIN32) || defined (_WIN64)
/**
 *	@brief multiplicative constants of the forward DCT
 *
 *	The four members hold the following values:
 *	m1 = cos(4pi/16)
 *	m2 = cos(6pi/16)
 *	m3 = cos(2pi/16) - cos(6pi/16)
 *	m4 = cos(2pi/16) + cos(6pi/16)
 *
 *	@tparam T is data type the DCT is evaluated in (usually double,
 *		float or int).
 *
 *	@note The members are only declared here, the values are supplied
 *		by definitions outside of this header.
 */
template <class T>
struct TFDCTConstants {
	static const T m1; /**< first DCT coefficient (0.707107) */
	static const T m2; /**< second DCT coefficient (0.382683) */
	static const T m3; /**< third DCT coefficient (0.541196) */
	static const T m4; /**< fourth DCT coefficient (1.306563) */
};

typedef TFDCTConstants<float> TFDCTConstants_float; /**< DCT constants in floating-point */
typedef TFDCTConstants<int32_t> TFDCTConstants_int; /**< DCT constants in int32 */
typedef TFDCTConstants<int16_t> TFDCTConstants_short; /**< DCT constants in int16 */
#else // _WIN32 || _WIN64
/**
 *	@brief forward DCT constants in floating-point
 *
 *	Plain (non-template) counterpart of the declarations used on Windows;
 *		the members are only declared here.
 */
struct TFDCTConstants_float {
	static const float m1; /**< first DCT coefficient */
	static const float m2; /**< second DCT coefficient */
	static const float m3; /**< third DCT coefficient */
	static const float m4; /**< fourth DCT coefficient */
};

/**
 *	@brief forward DCT constants in int32
 *
 *	@note The scale of the integer representation is given by the definitions,
 *		which are not part of this header.
 */
struct TFDCTConstants_int {
	static const int32_t m1; /**< first DCT coefficient */
	static const int32_t m2; /**< second DCT coefficient */
	static const int32_t m3; /**< third DCT coefficient */
	static const int32_t m4; /**< fourth DCT coefficient */
};

/**
 *	@brief forward DCT constants in int16
 *
 *	@note The scale of the integer representation is given by the definitions,
 *		which are not part of this header.
 */
struct TFDCTConstants_short {
	static const int16_t m1; /**< first DCT coefficient */
	static const int16_t m2; /**< second DCT coefficient */
	static const int16_t m3; /**< third DCT coefficient */
	static const int16_t m4; /**< fourth DCT coefficient */
};
#endif // _WIN32 || _WIN64

/**
 *	@brief forward DCT template
 *
 *	One-dimensional 8-tap DCT template. Original code by Yann Guidon in article
 *		"Optimised Winograd DCT for the F-CPU Core 0".
 *
 *	@tparam T is data type operations are performed on (int16 should be sufficient)
 *	@tparam n_offset is distance between consecutive samples, in elements (1 for
 *		contiguous samples, 8 for every eighth element, such as a column of
 *		a row-major 8x8 block)
 *	@tparam n_mul_shift is bit-shift after multiplication (use 0 for T = float, 8
 *		for int16, etc); every product with a constant is divided by (1 << n_mul_shift)
 *	@tparam TConstsClass contains static precalculated constants m1 through m4
 *		in the used data type
 *
 *	@note This can operate in-situ (p_dest can be equal to p_src), all the
 *		inputs are read before the first output is written.
 */
template <class T, const int n_offset, const int n_mul_shift, class TConstsClass>
class CFDCT8_1D {
public:
	/**
	 *	@brief DCT routine
	 *
	 *	Calculates DCT of p_src and stores results in p_dest. Both arrays are
	 *		accessed at indices 0, n_offset, 2 * n_offset, ..., 7 * n_offset.
	 *
	 *	@param[out] p_dest contains space for 8 DCT results
	 *	@param[in] p_src contains 8 input samples
	 */
	static inline void Do(T *p_dest, const T *p_src)
	{
		const int n_divisor = 1 << n_mul_shift;
		// products with the constants are divided by this (division rather than
		// a shift, so that it works for floating-point T as well)

		const T t_sum_07 = p_src[0 * n_offset] + p_src[7 * n_offset];
		const T t_sum_16 = p_src[1 * n_offset] + p_src[6 * n_offset];
		const T t_sum_25 = p_src[2 * n_offset] + p_src[5 * n_offset];
		const T t_sum_34 = p_src[3 * n_offset] + p_src[4 * n_offset];
		const T t_dif_07 = p_src[0 * n_offset] - p_src[7 * n_offset];
		const T t_dif_16 = p_src[1 * n_offset] - p_src[6 * n_offset];
		const T t_dif_25 = p_src[2 * n_offset] - p_src[5 * n_offset];
		const T t_dif_34 = p_src[3 * n_offset] - p_src[4 * n_offset];
		// sums and differences of the mirrored sample pairs (the last read of p_src)

		const T t_even_a = t_sum_07 + t_sum_34;
		const T t_even_b = t_sum_16 - t_sum_25;
		const T t_odd_a = t_dif_34 + t_dif_25;
		const T t_odd_b = t_dif_16 + t_dif_25;
		const T t_even_c = t_sum_07 - t_sum_34;
		const T t_odd_c = t_dif_16 + t_dif_07;
		const T t_even_d = t_sum_16 + t_sum_25;
		// second stage of additions

		const T t_odd_b_scaled = static_cast<T>((TConstsClass::m1 * t_odd_b) / n_divisor);
		const T t_even_bc = t_even_b + t_even_c;
		const T t_odd_ac = t_odd_a - t_odd_c;

		const T t_odd_a_scaled = static_cast<T>((TConstsClass::m3 * t_odd_a) / n_divisor);
		const T t_odd_c_scaled = static_cast<T>((TConstsClass::m4 * t_odd_c) / n_divisor);
		const T t_even_bc_scaled = static_cast<T>((TConstsClass::m1 * t_even_bc) / n_divisor);
		const T t_odd_ac_scaled = static_cast<T>((TConstsClass::m2 * t_odd_ac) / n_divisor);
		// the five multiplications

		p_dest[0 * n_offset] = t_even_a + t_even_d;
		p_dest[4 * n_offset] = t_even_a - t_even_d;
		p_dest[2 * n_offset] = t_even_c + t_even_bc_scaled;
		p_dest[6 * n_offset] = t_even_c - t_even_bc_scaled;
		// even-indexed outputs

		const T t_odd_p = t_odd_b_scaled + t_dif_07;
		const T t_odd_q = t_dif_07 - t_odd_b_scaled;
		const T t_odd_r = t_odd_a_scaled + t_odd_ac_scaled;
		const T t_odd_s = t_odd_c_scaled + t_odd_ac_scaled;

		p_dest[3 * n_offset] = t_odd_q - t_odd_r;
		p_dest[5 * n_offset] = t_odd_q + t_odd_r;
		p_dest[1 * n_offset] = t_odd_p + t_odd_s;
		p_dest[7 * n_offset] = t_odd_p - t_odd_s;
		// odd-indexed outputs
	}
};

#if defined(_WIN32) || defined (_WIN64)
/**
 *	@brief multiplicative constants of the inverse DCT
 *
 *	The four members hold the following values:
 *	m1 =  1.414213562
 *	m2 =  1.847759065
 *	m3 =  1.082392200
 *	m4 = -2.613125930
 *
 *	@tparam T is data type the DCT is evaluated in (usually double,
 *		float or int).
 *
 *	@note The members are only declared here, the values are supplied
 *		by definitions outside of this header.
 */
template <class T>
struct TIDCTConstants {
	static const T m1; /**< first IDCT coefficient (1.414213562) */
	static const T m2; /**< second IDCT coefficient (1.847759065) */
	static const T m3; /**< third IDCT coefficient (1.082392200) */
	static const T m4; /**< fourth IDCT coefficient (-2.613125930) */
};

typedef TIDCTConstants<float> TIDCTConstants_float; /**< DCT constants in floating-point */
typedef TIDCTConstants<int32_t> TIDCTConstants_int; /**< DCT constants in int32 */
typedef TIDCTConstants<int16_t> TIDCTConstants_short; /**< DCT constants in int16 */
#else // _WIN32 || _WIN64
/**
 *	@brief inverse DCT constants in floating-point
 *
 *	Plain (non-template) counterpart of the declarations used on Windows;
 *		the members are only declared here.
 */
struct TIDCTConstants_float {
	static const float m1; /**< first IDCT coefficient */
	static const float m2; /**< second IDCT coefficient */
	static const float m3; /**< third IDCT coefficient */
	static const float m4; /**< fourth IDCT coefficient */
};

/**
 *	@brief inverse DCT constants in int32
 *
 *	@note The scale of the integer representation is given by the definitions,
 *		which are not part of this header.
 */
struct TIDCTConstants_int {
	static const int32_t m1; /**< first IDCT coefficient */
	static const int32_t m2; /**< second IDCT coefficient */
	static const int32_t m3; /**< third IDCT coefficient */
	static const int32_t m4; /**< fourth IDCT coefficient */
};

/**
 *	@brief inverse DCT constants in int16
 *
 *	@note The scale of the integer representation is given by the definitions,
 *		which are not part of this header.
 */
struct TIDCTConstants_short {
	static const int16_t m1; /**< first IDCT coefficient */
	static const int16_t m2; /**< second IDCT coefficient */
	static const int16_t m3; /**< third IDCT coefficient */
	static const int16_t m4; /**< fourth IDCT coefficient */
};
#endif // _WIN32 || _WIN64

/**
 *	@brief inverse DCT template
 *
 *	One-dimensional 8-tap DCT template. Somewhat (~30% faster) optimised version
 *		of algorithm used in Thomas G. Lane's jpeglib.
 *
 *	@tparam T is data type operations are performed on (int16 should be sufficient)
 *	@tparam n_offset is distance between consecutive samples, in elements (1 for
 *		contiguous samples, 8 for every eighth element, such as a column of
 *		a row-major 8x8 block)
 *	@tparam n_mul_shift is bit-shift after multiplication (use 0 for T = float, 8
 *		for int16, etc); every product with a constant is divided by (1 << n_mul_shift)
 *	@tparam TConstsClass contains static precalculated constants m1 through m4
 *		in the used data type
 *	@tparam n_post_shift is final bit-shift after IDCT; every output is divided
 *		by (1 << n_post_shift) before being stored
 *
 *	@note This can operate in-situ (p_dest can be equal to p_src), all the
 *		inputs are read before the first output is written.
 *	@note The scaling is implemented using division, for integer T the results
 *		are therefore truncated towards zero.
 */
template <class T, const int n_offset, const int n_mul_shift, class TConstsClass, const int n_post_shift = 0>
class CIDCT8_1D {
public:
	/**
	 *	@brief inverse DCT routine
	 *
	 *	Calculates inverse DCT of p_src and stores results in p_dest. Both arrays
	 *		are accessed at indices 0, n_offset, 2 * n_offset, ..., 7 * n_offset.
	 *
	 *	@param[out] p_dest contains space for 8 output samples
	 *	@param[in] p_src contains 8 input DCT coefficients
	 */
	static inline void Do(T *p_dest, const T *p_src)
	{
		const T n_src_0 = p_src[0 * n_offset];
		const T n_src_1 = p_src[1 * n_offset];
		const T n_src_2 = p_src[2 * n_offset];
		const T n_src_3 = p_src[3 * n_offset];
		const T n_src_4 = p_src[4 * n_offset];
		const T n_src_5 = p_src[5 * n_offset];
		const T n_src_6 = p_src[6 * n_offset];
		const T n_src_7 = p_src[7 * n_offset];
		// fetch all the inputs before anything is written; the odd part below
		// needs coefficients 1, 3, 5 and 7 after outputs 0 and 7 were already
		// stored, reading them from p_src at that point would break in-situ operation

		T n_tmp_4 = n_src_0 + n_src_4;
		T n_tmp_8 = n_src_1 + n_src_7;
		T n_tmp_7 = n_src_2 + n_src_6;
		T n_tmp_9 = n_src_5 + n_src_3;
		T n_tmp_5 = n_src_0 - n_src_4;
		T n_tmp_6 = (T)(((n_src_2 - n_src_6) * TConstsClass::m1) /
			(1 << n_mul_shift)) - n_tmp_7;

		T n_tmp_0 = n_tmp_4 + n_tmp_7;
		T n_tmp_3 = n_tmp_4 - n_tmp_7;
		T n_tmp_1 = n_tmp_5 + n_tmp_6;
		T n_tmp_2 = n_tmp_5 - n_tmp_6;
		// even part

		n_tmp_5 = (T)(((n_tmp_8 - n_tmp_9) * TConstsClass::m1) / (1 << n_mul_shift));
		n_tmp_9 += n_tmp_8;

		p_dest[0 * n_offset] = (n_tmp_0 + n_tmp_9) / (1 << n_post_shift);
		p_dest[7 * n_offset] = (n_tmp_0 - n_tmp_9) / (1 << n_post_shift);

		n_tmp_0 = n_src_5 - n_src_3;
		n_tmp_8 = n_src_1 - n_src_7;
		// n_tmp_0 and n_tmp_8 are reused for the odd part from here on

		n_tmp_7 = (T)(((n_tmp_0 + n_tmp_8) * TConstsClass::m2) / (1 << n_mul_shift));
		n_tmp_4 = (T)((n_tmp_8 * TConstsClass::m3) / (1 << n_mul_shift)) - n_tmp_7;
		n_tmp_6 = (T)((n_tmp_0 * TConstsClass::m4) / (1 << n_mul_shift)) + n_tmp_7;

		n_tmp_6 -= n_tmp_9;
		n_tmp_5 -= n_tmp_6;
		n_tmp_4 += n_tmp_5;

		p_dest[1 * n_offset] = (n_tmp_1 + n_tmp_6) / (1 << n_post_shift);
		p_dest[6 * n_offset] = (n_tmp_1 - n_tmp_6) / (1 << n_post_shift);
		p_dest[2 * n_offset] = (n_tmp_2 + n_tmp_5) / (1 << n_post_shift);
		p_dest[5 * n_offset] = (n_tmp_2 - n_tmp_5) / (1 << n_post_shift);
		p_dest[4 * n_offset] = (n_tmp_3 + n_tmp_4) / (1 << n_post_shift);
		p_dest[3 * n_offset] = (n_tmp_3 - n_tmp_4) / (1 << n_post_shift);
	}
};

#ifdef DCT_ENABLE_MMX

/**
 *	@brief vertical step of 8x8 inverse DCT, implemented in MMX
 *
 *	@note This is only declared if DCT_ENABLE_MMX is defined.
 */
class CIDCT8_1D_MMX_Vert {
public:
	/**
	 *	@brief inverse DCT routine
	 *
	 *	Calculates inverse DCT of p_src and stores results in p_dest.
	 *
	 *	@param[out] p_dest contains space for the output samples
	 *	@param[in] p_src contains the input DCT coefficients
	 *
	 *	@note NOTE(review): the amount of data processed by a single call (one
	 *		column or the whole 8x8 block) is not visible in this header, confirm
	 *		against the definition.
	 *	@note NOTE(review): this is declared inline but the definition is not
	 *		in the visible part of this header; it needs to be available in every
	 *		translation unit which calls it.
	 */
	static inline void Do(int16_t *p_dest, const int16_t *p_src);
};

/**
 *	@brief horizontal step of 8x8 inverse DCT, implemented in MMX
 *	@note Divides results by 8 (optimization for JPEG codec).
 *	@note This is only declared if DCT_ENABLE_MMX is defined.
 */
class CIDCT8_1D_MMX_Horiz_Divide8 {
public:
	/**
	 *	@brief inverse DCT routine
	 *
	 *	Calculates inverse DCT of p_src and stores results in p_dest,
	 *		divides results by 8 (optimization for JPEG codec).
	 *
	 *	@param[out] p_dest contains space for 8 output samples
	 *	@param[in] p_src contains 8 input DCT coefficients
	 *
	 *	@note NOTE(review): this is declared inline but the definition is not
	 *		in the visible part of this header; it needs to be available in every
	 *		translation unit which calls it.
	 */
	static inline void Do(int16_t *p_dest, const int16_t *p_src);
};

#endif // DCT_ENABLE_MMX

/**
 *	@brief reference 8x8 DCT
 *
 *	Naive 2D floating-point DCT implementation, used as reference for debugging
 *		the other implementations.
 *
 *	@note It is very slow.
 */
class CReferenceDCT8_2D {
public:
	/**
	 *	@brief calculates 2D (8x8) DCT
	 *
	 *	@param[out] p_dest contains space for 8x8 DCT results
	 *	@param[in] p_src contains 8x8 input samples
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 *	@note NOTE(review): whether p_dest may alias p_src can not be told
	 *		from the declaration, confirm against the definition.
	 */
	static void Forward(float *p_dest, const float *p_src);

	/**
	 *	@brief calculates 2D (8x8) inverse DCT
	 *
	 *	@param[out] p_dest contains space for 8x8 output samples
	 *	@param[in] p_src contains 8x8 input DCT coefficients
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 *	@note NOTE(review): whether p_dest may alias p_src can not be told
	 *		from the declaration, confirm against the definition.
	 */
	static void Inverse(float *p_dest, const float *p_src);
};

/**
 *	@brief 8x8 DCT implementations
 *
 *	Fast 2D DCT implementations, with overloads for floating-point (float)
 *		and integer (int32, int16) data.
 */
class CFastDCT8_2D {
private:
	static const float m_p_prescale_table[8]; /**< @brief table of 8 prescale weights (see p_PrescaleTable()) */

public:
	/**
	 *	@brief gets prescale table for DCT
	 *
	 *	AA&N family DCT need pre-scaled coefficients, this returns table with
	 *		weights.
	 *
	 *	@return Returns pointer to array of 8 float coefficients.
	 *
	 *	@note The coefficients for 8x8 DCT must all be prescaled column-wise and
	 *		row-wise as well.
	 */
	static const float *p_PrescaleTable();

	/**
	 *	@brief calculates 2D (8x8) inverse DCT in floating-point
	 *
	 *	@param[out] p_dest contains space for 8x8 output samples
	 *	@param[in] p_src contains 8x8 input DCT coefficients
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 */
	static void Inverse(float *p_dest, const float *p_src);

	/**
	 *	@brief calculates 2D (8x8) inverse DCT in 32-bit integers
	 *
	 *	@param[out] p_dest contains space for 8x8 output samples
	 *	@param[in] p_src contains 8x8 input DCT coefficients
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 *	@note This integer arithmetic version will be probably faster
	 *		than floating-point version on most systems.
	 */
	static void Inverse(int32_t *p_dest, const int32_t *p_src);

	/**
	 *	@brief calculates 2D (8x8) inverse DCT in 16-bit integers
	 *
	 *	@param[out] p_dest contains space for 8x8 output samples
	 *	@param[in] p_src contains 8x8 input DCT coefficients
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 *	@note This integer arithmetic version will be probably faster
	 *		than floating-point version on most systems.
	 */
	static void Inverse(int16_t *p_dest, const int16_t *p_src);

#ifdef DCT_ENABLE_MMX
	/**
	 *	@brief calculates 2D (8x8) inverse DCT in 16-bit integers, using MMX
	 *
	 *	@param[out] p_dest contains space for 8x8 output samples
	 *	@param[in] p_src contains 8x8 input DCT coefficients
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 *	@note This MMX optimized integer arithmetic version will be probably
	 *		faster than floating-point version on most systems.
	 *	@note This is only declared if DCT_ENABLE_MMX is defined.
	 */
	static void Inverse_MMX(int16_t *p_dest, const int16_t *p_src);
#endif

	/**
	 *	@brief calculates 2D (8x8) DCT in floating-point
	 *
	 *	@param[out] p_dest contains space for 8x8 DCT results
	 *	@param[in] p_src contains 8x8 input samples
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 */
    static void Forward(float *p_dest, const float *p_src);

	/**
	 *	@brief calculates 2D (8x8) DCT in 32-bit integers
	 *
	 *	@param[out] p_dest contains space for 8x8 DCT results
	 *	@param[in] p_src contains 8x8 input samples
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 *	@note This integer arithmetic version will be probably faster
	 *		than floating-point version on most systems.
	 */
	static void Forward(int32_t *p_dest, const int32_t *p_src);

	/**
	 *	@brief calculates 2D (8x8) DCT in 16-bit integers
	 *
	 *	@param[out] p_dest contains space for 8x8 DCT results
	 *	@param[in] p_src contains 8x8 input samples
	 *
	 *	@note Values in p_src, p_dest are stored in row-major manner.
	 *	@note This integer arithmetic version will be probably faster
	 *		than floating-point version on most systems.
	 */
	static void Forward(int16_t *p_dest, const int16_t *p_src);
};

#endif // __DCT_INCLUDED
