/*
								+---------------------------------+
								|                                 |
								| *** Discrete cos. transform *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2006  |
								|                                 |
								|             DCT.cpp             |
								|                                 |
								+---------------------------------+
*/

/**
 *	@file DCT.cpp
 *	@author -tHE SWINe-
 *	@date 2006
 *	@brief Discrete Cosine Transform implementation
 *
 *	@date 2006-08-22
 *
 *	passed code revision
 *
 *	fixed error in CDCT8_1D and CFastDCT8_2D::Forward
 *	everything is working well now
 *
 *	@date 2006-09-21
 *
 *	passed code revision
 *
 *	added MMX inverse DCT code path
 *
 *  @date 2006-12-27
 *
 *	passed code revision
 *
 *  finally rewritten floating-point FFT's for linux compatibility
 *  note CFastDCT8_2D should preferably be a template (todo)
 *
 *	@date 2008-03-04
 *
 *	now using Integer.h header
 *
 *	@date 2008-08-08
 *
 *	added \#ifdef for windows 64
 *
 *	@date 2009-05-04
 *
 *	fixed mixed windows / linux line endings
 *
 */

#include "NewFix.h"

#include "CallStack.h"
#include <math.h>
#include "DCT.h"

//const float f_pi = 3.14159265358979323846264338327950288419716939937510f;

#if defined(_MSC_VER) && !defined(__MWERKS__) && !defined(for) && _MSC_VER <= 1200
#define for if(0) {} else for
#endif // _MSC_VER && !__MWERKS__ && !for && _MSC_VER <= 1200
// msvc 'for' scoping hack

/*
 *								=== CIDCT8_1D_MMX_Horiz ===
 */

#ifdef DCT_ENABLE_MMX

/**
 *	@brief 1D 8-point inverse DCT over eight consecutive int16 values (MMX), results divided by 8
 *
 *	Reads p_src[0] through p_src[7] and writes p_dest[0] through p_dest[7].
 *	Every result is arithmetically shifted right by 3 bits just before it is stored.
 *
 *	The multipliers 362, 473, 277 and -669 are each followed by a right shift by 8,
 *	i.e. they act as 8.8 fixed-point factors (about 1.414, 1.848, 1.082 and -2.613).
 *
 *	@param[out] p_dest is destination for the 8 results
 *	@param[in] p_src is source of the 8 input coefficients
 *
 *	@note Pointers are loaded as 32-bit values (dword ptr) inside an MSVC-style _asm block,
 *		so this is 32-bit x86 code.
 *	@note Registers mm0 - mm3 are written and no emms is executed here; the MMX state must
 *		be cleared by the caller before the x87 FPU is used (CFastDCT8_2D::Inverse_MMX()
 *		issues the state-clearing instruction after its last pass).
 *	@note Packed additions / subtractions saturate (paddsw / psubsw), while the scalar
 *		16-bit add / sub instructions wrap around.
 *	@note The code uses the ebx register, which is used as frame pointer by some compilers.
 */
void CIDCT8_1D_MMX_Horiz_Divide8::Do(int16_t *p_dest, const int16_t *p_src)
{
	// scratch words that are spilled from / reloaded into registers by the assembly below
	__declspec(align(64)) int16_t n_tmp_0, n_tmp_1, n_tmp_2, n_tmp_3, n_tmp_5, n_tmp_9;

	// register pictures in the comments below list the most significant word first;
	// "!0" marks a word holding leftover bits which are not used afterwards
	_asm {
		// --- pack the inputs into mm0 = s0 | s1 | s2 | s5 and mm1 = s4 | s7 | s6 | s3 ---
		mov edx, dword ptr p_src
		mov ax, [edx] // ax = p_src[0] (upper half of eax is shifted out by the next instruction)
		shl eax, 16
		mov ax, [edx + 2] // eax = p_src[0] | p_src[1]
		movd mm0, eax // mm0 = 0 | 0 | p_src[0] | p_src[1]
		mov ax, [edx + 4]
		shl eax, 16   // eax = p_src[2] | 0
		mov bx, [edx + 6] // bx = p_src[3]
		mov cx, [edx + 8]
		shl ecx, 16	  // ecx = p_src[4] | 0
		mov ax, [edx + 10] // eax = p_src[2] | p_src[5]
		movd mm1, eax // mm1 = 0 | 0 | p_src[2] | p_src[5]
		psllq mm0, 32 // mm0 = p_src[0] | p_src[1] | 0 | 0
		por mm0, mm1  // mm0 = p_src[0] | p_src[1] | p_src[2] | p_src[5]
		mov ax, [edx + 12] // ax = p_src[6]
		shl eax, 16
		mov ax, bx    // eax = p_src[6] | p_src[3]
		mov cx, [edx + 14] // ecx = p_src[4] | p_src[7]
		movd mm1, ecx // mm1 = 0 | 0 | p_src[4] | p_src[7]
		movd mm2, eax // mm2 = 0 | 0 | p_src[6] | p_src[3]
		psllq mm1, 32 // mm1 = p_src[4] | p_src[7] | 0 | 0
		por mm1, mm2  // mm1 = p_src[4] | p_src[7] | p_src[6] | p_src[3]
		// --- four sums and four differences at once ---
		pxor mm2, mm2
		por mm2, mm0  // mm2 = p_src[0] | p_src[1] | p_src[2] | p_src[5] (copy of mm0)
		paddsw mm0, mm1  // mm0 = p_src[0] + p_src[4] | p_src[1] + p_src[7] | p_src[2] + p_src[6] | p_src[5] + p_src[3]
		movd eax, mm0 // eax = p_src[2] + p_src[6] | p_src[5] + p_src[3]
		mov word ptr n_tmp_9, ax // n_tmp_9 = p_src[5] + p_src[3];
		shr eax, 16
		mov cx, ax // cx = n_tmp_7 = p_src[2] + p_src[6] (upper half of ecx is stale, shifted out later)
		psrlq mm0, 32 // mm0 = 0 | 0 | p_src[0] + p_src[4] | p_src[1] + p_src[7]
		movd edx, mm0 // edx = p_src[0] + p_src[4] | p_src[1] + p_src[7]
		psubsw mm2, mm1 // mm2 = p_src[0] - p_src[4] | p_src[1] - p_src[7] | p_src[2] - p_src[6] | p_src[5] - p_src[3]
		movd eax, mm2 // eax = p_src[2] - p_src[6] | p_src[5] - p_src[3]
		shr eax, 16 // eax = 0 | p_src[2] - p_src[6]
		cwde // convert word -> doubleword (eax)
		imul eax, eax, 362 // eax = (p_src[2] - p_src[6]) * m1
		sar eax, 8
		sub ax, cx // eax = !0 | (p_src[2] - p_src[6]) * m1 - n_tmp_7
		psrlq mm2, 48 // mm2 = 0 | 0 | 0 | p_src[0] - p_src[4]
		movd ebx, mm2 // bx = p_src[0] - p_src[4]
		shl ecx, 16
		mov cx, ax // ecx = n_tmp_7 | n_tmp_6
		xchg dx, bx // edx = n_tmp_4 | n_tmp_5, bx = n_tmp_8 (= p_src[1] + p_src[7])
		// --- even part: n_tmp_0 .. n_tmp_3 are spilled to the stack ---
		movd mm0, ecx // mm0 = 0 | 0 | n_tmp_7 | n_tmp_6
		movd mm1, edx // mm1 = 0 | 0 | n_tmp_4 | n_tmp_5
		paddsw mm1, mm0 // mm1 = 0 | 0 | n_tmp_4 + n_tmp_7 | n_tmp_5 + n_tmp_6
		movd eax, mm1 // eax = n_tmp_4 + n_tmp_7 | n_tmp_5 + n_tmp_6
		mov word ptr n_tmp_1, ax // n_tmp_1 = n_tmp_5 + n_tmp_6;
		shr eax, 16
		mov word ptr n_tmp_0, ax // n_tmp_0 = n_tmp_4 + n_tmp_7;
		movd mm1, edx // mm1 = 0 | 0 | n_tmp_4 | n_tmp_5
		psubsw mm1, mm0 // mm1 = 0 | 0 | n_tmp_4 - n_tmp_7 | n_tmp_5 - n_tmp_6
		movd eax, mm1 // eax = n_tmp_4 - n_tmp_7 | n_tmp_5 - n_tmp_6
		mov word ptr n_tmp_2, ax // n_tmp_2 = n_tmp_5 - n_tmp_6;
		shr eax, 16
		mov word ptr n_tmp_3, ax // n_tmp_3 = n_tmp_4 - n_tmp_7;
		// --- odd part ---
		xor eax, eax // eax = 0 | 0
		mov ax, bx // eax = 0 | n_tmp_8
		mov cx, word ptr n_tmp_9 // cx = n_tmp_9
		add bx, cx // bx = n_tmp_9 + n_tmp_8
		mov word ptr n_tmp_9, bx // n_tmp_9 += n_tmp_8;
		sub ax, cx // eax = 0 | n_tmp_8 - n_tmp_9 (n_tmp_9 as it was before the update above)
		cwde
		imul eax, eax, 362
		sar eax, 8
		mov word ptr n_tmp_5, ax // n_tmp_5 = (int16_t)(((n_tmp_8 - n_tmp_9) * 362) >> 8);
		// --- first two outputs; bx still holds the updated n_tmp_9 ---
		mov edx, dword ptr p_dest
		mov ax, word ptr n_tmp_0 // ax = n_tmp_0
		mov cx, ax
		add ax, bx
		sar ax, 3 // hack - faster divide by 8 here
		mov [edx], ax // p_dest[0] = (n_tmp_0 + n_tmp_9) >> 3;
		sub cx, bx
		sar cx, 3 // hack - faster divide by 8 here
		mov [edx + 14], cx // p_dest[7] = (n_tmp_0 - n_tmp_9) >> 3;
		// --- the remaining odd terms are recomputed from the source ---
		mov edx, dword ptr p_src
		mov cx, [edx + 2]
		mov ax, [edx + 14]
		sub cx, ax // cx = n_tmp_8 = p_src[1] - p_src[7];
		mov ax, [edx + 10]
		mov dx, [edx + 6] // overwrites the low half of edx, which is no longer a valid pointer from here on
		sub ax, dx // ax = n_tmp_0 = p_src[5] - p_src[3];
		mov dx, cx
		shl edx, 16
		mov dx, ax
		movd mm0, edx
		pxor mm3, mm3
		punpcklwd mm0, mm3 // mm0 = 0 | n_tmp_8 | 0 | n_tmp_0
		add ax, cx
		cwde
		imul eax, eax, 473
		sar eax, 8 // eax = !0 | n_tmp_7
		mov cx, ax
		neg cx
		shl eax, 16
		mov ax, cx // eax = n_tmp_7 | -n_tmp_7
		movd mm1, eax // mm1 = 0 | 0 | n_tmp_7 | -n_tmp_7
		punpcklwd mm1, mm3 // mm1 = 0 | n_tmp_7 | 0 | -n_tmp_7
		mov eax, 0x0115fd63 // eax = 277 | -669
		movd mm2, eax // mm2 = 0 | 0 | 277 | -669
		punpcklwd mm2, mm3 // mm2 = 0 | 277 | 0 | -669
		pmaddwd mm0, mm2 // mm0 =  n_tmp_8 * 277 | n_tmp_0 * -669 (two 32-bit products, the zero words add nothing)
		psrad mm0, 8 // (n_tmp_8 * 277) >> 8 | (n_tmp_0 * -669) >> 8
		psubsw mm0, mm1 // mm0 = !0 | ((n_tmp_8 * 277) >> 8) - n_tmp_7 | !0 | ((n_tmp_0 * -669) >> 8) + n_tmp_7
		movd eax, mm0 // eax = !0 | ((n_tmp_0 * -669) >> 8) + n_tmp_7
		psrlq mm0, 32
		movd ecx, mm0 // ecx = !0 | ((n_tmp_8 * 277) >> 8) - n_tmp_7
		sub ax, bx // ax = n_tmp_6 -= n_tmp_9;
		mov bx, word ptr n_tmp_5
		sub bx, ax // bx = n_tmp_5 -= n_tmp_6;
		add cx, bx // cx = n_tmp_4 += n_tmp_5;
		// --- pack mm0 = n_tmp_4 | -n_tmp_4 | n_tmp_5 | n_tmp_6 ---
		shl ebx, 16
		mov bx, ax
		movd mm1, ebx
		mov dx, cx
		neg dx
		shl ecx, 16
		mov cx, dx
		movd mm0, ecx
		psllq mm0, 32
		por mm0, mm1 // mm0 = n_tmp_4 | -n_tmp_4 | n_tmp_5 | n_tmp_6
		// --- pack mm1 = mm3 = n_tmp_3 | n_tmp_3 | n_tmp_2 | n_tmp_1 ---
		mov ax, word ptr n_tmp_3
		mov bx, ax
		shl eax, 16
		mov ax, bx
		movd mm1, eax
		psllq mm1, 32 // mm1 = n_tmp_3 | n_tmp_3 | 0 | 0
		mov ax, word ptr n_tmp_2
		shl eax, 16
		mov ax, word ptr n_tmp_1
		movd mm2, eax
		por mm1, mm2 // mm1 = n_tmp_3 | n_tmp_3 | n_tmp_2 | n_tmp_1
		por mm3, mm1 // mm3 = n_tmp_3 | n_tmp_3 | n_tmp_2 | n_tmp_1 (mm3 was zero)
		// --- final butterflies, divide by 8 and store ---
		paddsw mm1, mm0 // mm1 = n_tmp_3 + n_tmp_4 | n_tmp_3 - n_tmp_4 | n_tmp_2 + n_tmp_5 | n_tmp_1 + n_tmp_6
		mov edx, dword ptr p_dest
		psraw mm1, 3 // hack - faster divide by 8 here
		movq [edx + 2], mm1 // fill p_dest[1] ... p_dest[4] (lowest word goes to p_dest[1])
		psubsw mm3, mm0 // mm3 = n_tmp_3 - n_tmp_4 | n_tmp_3 + n_tmp_4 | n_tmp_2 - n_tmp_5 | n_tmp_1 - n_tmp_6
		psraw mm3, 3 // hack - faster divide by 8 here
		movd eax, mm3 // eax = n_tmp_2 - n_tmp_5 | n_tmp_1 - n_tmp_6
		mov [edx + 12], ax // p_dest[6] = (n_tmp_1 - n_tmp_6) >> 3;
		shr eax, 16
		mov [edx + 10], ax // p_dest[5] = (n_tmp_2 - n_tmp_5) >> 3;
	}
}

/*
 *								=== ~CIDCT8_1D_MMX_Horiz ===
 */

/*
 *								=== CIDCT8_1D_MMX_Vert ===
 */

/**
 *	@brief 1D 8-point inverse DCT over eight int16 values spaced 8 elements apart (MMX)
 *
 *	Reads p_src[0], p_src[8], ... p_src[56] and writes p_dest[0], p_dest[8], ... p_dest[56]
 *	(the byte offsets used below are 8 * 2 * k). Results are stored without any final shift.
 *
 *	In the instruction comments, "p_src[k]" stands for the k-th input of this pass,
 *	i.e. the element at p_src[8 * k].
 *
 *	The multipliers 362, 473, 277 and -669 are each followed by a right shift by 8,
 *	i.e. they act as 8.8 fixed-point factors (about 1.414, 1.848, 1.082 and -2.613).
 *
 *	@param[out] p_dest is destination of the 8 results (stride 8 elements)
 *	@param[in] p_src is source of the 8 input coefficients (stride 8 elements)
 *
 *	@note Pointers are loaded as 32-bit values (dword ptr) inside an MSVC-style _asm block,
 *		so this is 32-bit x86 code.
 *	@note Registers mm0 - mm3 are written and no emms is executed here; the MMX state must
 *		be cleared by the caller before the x87 FPU is used (CFastDCT8_2D::Inverse_MMX()
 *		issues the state-clearing instruction after its last pass).
 *	@note Packed additions / subtractions saturate (paddsw / psubsw), while the scalar
 *		16-bit add / sub instructions wrap around.
 *	@note The code uses the ebx register, which is used as frame pointer by some compilers.
 */
void CIDCT8_1D_MMX_Vert::Do(int16_t *p_dest, const int16_t *p_src)
{
	// scratch words that are spilled from / reloaded into registers by the assembly below
	__declspec(align(64)) int16_t n_tmp_0, n_tmp_1, n_tmp_2, n_tmp_3, n_tmp_5, n_tmp_9;

	// register pictures in the comments below list the most significant word first;
	// "!0" marks a word holding leftover bits which are not used afterwards
	_asm {
		// --- pack the inputs into mm0 = s0 | s1 | s2 | s5 and mm1 = s4 | s7 | s6 | s3 ---
		mov edx, dword ptr p_src
		mov ax, [edx] // ax = p_src[0] (upper half of eax is shifted out by the next instruction)
		shl eax, 16
		mov ax, [edx + 8 * 2] // eax = p_src[0] | p_src[1]
		movd mm0, eax // mm0 = 0 | 0 | p_src[0] | p_src[1]
		mov ax, [edx + 8 * 4]
		shl eax, 16   // eax = p_src[2] | 0
		mov bx, [edx + 8 * 6] // bx = p_src[3]
		mov cx, [edx + 8 * 8]
		shl ecx, 16	  // ecx = p_src[4] | 0
		mov ax, [edx + 8 * 10] // eax = p_src[2] | p_src[5]
		movd mm1, eax // mm1 = 0 | 0 | p_src[2] | p_src[5]
		psllq mm0, 32 // mm0 = p_src[0] | p_src[1] | 0 | 0
		por mm0, mm1  // mm0 = p_src[0] | p_src[1] | p_src[2] | p_src[5]
		mov ax, [edx + 8 * 12] // ax = p_src[6]
		shl eax, 16
		mov ax, bx    // eax = p_src[6] | p_src[3]
		mov cx, [edx + 8 * 14] // ecx = p_src[4] | p_src[7]
		movd mm1, ecx // mm1 = 0 | 0 | p_src[4] | p_src[7]
		movd mm2, eax // mm2 = 0 | 0 | p_src[6] | p_src[3]
		psllq mm1, 32 // mm1 = p_src[4] | p_src[7] | 0 | 0
		por mm1, mm2  // mm1 = p_src[4] | p_src[7] | p_src[6] | p_src[3]
		// --- four sums and four differences at once ---
		pxor mm2, mm2
		por mm2, mm0  // mm2 = p_src[0] | p_src[1] | p_src[2] | p_src[5] (copy of mm0)
		paddsw mm0, mm1  // mm0 = p_src[0] + p_src[4] | p_src[1] + p_src[7] | p_src[2] + p_src[6] | p_src[5] + p_src[3]
		movd eax, mm0 // eax = p_src[2] + p_src[6] | p_src[5] + p_src[3]
		mov word ptr n_tmp_9, ax // n_tmp_9 = p_src[5] + p_src[3];
		shr eax, 16
		mov cx, ax // cx = n_tmp_7 = p_src[2] + p_src[6] (upper half of ecx is stale, shifted out later)
		psrlq mm0, 32 // mm0 = 0 | 0 | p_src[0] + p_src[4] | p_src[1] + p_src[7]
		movd edx, mm0 // edx = p_src[0] + p_src[4] | p_src[1] + p_src[7]
		psubsw mm2, mm1 // mm2 = p_src[0] - p_src[4] | p_src[1] - p_src[7] | p_src[2] - p_src[6] | p_src[5] - p_src[3]
		movd eax, mm2 // eax = p_src[2] - p_src[6] | p_src[5] - p_src[3]
		shr eax, 16 // eax = 0 | p_src[2] - p_src[6]
		cwde // convert word -> doubleword (eax)
		imul eax, eax, 362 // eax = (p_src[2] - p_src[6]) * m1
		sar eax, 8
		sub ax, cx // eax = !0 | (p_src[2] - p_src[6]) * m1 - n_tmp_7
		psrlq mm2, 48 // mm2 = 0 | 0 | 0 | p_src[0] - p_src[4]
		movd ebx, mm2 // bx = p_src[0] - p_src[4]
		shl ecx, 16
		mov cx, ax // ecx = n_tmp_7 | n_tmp_6
		xchg dx, bx // edx = n_tmp_4 | n_tmp_5, bx = n_tmp_8 (= p_src[1] + p_src[7])
		// --- even part: n_tmp_0 .. n_tmp_3 are spilled to the stack ---
		movd mm0, ecx // mm0 = 0 | 0 | n_tmp_7 | n_tmp_6
		movd mm1, edx // mm1 = 0 | 0 | n_tmp_4 | n_tmp_5
		paddsw mm1, mm0 // mm1 = 0 | 0 | n_tmp_4 + n_tmp_7 | n_tmp_5 + n_tmp_6
		movd eax, mm1 // eax = n_tmp_4 + n_tmp_7 | n_tmp_5 + n_tmp_6
		mov word ptr n_tmp_1, ax // n_tmp_1 = n_tmp_5 + n_tmp_6;
		shr eax, 16
		mov word ptr n_tmp_0, ax // n_tmp_0 = n_tmp_4 + n_tmp_7;
		movd mm1, edx // mm1 = 0 | 0 | n_tmp_4 | n_tmp_5
		psubsw mm1, mm0 // mm1 = 0 | 0 | n_tmp_4 - n_tmp_7 | n_tmp_5 - n_tmp_6
		movd eax, mm1 // eax = n_tmp_4 - n_tmp_7 | n_tmp_5 - n_tmp_6
		mov word ptr n_tmp_2, ax // n_tmp_2 = n_tmp_5 - n_tmp_6;
		shr eax, 16
		mov word ptr n_tmp_3, ax // n_tmp_3 = n_tmp_4 - n_tmp_7;
		// --- odd part ---
		xor eax, eax // eax = 0 | 0
		mov ax, bx // eax = 0 | n_tmp_8
		mov cx, word ptr n_tmp_9 // cx = n_tmp_9
		add bx, cx // bx = n_tmp_9 + n_tmp_8
		mov word ptr n_tmp_9, bx // n_tmp_9 += n_tmp_8;
		sub ax, cx // eax = 0 | n_tmp_8 - n_tmp_9 (n_tmp_9 as it was before the update above)
		cwde
		imul eax, eax, 362
		sar eax, 8
		mov word ptr n_tmp_5, ax // n_tmp_5 = (int16_t)(((n_tmp_8 - n_tmp_9) * 362) >> 8);
		// --- first two outputs; bx still holds the updated n_tmp_9 ---
		mov edx, dword ptr p_dest
		mov ax, word ptr n_tmp_0 // ax = n_tmp_0
		mov cx, ax
		add ax, bx
		mov [edx], ax // p_dest[0] = (n_tmp_0 + n_tmp_9);
		sub cx, bx
		mov [edx + 8 * 14], cx // p_dest[7] = (n_tmp_0 - n_tmp_9);
		// --- the remaining odd terms are recomputed from the source ---
		mov edx, dword ptr p_src
		mov cx, [edx + 8 * 2]
		mov ax, [edx + 8 * 14]
		sub cx, ax // cx = n_tmp_8 = p_src[1] - p_src[7];
		mov ax, [edx + 8 * 10]
		mov dx, [edx + 8 * 6] // overwrites the low half of edx, which is no longer a valid pointer from here on
		sub ax, dx // ax = n_tmp_0 = p_src[5] - p_src[3];
		mov dx, cx
		shl edx, 16
		mov dx, ax
		movd mm0, edx
		pxor mm3, mm3
		punpcklwd mm0, mm3 // mm0 = 0 | n_tmp_8 | 0 | n_tmp_0
		add ax, cx
		cwde
		imul eax, eax, 473
		sar eax, 8 // eax = !0 | n_tmp_7
		mov cx, ax
		neg cx
		shl eax, 16
		mov ax, cx // eax = n_tmp_7 | -n_tmp_7
		movd mm1, eax // mm1 = 0 | 0 | n_tmp_7 | -n_tmp_7
		punpcklwd mm1, mm3 // mm1 = 0 | n_tmp_7 | 0 | -n_tmp_7
		mov eax, 0x0115fd63 // eax = 277 | -669
		movd mm2, eax // mm2 = 0 | 0 | 277 | -669
		punpcklwd mm2, mm3 // mm2 = 0 | 277 | 0 | -669
		pmaddwd mm0, mm2 // mm0 =  n_tmp_8 * 277 | n_tmp_0 * -669 (two 32-bit products, the zero words add nothing)
		psrad mm0, 8 // (n_tmp_8 * 277) >> 8 | (n_tmp_0 * -669) >> 8
		psubsw mm0, mm1 // mm0 = !0 | ((n_tmp_8 * 277) >> 8) - n_tmp_7 | !0 | ((n_tmp_0 * -669) >> 8) + n_tmp_7
		movd eax, mm0 // eax = !0 | ((n_tmp_0 * -669) >> 8) + n_tmp_7
		psrlq mm0, 32
		movd ecx, mm0 // ecx = !0 | ((n_tmp_8 * 277) >> 8) - n_tmp_7
		sub ax, bx // ax = n_tmp_6 -= n_tmp_9;
		mov bx, word ptr n_tmp_5
		sub bx, ax // bx = n_tmp_5 -= n_tmp_6;
		add cx, bx // cx = n_tmp_4 += n_tmp_5;
		// --- pack mm0 = n_tmp_4 | -n_tmp_4 | n_tmp_5 | n_tmp_6 ---
		shl ebx, 16
		mov bx, ax
		movd mm1, ebx
		mov dx, cx
		neg dx
		shl ecx, 16
		mov cx, dx
		movd mm0, ecx
		psllq mm0, 32
		por mm0, mm1 // mm0 = n_tmp_4 | -n_tmp_4 | n_tmp_5 | n_tmp_6
		// --- pack mm1 = mm3 = n_tmp_3 | n_tmp_3 | n_tmp_2 | n_tmp_1 ---
		mov ax, word ptr n_tmp_3
		mov bx, ax
		shl eax, 16
		mov ax, bx
		movd mm1, eax
		psllq mm1, 32 // mm1 = n_tmp_3 | n_tmp_3 | 0 | 0
		mov ax, word ptr n_tmp_2
		shl eax, 16
		mov ax, word ptr n_tmp_1
		movd mm2, eax
		por mm1, mm2 // mm1 = n_tmp_3 | n_tmp_3 | n_tmp_2 | n_tmp_1
		por mm3, mm1 // mm3 = n_tmp_3 | n_tmp_3 | n_tmp_2 | n_tmp_1 (mm3 was zero)
		// --- final butterflies; outputs are scattered word by word because of the stride ---
		paddsw mm1, mm0 // mm1 = n_tmp_3 + n_tmp_4 | n_tmp_3 - n_tmp_4 | n_tmp_2 + n_tmp_5 | n_tmp_1 + n_tmp_6
		mov edx, dword ptr p_dest
		movd eax, mm1
		mov [edx + 8 * 2], ax // p_dest[1] = n_tmp_1 + n_tmp_6;
		shr eax, 16
		mov [edx + 8 * 4], ax // p_dest[2] = n_tmp_2 + n_tmp_5;
		psrlq mm1, 32
		movd eax, mm1
		mov [edx + 8 * 6], ax // p_dest[3] = n_tmp_3 - n_tmp_4;
		shr eax, 16
		mov [edx + 8 * 8], ax // fill p_dest[1] ... p_dest[4] (can't use movq any more)
		psubsw mm3, mm0 // mm3 = n_tmp_3 - n_tmp_4 | n_tmp_3 + n_tmp_4 | n_tmp_2 - n_tmp_5 | n_tmp_1 - n_tmp_6
		movd eax, mm3 // eax = n_tmp_2 - n_tmp_5 | n_tmp_1 - n_tmp_6
		mov [edx + 8 * 12], ax // p_dest[6] = n_tmp_1 - n_tmp_6;
		shr eax, 16
		mov [edx + 8 * 10], ax // p_dest[5] = n_tmp_2 - n_tmp_5;
	}
}

#endif // DCT_ENABLE_MMX

/*
 *								=== ~CIDCT8_1D_MMX_Vert ===
 */

/*
 *								=== CReferenceDCT8 ===
 */

/**
 *	@brief reference (brute-force) forward 2D DCT of a single 8x8 block
 *
 *	Every output coefficient is computed directly as a sum over all 64 input samples:
 *
 *		p_dest[u + v * 8] = .25 * c(u) * c(v) *
 *			sum over x, y of p_src[x + y * 8] * cos((2x + 1) * u * pi / 16) * cos((2y + 1) * v * pi / 16)
 *
 *	where c(0) = 1 / sqrt(2) and c(k) = 1 otherwise.
 *
 *	@param[out] p_dest is destination for the 64 coefficients (may be the same buffer as p_src)
 *	@param[in] p_src is source of the 64 input samples
 */
void CReferenceDCT8_2D::Forward(float *p_dest, const float *p_src)
{
	static const float f_cos_table[64] = {
		1.0f, .980785f, .923880f, .831470f, .707107f, .555570f, .382683f, .195090f,
		1.0f, .831470f, .382683f, -.195090f, -.707107f, -.980785f, -.923880f, -.555570f,
		1.0f, .555570f, -.382683f, -.980785f, -.707107f, .195090f, .923880f, .831470f,
		1.0f, .195090f, -.923880f, -.555570f, .707107f, .831470f, -.382684f, -.980785f,
		1.0f, -.195090f, -.923880f, .555570f, .707107f, -.831470f, -.382683f, .980785f,
		1.0f, -.555570f, -.382683f, .980785f, -.707107f, -.195090f, .923879f, -.831470f,
		1.0f, -.831470f, .382684f, .195090f, -.707107f, .980785f, -.923880f, .555571f,
		1.0f, -.980785f, .923880f, -.831470f, .707107f, -.555571f, .382684f, -.195091f
	};
	// f_cos_table[k + n * 8] = cos((2n + 1) * k * pi / 16); static so that
	// the table is not rebuilt on the stack with every call

	const float f_inv_sqrt2 = .70710678118654752440084436210485f;
	// scale of the zero-frequency basis function

	float p_input[64];
	for(int i = 0; i < 64; i ++)
		p_input[i] = p_src[i];
	// work on a snapshot of the input; the outputs are written while all of the inputs
	// are still needed, so reading p_src directly would break in-place calls (p_dest == p_src)

	for(int u = 0; u < 8; u ++) {
		for(int v = 0; v < 8; v ++) {
			float &r_f_coeff = p_dest[u + v * 8];
			// coefficient being accumulated (same accumulation order as before)

			r_f_coeff = 0;
			for(int x = 0; x < 8; x ++) {
				for(int y = 0; y < 8; y ++) {
					r_f_coeff += p_input[x + y * 8] *
						f_cos_table[u + x * 8] * f_cos_table[v + y * 8];
				}
			}
			r_f_coeff *= .25f;
			if(!u)
				r_f_coeff *= f_inv_sqrt2;
			if(!v)
				r_f_coeff *= f_inv_sqrt2;
		}
	}
}

/**
 *	@brief reference inverse 2D DCT of a single 8x8 block, done as two 1D passes
 *
 *	The first pass transforms along the row index (for every column) into a temporary
 *	block, the second pass transforms along the column index into the destination.
 *	Each pass is scaled by one half and the 1 / sqrt(2) weight of the zero-frequency
 *	term is folded into the first column of the cosine table.
 *
 *	All of p_src is consumed before the first write to p_dest, so the two may be the same buffer.
 *
 *	@param[out] p_dest is destination for the 64 reconstructed samples
 *	@param[in] p_src is source of the 64 coefficients
 */
void CReferenceDCT8_2D::Inverse(float *p_dest, const float *p_src)
{
	static const float f_cos_table[64] = {
		.707107f, .980785f, .923880f, .831470f, .707107f, .555570f, .382683f, .195090f,
		.707107f, .831470f, .382683f, -.195090f, -.707107f, -.980785f, -.923880f, -.555570f,
		.707107f, .555570f, -.382683f, -.980785f, -.707107f, .195090f, .923880f, .831470f,
		.707107f, .195090f, -.923880f, -.555570f, .707107f, .831470f, -.382684f, -.980785f,
		.707107f, -.195090f, -.923880f, .555570f, .707107f, -.831470f, -.382683f, .980785f,
		.707107f, -.555570f, -.382683f, .980785f, -.707107f, -.195090f, .923879f, -.831470f,
		.707107f, -.831470f, .382684f, .195090f, -.707107f, .980785f, -.923880f, .555571f,
		.707107f, -.980785f, .923880f, -.831470f, .707107f, -.555571f, .382684f, -.195091f
	};
	// f_cos_table[k + n * 8] = c(k) * cos((2n + 1) * k * pi / 16), c(0) = 1 / sqrt(2), c(k) = 1
	// otherwise; static so that the table is not rebuilt on the stack with every call

	float p_tmp[64];

	for(int x = 0; x < 8; x ++) {
		for(int y = 0; y < 8; y ++) {
			float &r_f_sum = p_tmp[y + x * 8];
			r_f_sum = 0;
			for(int u = 0; u < 8; u ++)
				r_f_sum += p_src[y + u * 8] * f_cos_table[u + x * 8];
			r_f_sum *= .5f; // float literal (was a double one); halving is exact, the result is unchanged
		}
	}
	// first pass: p_tmp row x, column y is built from column y of p_src

	for(int x = 0; x < 8; x ++) {
		for(int y = 0; y < 8; y ++) {
			float &r_f_sum = p_dest[x + y * 8];
			r_f_sum = 0;
			for(int u = 0; u < 8; u ++)
				r_f_sum += p_tmp[u + y * 8] * f_cos_table[u + x * 8];
			r_f_sum *= .5f;
		}
	}
	// second pass: p_dest row y, column x is built from row y of p_tmp
}

/*
 *								=== ~CReferenceDCT8 ===
 */

/*
 *								=== CFastDCT8 ===
 */

/**
 *	@brief table of eight per-frequency scale factors, exposed through p_PrescaleTable()
 *
 *	Entry 0 is 1 and entries 1 through 7 equal sqrt(2) * cos(k * pi / 16)
 *	to the printed precision (which makes entry 4 equal to 1 as well).
 *
 *	NOTE(review): presumably these are the per-coefficient factors that the scaled
 *	fast DCT below leaves out - confirm against DCT.h and the callers.
 */
const float CFastDCT8_2D::m_p_prescale_table[8] = {1.0f, 1.387039845f,
	1.306562965f, 1.175875602f, 1.0f, 0.785694958f, 0.541196100f, 0.275899379f};

/**
 *	@brief gets the table of prescale factors
 *	@return Returns pointer to the first of the 8 elements of m_p_prescale_table.
 */
const float *CFastDCT8_2D::p_PrescaleTable()
{
	const float *p_table = &m_p_prescale_table[0];
	return p_table;
}

// multipliers handed to CFDCT8_1D by CFastDCT8_2D::Forward(float*, const float*);
// numerically they correspond to the following (to the printed precision):
const float TFDCTConstants_float::m1 = .707107f; // cos(pi / 4)
const float TFDCTConstants_float::m2 = .382683f; // cos(3 * pi / 8)
const float TFDCTConstants_float::m3 = .541196f; // cos(pi / 8) - cos(3 * pi / 8)
const float TFDCTConstants_float::m4 = 1.306563f; // cos(pi / 8) + cos(3 * pi / 8)

//#ifdef WIN32 // g++ forbids float to be template argument
/**
 *	@brief fast forward 2D DCT of a single 8x8 block of floats
 *
 *	Runs eight 1D transforms instantiated with the template argument 8 (one per starting
 *	offset 0 to 7) from the source into a temporary block, then eight 1D transforms
 *	instantiated with the argument 1 (one per group of 8 elements) from the temporary
 *	block into the destination, and finally multiplies all 64 outputs by 1/8.
 *
 *	NOTE(review): the argument 8 / 1 is presumably the element stride, which would make
 *	the first pass vertical and the second one horizontal - confirm in DCT.h.
 *
 *	@param[out] p_dest is destination for the 64 coefficients
 *	@param[in] p_src is source of the 64 input samples
 */
void CFastDCT8_2D::Forward(float *p_dest, const float *p_src)
{
	float p_intermediate[64];

	// first pass, source -> temporary
	for(int n_start = 0; n_start < 8; ++ n_start) {
		CFDCT8_1D<float, 8, 0, TFDCTConstants_float>::Do(p_intermediate + n_start,
			p_src + n_start);
	}

	// second pass, temporary -> destination
	for(int n_offset = 0; n_offset < 64; n_offset += 8) {
		CFDCT8_1D<float, 1, 0, TFDCTConstants_float>::Do(p_dest + n_offset,
			p_intermediate + n_offset);
	}

	// final scaling by one eighth
	for(int i = 0; i < 64; ++ i)
		p_dest[i] *= .125f;
}
//#endif

// multipliers handed to CFDCT8_1D by CFastDCT8_2D::Forward(int32_t*, const int32_t*);
// fixed-point with 16 fractional bits (0x10000 = 1.0, which matches the template
// argument 16 used there); the cast to int32_t truncates towards zero
//
// NOTE(review): every value carries an extra factor of .707107f which the float set
// (TFDCTConstants_float) does not have - confirm against CFDCT8_1D in DCT.h that
// this is intended
const int32_t TFDCTConstants_int::m1 = (int32_t)(.707107f * .707107f * 0x10000);
const int32_t TFDCTConstants_int::m2 = (int32_t)(.707107f * .382683f * 0x10000);
const int32_t TFDCTConstants_int::m3 = (int32_t)(.707107f * .541196f * 0x10000);
const int32_t TFDCTConstants_int::m4 = (int32_t)(.707107f * 1.306563f * 0x10000);

/**
 *	@brief fast forward 2D DCT of a single 8x8 block of 32-bit integers
 *
 *	Runs eight 1D transforms instantiated with the template argument 8 (one per starting
 *	offset 0 to 7) from the source into a temporary block, then eight 1D transforms
 *	instantiated with the argument 1 (one per group of 8 elements) from the temporary
 *	block into the destination, and finally shifts all 64 outputs right by 3 bits.
 *
 *	NOTE(review): the argument 8 / 1 is presumably the element stride and the argument 16
 *	the number of fractional bits of TFDCTConstants_int - confirm in DCT.h.
 *
 *	@param[out] p_dest is destination for the 64 coefficients
 *	@param[in] p_src is source of the 64 input samples
 */
void CFastDCT8_2D::Forward(int32_t *p_dest, const int32_t *p_src)
{
	int32_t p_intermediate[64];

	// first pass, source -> temporary
	for(int n_start = 0; n_start < 8; ++ n_start) {
		CFDCT8_1D<int32_t, 8, 16, TFDCTConstants_int>::Do(p_intermediate + n_start,
			p_src + n_start);
	}

	// second pass, temporary -> destination
	for(int n_offset = 0; n_offset < 64; n_offset += 8) {
		CFDCT8_1D<int32_t, 1, 16, TFDCTConstants_int>::Do(p_dest + n_offset,
			p_intermediate + n_offset);
	}

	// final division by 8, as a shift
	for(int i = 0; i < 64; ++ i)
		p_dest[i] >>= 3;
}

// multipliers handed to CFDCT8_1D by CFastDCT8_2D::Forward(int16_t*, const int16_t*);
// fixed-point with 8 fractional bits (0x100 = 1.0, which matches the template
// argument 8 used there); the cast to int16_t truncates towards zero
//
// NOTE(review): as in TFDCTConstants_int, every value carries an extra factor of
// .707107f which the float set does not have - confirm against CFDCT8_1D in DCT.h
const int16_t TFDCTConstants_short::m1 = (int16_t)(.707107f * .707107f * 0x100);
const int16_t TFDCTConstants_short::m2 = (int16_t)(.707107f * .382683f * 0x100);
const int16_t TFDCTConstants_short::m3 = (int16_t)(.707107f * .541196f * 0x100);
const int16_t TFDCTConstants_short::m4 = (int16_t)(.707107f * 1.306563f * 0x100);

/**
 *	@brief fast forward 2D DCT of a single 8x8 block of 16-bit integers
 *
 *	Runs eight 1D transforms instantiated with the template argument 8 (one per starting
 *	offset 0 to 7) from the source into a temporary block, then eight 1D transforms
 *	instantiated with the argument 1 (one per group of 8 elements) from the temporary
 *	block into the destination, and finally shifts all 64 outputs right by 3 bits.
 *
 *	NOTE(review): the argument 8 / 1 is presumably the element stride and the argument 8
 *	that follows the number of fractional bits of TFDCTConstants_short - confirm in DCT.h.
 *
 *	@param[out] p_dest is destination for the 64 coefficients
 *	@param[in] p_src is source of the 64 input samples
 */
void CFastDCT8_2D::Forward(int16_t *p_dest, const int16_t *p_src)
{
	int16_t p_intermediate[64];

	// first pass, source -> temporary
	for(int n_start = 0; n_start < 8; ++ n_start) {
		CFDCT8_1D<int16_t, 8, 8, TFDCTConstants_short>::Do(p_intermediate + n_start,
			p_src + n_start);
	}

	// second pass, temporary -> destination
	for(int n_offset = 0; n_offset < 64; n_offset += 8) {
		CFDCT8_1D<int16_t, 1, 8, TFDCTConstants_short>::Do(p_dest + n_offset,
			p_intermediate + n_offset);
	}

	// final division by 8, as a shift
	for(int i = 0; i < 64; ++ i)
		p_dest[i] >>= 3;
}

// multipliers handed to CIDCT8_1D by CFastDCT8_2D::Inverse(float*, const float*);
// numerically they correspond to the following (to the printed precision):
const float TIDCTConstants_float::m1 = 1.414213562f; // sqrt(2) = 2 * cos(pi / 4)
const float TIDCTConstants_float::m2 = 1.847759065f; // 2 * cos(pi / 8)
const float TIDCTConstants_float::m3 = 1.082392200f; // 2 * (cos(pi / 8) - cos(3 * pi / 8))
const float TIDCTConstants_float::m4 = -2.613125930f; // -2 * (cos(pi / 8) + cos(3 * pi / 8))

/**
 *	@brief fast inverse 2D DCT of a single 8x8 block of floats
 *
 *	Runs eight 1D inverse transforms instantiated with the template argument 8 (one per
 *	starting offset 0 to 7) from the source into a temporary block, then eight 1D inverse
 *	transforms instantiated with the argument 1 (one per group of 8 elements) from the
 *	temporary block into the destination. There is no separate scaling loop here.
 *
 *	NOTE(review): the second pass passes an extra template argument 3, presumably
 *	a built-in division by 8 replacing the former "*= .125f" loop - confirm in DCT.h.
 *
 *	@param[out] p_dest is destination for the 64 reconstructed samples
 *	@param[in] p_src is source of the 64 coefficients
 */
void CFastDCT8_2D::Inverse(float *p_dest, const float *p_src)
{
	float p_intermediate[64];

	// first pass, source -> temporary
	for(int n_start = 0; n_start < 8; ++ n_start) {
		CIDCT8_1D<float, 8, 0, TIDCTConstants_float>::Do(p_intermediate + n_start,
			p_src + n_start);
	}

	// second pass, temporary -> destination
	for(int n_offset = 0; n_offset < 64; n_offset += 8) {
		CIDCT8_1D<float, 1, 0, TIDCTConstants_float, 3>::Do(p_dest + n_offset,
			p_intermediate + n_offset);
	}
}

// multipliers handed to CIDCT8_1D by CFastDCT8_2D::Inverse(int32_t*, const int32_t*);
// the same four values as in TIDCTConstants_float, as fixed-point with 16 fractional
// bits (0x10000 = 1.0, which matches the template argument 16 used there);
// the cast to int32_t truncates towards zero rather than rounding to nearest
const int32_t TIDCTConstants_int::m1 = (int32_t)(1.414213562f * 0x10000);
const int32_t TIDCTConstants_int::m2 = (int32_t)(1.847759065f * 0x10000);
const int32_t TIDCTConstants_int::m3 = (int32_t)(1.082392200f * 0x10000);
const int32_t TIDCTConstants_int::m4 = (int32_t)(-2.613125930f * 0x10000);

/**
 *	@brief fast inverse 2D DCT of a single 8x8 block of 32-bit integers
 *
 *	Runs eight 1D inverse transforms instantiated with the template argument 8 (one per
 *	starting offset 0 to 7) from the source into a temporary block, then eight 1D inverse
 *	transforms instantiated with the argument 1 (one per group of 8 elements) from the
 *	temporary block into the destination. There is no separate scaling loop here.
 *
 *	NOTE(review): the second pass passes an extra template argument 3, presumably
 *	a built-in right shift by 3 bits (division by 8) - confirm in DCT.h.
 *
 *	@param[out] p_dest is destination for the 64 reconstructed samples
 *	@param[in] p_src is source of the 64 coefficients
 */
void CFastDCT8_2D::Inverse(int32_t *p_dest, const int32_t *p_src)
{
	int32_t p_intermediate[64];

	// first pass, source -> temporary
	for(int n_start = 0; n_start < 8; ++ n_start) {
		CIDCT8_1D<int32_t, 8, 16, TIDCTConstants_int>::Do(p_intermediate + n_start,
			p_src + n_start);
	}

	// second pass, temporary -> destination
	for(int n_offset = 0; n_offset < 64; n_offset += 8) {
		CIDCT8_1D<int32_t, 1, 16, TIDCTConstants_int, 3>::Do(p_dest + n_offset,
			p_intermediate + n_offset);
	}
}

// multipliers handed to CIDCT8_1D by CFastDCT8_2D::Inverse(int16_t*, const int16_t*);
// the same four values as in TIDCTConstants_float, as fixed-point with 8 fractional
// bits (0x100 = 1.0, which matches the template argument 8 used there)
//
// the cast to int16_t truncates towards zero, the values come out as 362, 473, 277 and -668
//
// NOTE(review): the MMX code paths in this file multiply by 362, 473, 277 and -669,
// so the last constant differs by one between the C++ and the MMX implementation -
// decide whether the two are supposed to give identical results
const int16_t TIDCTConstants_short::m1 = (int16_t)(1.414213562f * 0x100);
const int16_t TIDCTConstants_short::m2 = (int16_t)(1.847759065f * 0x100);
const int16_t TIDCTConstants_short::m3 = (int16_t)(1.082392200f * 0x100);
const int16_t TIDCTConstants_short::m4 = (int16_t)(-2.613125930f * 0x100);

/**
 *	@brief fast inverse 2D DCT of a single 8x8 block of 16-bit integers
 *
 *	Runs eight 1D inverse transforms instantiated with the template argument 8 (one per
 *	starting offset 0 to 7) from the source into a temporary block, then eight 1D inverse
 *	transforms instantiated with the argument 1 (one per group of 8 elements) from the
 *	temporary block into the destination. There is no separate scaling loop here.
 *
 *	NOTE(review): the second pass passes an extra template argument 3, presumably
 *	a built-in right shift by 3 bits (division by 8) - confirm in DCT.h.
 *
 *	@param[out] p_dest is destination for the 64 reconstructed samples
 *	@param[in] p_src is source of the 64 coefficients
 */
void CFastDCT8_2D::Inverse(int16_t *p_dest, const int16_t *p_src)
{
	int16_t p_intermediate[64];

	// first pass, source -> temporary
	for(int n_start = 0; n_start < 8; ++ n_start) {
		CIDCT8_1D<int16_t, 8, 8, TIDCTConstants_short>::Do(p_intermediate + n_start,
			p_src + n_start);
	}

	// second pass, temporary -> destination
	for(int n_offset = 0; n_offset < 64; n_offset += 8) {
		CIDCT8_1D<int16_t, 1, 8, TIDCTConstants_short, 3>::Do(p_dest + n_offset,
			p_intermediate + n_offset);
	}
}

#ifdef DCT_ENABLE_MMX

/**
 *	@brief fast inverse 2D DCT of a single 8x8 block of 16-bit integers, MMX code path
 *
 *	Runs CIDCT8_1D_MMX_Vert for each of the eight starting offsets 0 to 7 from the source
 *	into a temporary block, then CIDCT8_1D_MMX_Horiz_Divide8 for each group of 8 elements
 *	from the temporary block into the destination. The division by 8 is part of the second
 *	pass. The MMX state is cleared before returning so that the caller can use the x87 FPU.
 *
 *	@param[out] p_dest is destination for the 64 reconstructed samples
 *	@param[in] p_src is source of the 64 coefficients
 */
void CFastDCT8_2D::Inverse_MMX(int16_t *p_dest, const int16_t *p_src)
{
	__declspec(align(64)) int16_t p_tmp[64];

	for(int16_t *p_row = p_tmp, *p_end = p_tmp + 8; p_row < p_end;)
		CIDCT8_1D_MMX_Vert::Do(p_row ++, p_src ++);
	// vertical pass first (one call per starting offset)

	for(int16_t *p_line = p_dest, *p_src_line = p_tmp, *p_end = p_dest + 64; p_line < p_end;
	   p_line += 8, p_src_line += 8)
		   CIDCT8_1D_MMX_Horiz_Divide8::Do(p_line, p_src_line); // divide by 8 here (to better exploit MMX)
	// then the horizontal pass ... and we're done

	//for(int16_t *p_end = p_dest + 64; p_dest < p_end;)
	//	*p_dest ++ >>= 3; // division made in t_horiz_dct

	_asm {
		emms
		// must be emms, not femms: femms is a 3DNow! instruction and raises an invalid
		// opcode exception on processors without 3DNow!, while emms is available
		// wherever MMX is
	}
}

#endif // DCT_ENABLE_MMX

/*
 *								=== ~CFastDCT8 ===
 */
