/*
 * LRD evaluation engine - SIMD version
 *
 * Known issues:
 * - Image size dependency
 *   Problem:
 *     The classifier parameters are tuned for particular size of image
 *     TStage::ATab and TStage::BTab and also addressing tables should
 *     be provided according to the size.
 *   Solution:
 *     - Addressing tables will be stored within a convolution image (it makes more sense)
 *     - The classifier will hold constant vector of stages and its dynamic
 *       versions calculated for particular image size.
 *    
 *
 *  Correct usage procedure
 *  - The image size must be known
 *  - createConvolutions
 *  - prepareAddressTables
 *  - loadClassifier
 *  - initClassifier
 *  
 *  - convolveImage
 *  - scanConvolvedImage
 *
 */

#include "lrd_engine.h"
#include "lrd_const.h"
#include "imagetools.h"
#include <iostream>
#include <pmmintrin.h>

using namespace std;

/// Time-stamp counter placeholder.
/// The inline-assembly read of the CPU counter is disabled, so the function
/// always yields 0.
inline unsigned long long rdtsc()
{
    // Disabled: __asm__ volatile ("rdtsc" : "=A" (ticks));
    const unsigned long long ticks = 0;
    return ticks;
}


#if MEASURE_PERFORMANCE
// Counters for the scan in progress; scanConvolvedImage() zeroes both when it starts.
// hypotheses: number of stages evaluated (incremented by evalLRDClassifier()).
// windows:    number of scanned window positions.
unsigned long long hypotheses = 0;
unsigned long long windows = 0;

// Totals accumulated by scanConvolvedImage(); cleared by resetScanStats().
unsigned long long totalWindows = 0;
unsigned long long totalHypotheses = 0;

// Cycle counters for the scan loop. scanTSC is cleared by resetScanStats();
// the code that would accumulate it (and use tmpScanTSC) is commented out.
unsigned long long scanTSC = 0;
unsigned long long tmpScanTSC = 0;

// Cycle counters for classifier evaluation. evalTSC is cleared by
// resetScanStats(); the code that would accumulate it (and use tmpEvalTSC)
// is commented out.
unsigned long long evalTSC = 0;
unsigned long long tmpEvalTSC = 0;
#endif


/// Fills in the derived per-stage fields of a classifier.
///
/// For every stage this computes:
///  - szType  (bits 000000wh): block size code built from (h-1) and (w-1),
///  - posType (bits 0000yyxx): stage position modulo 4 in both axes,
///  - alpha:  pointer to the stage's slice of classifier->alpha; consecutive
///            stages are classifier->alphaCount floats apart.
///
/// @param classifier  Classifier to initialise; its stages are modified in place.
/// @return 1 on success, 0 as soon as a stage with an unsupported block size
///         is met (stages before it have already been initialised).
int initClassifier(TClassifier * classifier)
{
    float * stgAlpha = classifier->alpha;

    for (unsigned s = 0; s < classifier->stageCount; ++s, stgAlpha += classifier->alphaCount)
    {
        TStage & stage = classifier->stage[s];

        // Only 1x1, 2x1, 1x2 and 2x2 blocks are supported. A size below 1
        // must be rejected too: (w-1) / (h-1) would otherwise produce an
        // szType outside 0..3, which is later used as a convolution index.
        if (stage.w < 1 || stage.h < 1 || stage.w > 2 || stage.h > 2)
            return 0;

        // get feature type
        // 000000wh
        stage.szType = ((stage.h-1) << 1) | (stage.w-1);
        // get position type
        // 0000yyxx
        stage.posType = ((stage.y & 0x03) << 2) | (stage.x & 0x03);
        // Link alpha table
        stage.alpha = stgAlpha;
    }

    return 1;
}

/*
void updateClassifierParams(TClassifier * classifier, vector<TConvolution*> & conv)
{
    for (unsigned s = 0; s < classifier->stageCount; ++s)
    {
        TStage & stage = classifier->stage[s];
        
        // recalc A and B
        // A, B
        for (int t = 0; t < 4; ++t)
        {
            int & A = stage.rank[2 * t + 0];
            int & B = stage.rank[2 * t + 1];
            A = rankTable[t][int(stage.A)];
            B = rankTable[t][int(stage.B)];
            if (A >= 8) A += conv[stage.szType]->rowStep - 8;
            if (B >= 8) B += conv[stage.szType]->rowStep - 8;
        }
    }
}
*/
/// Computes the per-stage rank offsets for a given set of convolution images.
///
/// For each stage and each of the 4 table variants t, ranks[s][2*t] receives
/// the offset for stage.A and ranks[s][2*t+1] the offset for stage.B, both
/// looked up in rankTable[t]. Table values of 8 or more are rebased by
/// (rowStep - 8) of the convolution selected by the stage's szType, so they
/// become offsets relative to the start of the first row.
///
/// @param ranks       Output array with one Rank per classifier stage.
/// @param classifier  Classifier whose stages are read (szType must be set).
/// @param conv        Convolution images, indexed by szType.
void calculateClassifierRanks(Rank * ranks, const TClassifier * classifier, const vector<TConvolution*> & conv)
{
    for (unsigned s = 0; s < classifier->stageCount; ++s)
    {
        const TStage & stage = classifier->stage[s];
        // Slot 0 of each pair belongs to A, slot 1 to B.
        const int selected[2] = { int(stage.A), int(stage.B) };

        for (int t = 0; t < 4; ++t)
        {
            for (int k = 0; k < 2; ++k)
            {
                int & offset = ranks[s][2 * t + k];
                offset = rankTable[t][selected[k]];
                if (offset >= 8)
                    offset += conv[stage.szType]->rowStep - 8;
            }
        }
    }
}


/// Allocates the four convolution images used by the engine.
///
/// The returned vector holds, in this order, convolutions constructed with
/// block arguments (1,1), (2,1), (1,2) and (2,2) for source size sz.
/// Both the vector and its elements are heap-allocated; releaseConvolutions()
/// frees them.
vector<TConvolution*> * createConvolutions(CvSize sz)
{
    static const int blockArgs[4][2] = { {1, 1}, {2, 1}, {1, 2}, {2, 2} };

    vector<TConvolution*> * convolutions = new vector<TConvolution*>(4);
    for (unsigned i = 0; i < 4; ++i)
    {
        (*convolutions)[i] = new TConvolution(sz, blockArgs[i][0], blockArgs[i][1]);
    }
    return convolutions;
}


/// Destroys a convolution set and clears the caller's pointer.
///
/// Deletes every non-null element of the vector, then the vector itself,
/// and finally stores 0 into *conv. Does nothing when conv or *conv is null.
void releaseConvolutions(vector<TConvolution*> ** conv)
{
    if (!conv || !*conv)
        return;

    vector<TConvolution*> * owned = *conv;

    // Free the individual convolution images first
    for (vector<TConvolution*>::iterator it = owned->begin(); it != owned->end(); ++it)
    {
        if (*it)
            delete *it;
    }

    // Then the container, and reset the caller's handle
    delete owned;
    *conv = 0;
}


/// Builds the column address table for the source width of c[0].
///
/// The table has 4 ints per column x:
///  - entries 0 and 2: x rounded down to an even number, times 2,
///  - entries 1 and 3: x rounded down to a multiple of 4.
///
/// The array is allocated with new[] and is not released by this function.
int * prepareColAddressTable(vector<TConvolution*> & c)
{
    const int width = c[0]->srcSz.width;
    int * table = new int[width * 4];

    int * entry = table;
    for (int x = 0; x < width; ++x, entry += 4)
    {
        const int evenTimesTwo = (x & 0xFFFFFFFE) << 1;
        const int multipleOfFour = (x & 0xFFFFFFFC);

        entry[0] = evenTimesTwo;
        entry[1] = multipleOfFour;
        entry[2] = evenTimesTwo;
        entry[3] = multipleOfFour;
    }
    return table;
}

    
/// Builds the row address table for the source height of c[0].
///
/// The table has 4 ints per row y, one for each convolution c[0]..c[3]:
///  - entries 0 and 1: (y / 2) times the rowStep of c[0] / c[1],
///  - entries 2 and 3: (y / 4) times the rowStep of c[2] / c[3].
///
/// The array is allocated with new[] and is not released by this function.
int * prepareRowAddressTable(vector<TConvolution*> & c)
{
    const int height = c[0]->srcSz.height;
    int * table = new int[height * 4];

    for (int y = 0; y < height; ++y)
    {
        int * entry = table + 4 * y;
        const int halfRow = y >> 1;
        const int quarterRow = y >> 2;

        entry[0] = halfRow * c[0]->rowStep;
        entry[1] = halfRow * c[1]->rowStep;
        entry[2] = quarterRow * c[2]->rowStep;
        entry[3] = quarterRow * c[3]->rowStep;
    }
    return table;
}


/// SIMD evaluation of a single LRD stage.
///
/// Loads two 8-byte rows of the convolution image selected by the stage,
/// compares every byte against the values at the A and B rank offsets,
/// and uses the difference of the two masked sums (+8) as an index into the
/// stage's alpha table.
///
/// @param convolutions  Convolution images, indexed by stg->szType.
/// @param addrX, addrY  Column / row address tables (4 entries per position).
/// @param fx, fy        Position of the evaluated sample in the image.
/// @param sampleModPos  Sample position bits (yyxx0000) OR-ed into the table index.
/// @param stg           Stage to evaluate.
/// @param rank          Rank offsets of this stage (pairs of A/B per mask type).
/// @return The stage response stg->alpha[lrd + 8].
static inline float evalLRDStage(
        const std::vector<TConvolution*> & convolutions,
        const int * addrX, const int * addrY,
        int fx, int fy, int sampleModPos,
        const TStage * stg, const Rank * rank)
{
    // Get the convolution image
    TConvolution * conv = convolutions[stg->szType];

    // Index to tables - depends on feature size, sample position and
    // feature position relative to sample
    const int tableIdx = (stg->szType << 8) | sampleModPos | stg->posType;

    // Get the block of the convolution (depends on feature modulo shift)
    const int blockId = blockTable[tableIdx];

    // Get the mask type
    const int maskType = maskTable[tableIdx];

    // Get address of the feature in image: first row and the row below it
    const int dataOffset = addrX[4 * (fx + stg->x) + stg->szType] + addrY[4 * (fy + stg->y) + stg->szType];
    const signed char * data0 = conv->block[blockId] + dataOffset;
    const signed char * data1 = data0 + conv->rowStep;

    // Get the A and B rank offsets (relative to data0) for this mask type
    const int AOffset = (*rank)[2 * maskType + 0];
    const int BOffset = (*rank)[2 * maskType + 1];

    // Low 64 bits = 8 bytes at data0, high 64 bits = 8 bytes at data1.
    // _mm_loadl_epi64 has no alignment requirement and does not touch the
    // MMX registers, so no _mm_empty() is needed afterwards.
    const __m128i data = _mm_unpacklo_epi64(
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data0)),
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data1)));
    const __m128i zero = _mm_setzero_si128();
    const __m128i mask = masks[maskType].q;

    // For A and B: mark bytes smaller than the reference value, keep the
    // masked ones and sum them per 64-bit half (SAD against zero).
    // NOTE(review): for the sums to be counts, the selected bytes of
    // masks[] presumably hold 0x01 - confirm where masks is defined.
    const __m128i countA = _mm_sad_epu8(
        _mm_and_si128(_mm_cmpgt_epi8(_mm_set1_epi8(data0[AOffset]), data), mask),
        zero);
    const __m128i countB = _mm_sad_epu8(
        _mm_and_si128(_mm_cmpgt_epi8(_mm_set1_epi8(data0[BOffset]), data), mask),
        zero);
    const __m128i diff = _mm_sub_epi16(countA, countB);

    // The per-half results live in 16-bit lanes 0 and 4; they are signed.
    const int lrd = static_cast<short>(_mm_extract_epi16(diff, 0))
                  + static_cast<short>(_mm_extract_epi16(diff, 4));

    return stg->alpha[lrd + 8];
}

#if 0
///////////////////////////////////////////////////////////////////////////////
// Simple eval

/// Mean of regions in a 3x3 grid.
/// Adds the pixels of each of the nine w x h regions to the matching entry
/// of v (row-major, v[0] is the top-left region) and then divides every
/// entry by the region area w*h using integer division.
/// Rows of the image are widthStep bytes apart; v is not cleared here.
static void sumRegions3x3(unsigned char * data, int w, int h, unsigned widthStep, int * v)
{
    const unsigned blockStep = h * widthStep;

    for (int cell = 0; cell < 9; ++cell)
    {
        // Top-left pixel of this grid cell
        const unsigned char * row = data + (cell / 3) * blockStep + (cell % 3) * w;

        for (int y = 0; y < h; ++y, row += widthStep)
        {
            for (int x = 0; x < w; ++x)
                v[cell] += row[x];
        }
    }

    const int area = w * h;
    for (int cell = 0; cell < 9; ++cell)
        v[cell] /= area;
}

/// Scalar reference version of the LRD stage evaluation.
/// Obtains the nine region values of the feature grid from sumRegions3x3(),
/// counts how many of them are smaller than the values at the stage's A and
/// B positions, and returns the alpha entry selected by the difference of
/// the two counts shifted by 8.
static float evalLRDStageSimple(IplImage * image, unsigned smpOffset, TStage * stg)
{
    // Absolute address of the feature's top-left pixel in the image
    unsigned char * base = (unsigned char*)(image->imageData + smpOffset + (stg->x + image->widthStep * stg->y));

    // Region values of the 3x3 grid
    int values[9] = {0};
    sumRegions3x3(base, stg->w, stg->h, image->widthStep, values);

    const int valA = values[stg->A];
    const int valB = values[stg->B];

    // lrd = (regions below A) - (regions below B) + 8
    int lrd = 8;
    for (int i = 0; i < 9; ++i)
    {
        if (valA > values[i]) ++lrd;
        if (valB > values[i]) --lrd;
    }

    // Weak hypothesis response
    return stg->alpha[lrd];
}


/// Evaluates the classifier at one image position using the scalar stage
/// evaluation. Stage responses are accumulated into *response; evaluation
/// stops with 0 as soon as the running sum falls below a stage's theta_b.
/// Otherwise returns 1 when the final sum exceeds classifier->threshold and
/// 0 when it does not.
static inline int evalLRDClassifierSimple(IplImage * image,
        TClassifier * classifier, // The classifier
        const int x, const int y, // Position to evaluate
        float * response) // Response value
{
    *response = 0.0f;
    const int offset = y * image->widthStep + x;

    for (unsigned s = 0; s < classifier->stageCount; ++s)
    {
        TStage * stg = classifier->stage + s;
        *response += evalLRDStageSimple(image, offset, stg);

        if (*response < stg->theta_b) // Negative result
        {
#if MEASURE_PERFORMANCE
            // Stages 0..s were evaluated
            hypotheses += s + 1;
#endif
            return 0;
        }
    }

#if MEASURE_PERFORMANCE
    hypotheses += classifier->stageCount;
#endif

    if (*response > classifier->threshold)
        return 1;
    return 0;
}

/////////////////
#endif


/// Evaluates the classifier at position (x, y) of the convolved image.
///
/// Stage responses from evalLRDStage() are accumulated into *response. The
/// evaluation stops and returns 0 as soon as the running sum drops below the
/// current stage's theta_b (*response then holds the partial sum). When all
/// stages pass, the result is 1 if the sum exceeds classifier->threshold and
/// 0 otherwise.
static inline int evalLRDClassifier(
        const std::vector<TConvolution*> & convolutions, // The input image
        const TClassifier * classifier, const Rank * ranks, // The classifier
        const int * addrX, const int * addrY, // Convolution Address tables
        const int x, const int y, // Position to evaluate
        float * response) // Response value
{
    *response = 0.0f;

    // Sample position modulo 4 in both axes, packed as yyxx0000
    const int samplePos = ((y & 0x03) << 6) | ((x & 0x03) << 4);

    for (unsigned s = 0; s < classifier->stageCount; ++s)
    {
        const TStage & stage = classifier->stage[s];

        *response += evalLRDStage(convolutions, addrX, addrY, x, y, samplePos, &stage, ranks + s);

        if (*response < stage.theta_b) // Negative result
        {
#if MEASURE_PERFORMANCE
            // Stages 0..s were evaluated
            hypotheses += s + 1;
#endif
            return 0;
        }
    }

#if MEASURE_PERFORMANCE
    hypotheses += classifier->stageCount;
#endif

    if (*response > classifier->threshold)
        return 1;
    return 0;
}

// TODO
// - scan step

/// Scans the convolved image with the classifier and records detections.
///
/// The classifier is evaluated at every position (x, y) with
/// 0 <= x < srcWidth - classifier->width and
/// 0 <= y < srcHeight - classifier->height, row by row. Each positive
/// position is written to the output range as a TDetection holding the
/// response and the window rectangle. Scanning stops early once the output
/// range is full.
///
/// @param convolutions  Convolution images; all are assumed to share the
///                      source size of convolutions[0].
/// @param xtable, ytable  Column / row address tables.
/// @param classifier    Classifier to evaluate.
/// @param ranks         Per-stage rank offsets.
/// @param first, last   Output range [first, last) for detections.
/// @return Number of detections written.
unsigned scanConvolvedImage(const std::vector<TConvolution*> & convolutions,
        const int * xtable, const int * ytable,
        const TClassifier * classifier, const Rank * ranks,
        TDetectionList::iterator first, TDetectionList::iterator last)
{
#if MEASURE_PERFORMANCE
    hypotheses = 0;
    windows = 0;
#endif

    // No room for any detection - writing to *first would overrun the range
    if (first == last)
        return 0;

    const CvSize sz = convolutions[0]->srcSz; // assuming all convolutions are for the same input image size!!

    // Signed position counts: when the image is smaller than the classifier
    // window they are <= 0 and the loops simply do not run (an unsigned
    // bound would wrap around to a huge value).
    const int xCount = sz.width - static_cast<int>(classifier->width);
    const int yCount = sz.height - static_cast<int>(classifier->height);

    TDetectionList::iterator det = first;
    bool full = false;

    for (int y = 0; y < yCount && !full; ++y)
    {
        int x = 0;
        for (; x < xCount; ++x)
        {
            float response;
            if (!evalLRDClassifier(convolutions, classifier, ranks, xtable, ytable, x, y, &response))
                continue;

            // Positive response - create the detection record
            TDetection tmp = { response, 0.0f, cvRect(x, y, classifier->width, classifier->height) };
            *det = tmp;
            ++det;
            if (det == last) // No place for another detection - quit
            {
                full = true;
                ++x; // the current window has been evaluated as well
                break;
            }
        } // x

#if MEASURE_PERFORMANCE
        // x is the number of windows actually evaluated in this row
        windows += x;
#endif
    } // y

#if MEASURE_PERFORMANCE
    // Add this scan to the running totals exactly once
    totalWindows += windows;
    totalHypotheses += hypotheses;
#endif

    return static_cast<unsigned>(det - first);
}


/// Clears the accumulated scan statistics (cycle counters and totals).
/// Has no effect when MEASURE_PERFORMANCE is disabled.
void resetScanStats()
{
#if MEASURE_PERFORMANCE
    totalHypotheses = 0;
    totalWindows = 0;
    scanTSC = 0;
    evalTSC = 0;
#endif
}

#if 0
/// Scans an intensity image with the scalar classifier evaluation.
///
/// The classifier is evaluated at every position (x, y) with
/// 0 <= x < image->width - classifier->width and
/// 0 <= y < image->height - classifier->height. Positive positions are
/// written to [first, last) as TDetection records; scanning stops early when
/// the range is full.
///
/// @return Number of detections written.
unsigned scanIntensityImage(IplImage * image,
        TClassifier * classifier,
        TDetections::iterator first, TDetections::iterator last)
{
#if MEASURE_PERFORMANCE
    hypotheses = 0;
    windows = 0;
#endif

    // Signed position counts: an unsigned bound would wrap around when the
    // image is smaller than the classifier window.
    const int xCount = image->width - static_cast<int>(classifier->width);
    const int yCount = image->height - static_cast<int>(classifier->height);

    // Nothing to scan, or no room for any detection (writing to *first
    // would overrun an empty range)
    if (first == last || xCount <= 0 || yCount <= 0)
        return 0;

    TDetections::iterator det = first;

    for (int y = 0; y < yCount; ++y)
    {
        for (int x = 0; x < xCount; ++x)
        {
            float response;
            int d = evalLRDClassifierSimple(image, classifier, x, y, &response);
            //cout << response << ", ";
            if (d) // Positive response
            {
                // create the detection record
                TDetection tmp = { response, 0.0f, cvRect(x, y, classifier->width, classifier->height) };
                *det = tmp;
                ++det;
                if (det == last) // No place for another detection - quit
                {
#if MEASURE_PERFORMANCE
                    windows += x + 1;
#endif
                    return last - first;
                }
            }
        } // x
        //cout << endl;
#if MEASURE_PERFORMANCE
        // Number of windows actually evaluated in this row
        windows += xCount;
#endif
    } // y

#if MEASURE_PERFORMANCE
    cout << double(hypotheses)/windows << ", " << det-first << "; " << endl;
#endif

    return det - first;
}

#endif
/*


#define SHIFT_TYPE(x,y) (((y) & 0x03) << 2) | ((x) & 0x03)
#define POS_TYPE(x,y) (((y) & 0x03) << 2) | ((x) & 0x03)
#define SIZE_TYPE(x,y) ((((y)-1) << 1) | ((x)-1))

template <int x, int y, int szType, int posType>
inline float LRDStageTemplate(
        std::vector<TConvolution*> & convolutions,
        const int * addrX, const int * addrY,
        int fx, int fy, int sampleModPos,
        const int * ATab, const int * BTab, const float * alpha)
{
    // Get the convolution image;
    TConvolution * conv = convolutions[szType];
    
    // index to tables - depends on feature size, sample position and feature position relative to sample
    int tableIdx = (szType << 8) | sampleModPos | posType;
    
    // Get the block of the convolution (depends on feature modulo shift)
    int blockId = blockTable[tableIdx];
    
    // Get the mask type
    int maskType = maskTable[tableIdx];
    
    // Get address of the feature in image
    int dataOffset = addrX[4 * fx + szType] + addrY[4 * fy + szType];
    //signed char * data0 = conv->image + (blockId * conv->blockStep) + dataOffset;
    signed char * data0 = conv->block[blockId] + dataOffset;
    signed char * data1 = data0 + conv->rowStep;
    
    // Get the A nd B rank index according to the shift type
    int AOffset = ATab[2 * maskType + 0];
    int BOffset = BTab[2 * maskType + 1];

    register __m128i data = _mm_set_epi64(*(__m64*)(data1), *(__m64*)(data0));
    register __m128i zero = _mm_setzero_si128();
    
    union {
        __m128i q;
        signed short ss[8];
    } diff = { _mm_sub_epi16(
        _mm_sad_epu8( // countA
            _mm_and_si128(
                _mm_cmpgt_epi8(_mm_set1_epi8(*(data0+AOffset)), data),
                masks[maskType].q),
            zero),
        _mm_sad_epu8( // countB
            _mm_and_si128(
                _mm_cmpgt_epi8(_mm_set1_epi8(*(data0+BOffset)), data),
                masks[maskType].q),
            zero)
        )
    }; 

    int lrd = diff.ss[4] + diff.ss[0];
    _mm_empty();

    return alpha[lrd + 8];
}


int lrdFaceClassifier(vector<TConvolution*> & convolutions, int x, int y, float threshold, const int * addrX, const int * addrY)
{
    float response = 0.0f;
    
    const int ranks[16 * STAGECOUNT] = {0};
    const float alphas[17*3] = {};

    response += LRDStageTemplate<7, 4, SIZE_TYPE(2,2), POS_TYPE(7,4)>
        (convolutions, addrX, addrY, x, y, SHIFT_TYPE(x,y), ranks, ranks, alphas);
    if (response < -1.3) return 0;
    
    response += LRDStageTemplate<13, 6, SIZE_TYPE(2,2), POS_TYPE(13,6)>
        (convolutions, addrX, addrY, x, y, SHIFT_TYPE(x,y), ranks, ranks, alphas + 17);
    if (response < -1.7) return 0;
        
    
    return (response > threshold) ? 1 : 0;
}

*/
