/*
 * File:   main.c
 * Author: Pavel Najman <najman.pavel at gmail.com>
 *
 * Created on April 28, 2017, 8:08 AM
 */

#include <stdio.h>
#include <assert.h>
#include <time.h>
#include <malloc.h>
#include <stdlib.h>

#ifdef _OPENMP
    #include <omp.h>
#endif

#ifndef NO_SSE
    #ifndef NO_FMA
        #include <immintrin.h>
    #endif
        #include <xmmintrin.h>
        #include <smmintrin.h>
#endif

#include "common.h"

//#include <iacaMarks.h>

/*
 * Forward declarations of the transform routines defined later in this file.
 * Exactly one of the four groups below is declared, selected at compile time
 * by the NO_SSE and NO_BARRIER macros.  Every group consists of a driver
 * (nsp*), a predict step and an update step; note that the parameter lists of
 * the predict/update steps differ between the groups.
 */
#if defined(NO_SSE) && defined(NO_BARRIER)
void nsp_no_SSE_no_barrier(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info);
void nsp_predict_with_unpack_no_SSE_no_barrier(const Tile * tile, TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp);
void nsp_update_unpacked_no_SSE_no_barrier(TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp_mem);
#elif defined NO_SSE
void nsp_no_SSE(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info);
void nsp_predict_with_unpack_no_SSE(const Tile * tile, TileBands * bands, const BandsThreadingInfo * threading_info);
void nsp_update_unpacked_no_SSE(TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp_mem, int mem_flag);
#elif defined NO_BARRIER
void nsp_no_barrier(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info);
void nsp_predict_with_unpack_no_barrier(const Tile * tile, TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp_mem);
void nsp_update_unpacked_no_barrier(TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp_mem);
#else
void nsp(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info);
void nsp_predict_with_unpack(const Tile * tile, TileBands * bands, const BandsThreadingInfo * threading_info);
void nsp_update_unpacked(TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp_mem, int mem_flag);
#endif

/*
 * Called once from main() with the generated "<size_x>_<tile_size_x>.mat"
 * file name before any timing is done.  The definition is not part of this
 * section of the file.
 */
void test_nsp(const char * filename, Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info);

/*
 * Timing entry points.  main() calls measure_nsp() with the selected
 * TestConfig and prints the returned value divided by 1000; the remaining
 * three are presumably the per-configuration back ends (definitions are not
 * visible here -- confirm before relying on that).
 */
double measure_nsp(Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info, TestConfig config);
double measure_nsp_tile(Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info);
double measure_nsp_image(Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info);
double measure_nsp_tiles_in_image(Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info);

/*
 * Converts one command-line argument to a strictly positive size.
 *
 * text: the raw argv entry.
 * what: human-readable name of the option, used in the diagnostic.
 *
 * Prints a message to stderr and terminates the process with EXIT_FAILURE
 * when the value is zero, negative or not a number; such values would
 * otherwise be cast to size_t and end up as absurd thread counts or sizes.
 */
static size_t main_parse_positive_arg(const char * text, const char * what)
{
    const long value = strtol(text, NULL, 10);

    if (value <= 0) {
        fprintf(stderr, "%s must be a positive integer (got \"%s\")\n", what, text);
        exit(EXIT_FAILURE);
    }

    return (size_t) value;
}

/*
 * Benchmark driver.
 *
 * Positional arguments (all optional, each one requires the previous ones):
 *   argv[1]  num_sockets   (default 1)
 *   argv[2]  num_threads   (default 1)
 *   argv[3]  size_x, also used for size_y            (default 512)
 *   argv[4]  size_y
 *   argv[5]  tile_size_x, also used for tile_size_y  (default 256)
 *   argv[6]  test configuration, cast to TestConfig  (default IMAGE)
 *   argv[7]  tile_size_y
 *
 * One Image/Bands/BandsThreadingInfo/TmpMem set is created per socket, the
 * transform is checked with test_nsp(), timed with measure_nsp() and the
 * measured value divided by 1000 is printed on stdout.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when an argument is invalid or the
 * descriptor arrays cannot be allocated.
 */
int main(int argc, char** argv) {
    omp_set_nested(1);

    TestConfig config = IMAGE;

    size_t num_sockets = 1;
    size_t num_threads = 1;

    if (argc > 1)
        num_sockets = main_parse_positive_arg(argv[1], "num_sockets");

    if (argc > 2)
        num_threads = main_parse_positive_arg(argv[2], "num_threads");

    size_t size_x = 512, size_y = 512;
    size_t tile_size_x = 256, tile_size_y = 256;

    if (argc > 3)
        size_x = size_y = main_parse_positive_arg(argv[3], "size_x");

    if (argc > 4)
        size_y = main_parse_positive_arg(argv[4], "size_y");

    if (argc > 5)
        tile_size_x = tile_size_y = main_parse_positive_arg(argv[5], "tile_size_x");

    if (argc > 6)
        config = (TestConfig)atoi(argv[6]);

    if (argc > 7)
        tile_size_y = main_parse_positive_arg(argv[7], "tile_size_y");

    /* Checked at run time (not with assert) so the guard survives NDEBUG builds. */
    if (size_x % 8 != 0 || size_y % 8 != 0) {
        fprintf(stderr, "transform size must be a multiple of 8 (got %zu x %zu)\n", size_x, size_y);
        return EXIT_FAILURE;
    }

#ifdef NO_SSE
    Image * img = memalign(16, num_sockets * sizeof(Image));
    Bands * bands = memalign(16, num_sockets * sizeof(Bands));
    BandsThreadingInfo * threading_info = memalign(16, num_sockets * sizeof(BandsThreadingInfo));
    TmpMem * mem = memalign(16, num_sockets * sizeof(TmpMem));
#else
    Image * img = _mm_malloc(num_sockets * sizeof(Image), 16);
    Bands * bands = _mm_malloc(num_sockets * sizeof(Bands), 16);
    BandsThreadingInfo * threading_info = _mm_malloc(num_sockets * sizeof(BandsThreadingInfo), 16);
    TmpMem * mem = _mm_malloc(num_sockets * sizeof(TmpMem), 16);
#endif
    if (img == NULL || bands == NULL || threading_info == NULL || mem == NULL) {
        /* The process is about to exit, so the partial allocations are not released. */
        fprintf(stderr, "failed to allocate the per-socket descriptor arrays\n");
        return EXIT_FAILURE;
    }

    /* One thread per socket performs the allocations for its own slot. */
    #pragma omp parallel proc_bind(spread) num_threads(num_sockets)
    {
        int sid = omp_get_thread_num();

        allocate_image(&img[sid], size_x, size_y);

        set_tile_size(&img[sid], tile_size_x, tile_size_y);

        allocate_bands(&bands[sid], &img[sid]);

        allocate_bands_threading_info(&threading_info[sid], num_threads);

        allocate_tmp_mem(&mem[sid], num_threads, tile_size_x);
    }

    for(size_t sid = 0; sid < num_sockets; ++sid){
        /* Re-seeding before every image keeps the rand() sequence the same for each socket. */
        srand(0);
        init_image(&img[sid], tile_size_x, tile_size_y);
        init_bands_threading_info(&threading_info[sid], &bands[sid], num_sockets, num_threads);
    }

    char filename[50];
    snprintf(filename, sizeof filename, "%zu_%zu.mat", size_x, tile_size_x);

    test_nsp(filename, img, bands, mem, threading_info);

    const double result = measure_nsp(img, bands, mem, threading_info, config);
    printf("%f\n", result/1000.0);

    #pragma omp parallel proc_bind(spread) num_threads(num_sockets)
    {
        int sid = omp_get_thread_num();
        free_image(&img[sid]);
        free_bands(&bands[sid]);
        free_bands_threading_info(&threading_info[sid]);
        free_tmp_mem(&mem[sid]);
    }
#ifdef NO_SSE
    free(img);
    free(bands);
    free(threading_info);
    free(mem);
#else
    _mm_free(img);
    _mm_free(bands);
    _mm_free(threading_info);
    _mm_free(mem);
#endif

    return EXIT_SUCCESS;
}

#ifdef NO_SSE
/*
 * Scalar predict step for one output position.
 *
 * Reads the 3x3 neighbourhood m0[0..2], m1[0..2], m2[0..2] and writes one
 * value through each of ll, hl, lh and hh.  All inputs are read before any
 * output is written.  ALPHA weights the directly adjacent samples, ALPHA2
 * the diagonal one (m2[2]).
 */
static inline void nsp_predict_with_unpack_no_SSE_kernel(float * m0, float * m1, float * m2, float * ll, float * hl, float *lh, float * hh, float ALPHA, float ALPHA2)
{
    const float top_l = m0[0], top_c = m0[1], top_r = m0[2];
    const float mid_l = m1[0], mid_c = m1[1], mid_r = m1[2];
    const float bot_l = m2[0], bot_c = m2[1], bot_r = m2[2];

    float hl_out = top_c;
    float lh_out = mid_l;
    float hh_out = mid_c;

    hl_out += ALPHA * (top_r + top_l);

    /* hh consumes lh after only the first half of the lh update. */
    lh_out += ALPHA * (bot_l);
    hh_out += ALPHA2 * bot_r + ALPHA * (bot_c + mid_r + lh_out);
    lh_out += ALPHA * top_l;

    hh_out += ALPHA * hl_out;

    *ll = top_l;
    *hl = hl_out;
    *hh = hh_out;
    *lh = lh_out;
}

/*
 * Scalar predict step for the last column of a row.
 *
 * Same arithmetic as the regular predict kernel, but the samples that would
 * come from offset +2 are replaced by the ones at offset 0 (mirroring), so
 * only m0[0..1], m1[0..1] and m2[0..1] are read.  Writes one value through
 * each of ll, hl, lh and hh after all inputs have been read.
 */
static inline void nsp_predict_with_unpack_no_SSE_symmetric_extension_kernel(float * m0, float * m1, float * m2, float * ll, float * hl, float *lh, float * hh, float ALPHA, float ALPHA2)
{
    const float top_l = m0[0], top_c = m0[1];
    const float mid_l = m1[0], mid_c = m1[1];
    const float bot_l = m2[0], bot_c = m2[1];

    float hl_out = top_c;
    float lh_out = mid_l;
    float hh_out = mid_c;

    /* The right-hand neighbour is the mirrored left-hand one. */
    hl_out += ALPHA * (top_l + top_l);

    lh_out += ALPHA * (bot_l);
    hh_out += ALPHA2 * bot_l + ALPHA * (bot_c + mid_l + lh_out);
    lh_out += ALPHA * top_l;

    hh_out += ALPHA * hl_out;

    *ll = top_l;
    *hl = hl_out;
    *hh = hh_out;
    *lh = lh_out;
}

/*
 * Scalar update step for one position.
 *
 * Reads *ll, *hl, *lh, *hh, the left neighbours hl[-1] and hh[-1], and from
 * the second row supplied by the caller lh1[0], hh1[0] and hh1[-1].  Updates
 * *lh, *ll and *hl in place; hh is only read.  Every input is fetched before
 * the first store.  BETA weights the adjacent samples, BETA2 the diagonal
 * one (hh1[-1]).
 */
static inline void NO_TREE_VECTORIZE nsp_update_unpacked_no_SSE_kernel(float * ll, float * lh, float * hl, float * hh, float *lh1, float *hh1, float BETA, float BETA2)
{
    const float hh_here = hh[0], hh_left = hh[-1];
    const float hl_left = hl[-1];
    const float lh_other = lh1[0];
    const float hh_other = hh1[0], hh_other_left = hh1[-1];

    float ll_acc = *ll;
    float hl_acc = *hl;
    float lh_acc = *lh;

    ll_acc += BETA2 * (hh_other_left) + BETA * (lh_other + hl_left);
    hl_acc += BETA * hh_other;
    lh_acc += BETA * (hh_left + hh_here);
    /* ll picks up hl after its first and lh after its only correction. */
    ll_acc += BETA * (hl_acc + lh_acc);
    hl_acc += BETA * hh_here;

    *lh = lh_acc;
    *ll = ll_acc;
    *hl = hl_acc;
}

/*
 * Scalar update step for the first column of a row.
 *
 * Same arithmetic as the regular update kernel, but every sample that would
 * come from offset -1 is replaced by the one at offset 0 (mirroring), so no
 * element to the left of the given pointers is touched.  Updates *lh, *ll
 * and *hl in place; hh, lh1 and hh1 are only read.
 */
static inline void NO_TREE_VECTORIZE nsp_update_unpacked_no_SSE_symmetric_extension_kernel(float * ll, float * lh, float * hl, float * hh, float *lh1, float *hh1, float BETA, float BETA2)
{
    const float hh_here = *hh;
    const float hl_mirror = *hl;      /* stands in for hl[-1], taken before hl is updated */
    const float lh_other = *lh1;
    const float hh_other = *hh1;      /* also stands in for hh1[-1] */

    float ll_acc = *ll;
    float hl_acc = hl_mirror;
    float lh_acc = *lh;

    ll_acc += BETA2 * (hh_other) + BETA * (lh_other + hl_mirror);
    hl_acc += BETA * hh_other;
    lh_acc += BETA * (hh_here + hh_here);
    ll_acc += BETA * (hl_acc + lh_acc);
    hl_acc += BETA * hh_here;

    *lh = lh_acc;
    *ll = ll_acc;
    *hl = hl_acc;
}
#else
/*
 * Vector predict step producing four consecutive outputs per band.
 *
 * RxCy is the y-th 4-float group of input row x.  Groups C0 and C1 of each
 * row are de-interleaved (even lanes / odd lanes); only lane 0 of the C2
 * groups is used, as the right-hand neighbour of the last lane.  The results
 * are written with aligned stores to ll, hl, hh and lh.  With NO_FMA defined
 * every multiply-add is a separate multiply and add, otherwise it is fused.
 */
static inline void nsp_predict_with_unpack_kernel(__m128 R0C0, __m128 R0C1, __m128 R0C2, __m128 R1C0, __m128 R1C1, __m128 R1C2, __m128 R2C0, __m128 R2C1, __m128 R2C2, float * ll, float * lh, float *hl, float * hh, __m128 ALPHA, __m128 ALPHA2)
{
    /* Split each row into its even-lane and odd-lane samples. */
    const __m128 ll_cur = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 hl_cur = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(3, 1, 3, 1));

    __m128 hh_cur = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 lh_cur = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(2, 0, 2, 0));

    const __m128 ll_row2 = _mm_shuffle_ps(R2C0, R2C1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 hl_row2 = _mm_shuffle_ps(R2C0, R2C1, _MM_SHUFFLE(3, 1, 3, 1));

    /* Shift one lane down and pull lane 0 of the following group into lane 3. */
    const __m128 ll_next = _mm_insert_ps(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(ll_cur), 4)), R0C2, 0x30);
    const __m128 lh_next = _mm_insert_ps(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(lh_cur), 4)), R1C2, 0x30);
    const __m128 ll_row2_next = _mm_insert_ps(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(ll_row2), 4)), R2C2, 0x30);

    const __m128 ll_pair = _mm_add_ps(ll_cur, ll_next);
#ifdef NO_FMA
    hl_cur = _mm_add_ps(_mm_mul_ps(ll_pair, ALPHA), hl_cur);
    lh_cur = _mm_add_ps(_mm_mul_ps(ll_row2, ALPHA), lh_cur);
#else
    hl_cur = _mm_fmadd_ps(ll_pair, ALPHA, hl_cur);
    lh_cur = _mm_fmadd_ps(ll_row2, ALPHA, lh_cur);
#endif

    /* hh correction: ALPHA * (hl + lh + hl_row2 + lh_next) + ALPHA2 * ll_row2_next */
    const __m128 diagonal = _mm_mul_ps(ll_row2_next, ALPHA2);
    __m128 hh_delta = _mm_add_ps(_mm_add_ps(hl_cur, lh_cur), _mm_add_ps(hl_row2, lh_next));
#ifdef NO_FMA
    hh_delta = _mm_add_ps(_mm_mul_ps(hh_delta, ALPHA), diagonal);
#else
    hh_delta = _mm_fmadd_ps(hh_delta, ALPHA, diagonal);
#endif
    hh_cur = _mm_add_ps(hh_cur, hh_delta);

    /* Second half of the lh update, applied after hh has consumed lh. */
#ifdef NO_FMA
    lh_cur = _mm_add_ps(_mm_mul_ps(ll_cur, ALPHA), lh_cur);
#else
    lh_cur = _mm_fmadd_ps(ll_cur, ALPHA, lh_cur);
#endif

    _mm_store_ps(ll, ll_cur);
    _mm_store_ps(hl, hl_cur);
    _mm_store_ps(hh, hh_cur);
    _mm_store_ps(lh, lh_cur);
}

/*
 * Vector predict step for the last four outputs of a row.
 *
 * Identical to the regular vector predict kernel except that no third
 * 4-float group exists: the right-hand neighbour of the last lane is the
 * last lane itself (lane 3 is copied into lane 3 of the shifted vector).
 * Results are written with aligned stores to ll, hl, hh and lh.
 */
static inline void nsp_predict_with_unpack_symmetric_extension_kernel(__m128 R0C0, __m128 R0C1, __m128 R1C0, __m128 R1C1, __m128 R2C0, __m128 R2C1, float * ll, float * lh, float *hl, float * hh, __m128 ALPHA, __m128 ALPHA2)
{
    /* Split each row into its even-lane and odd-lane samples. */
    const __m128 ll_cur = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 hl_cur = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(3, 1, 3, 1));

    __m128 hh_cur = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 lh_cur = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(2, 0, 2, 0));

    const __m128 ll_row2 = _mm_shuffle_ps(R2C0, R2C1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 hl_row2 = _mm_shuffle_ps(R2C0, R2C1, _MM_SHUFFLE(3, 1, 3, 1));

    /* Shift one lane down; lane 3 is refilled with the vector's own lane 3. */
    const __m128 ll_next = _mm_insert_ps(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(ll_cur), 4)), ll_cur, 0xF0);
    const __m128 lh_next = _mm_insert_ps(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(lh_cur), 4)), lh_cur, 0xF0);
    const __m128 ll_row2_next = _mm_insert_ps(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(ll_row2), 4)), ll_row2, 0xF0);

    const __m128 ll_pair = _mm_add_ps(ll_cur, ll_next);
#ifdef NO_FMA
    hl_cur = _mm_add_ps(_mm_mul_ps(ll_pair, ALPHA), hl_cur);
    lh_cur = _mm_add_ps(_mm_mul_ps(ll_row2, ALPHA), lh_cur);
#else
    hl_cur = _mm_fmadd_ps(ll_pair, ALPHA, hl_cur);
    lh_cur = _mm_fmadd_ps(ll_row2, ALPHA, lh_cur);
#endif

    /* hh correction: ALPHA * (hl + lh + hl_row2 + lh_next) + ALPHA2 * ll_row2_next */
    const __m128 diagonal = _mm_mul_ps(ll_row2_next, ALPHA2);
    __m128 hh_delta = _mm_add_ps(_mm_add_ps(hl_cur, lh_cur), _mm_add_ps(hl_row2, lh_next));
#ifdef NO_FMA
    hh_delta = _mm_add_ps(_mm_mul_ps(hh_delta, ALPHA), diagonal);
#else
    hh_delta = _mm_fmadd_ps(hh_delta, ALPHA, diagonal);
#endif
    hh_cur = _mm_add_ps(hh_cur, hh_delta);

    /* Second half of the lh update, applied after hh has consumed lh. */
#ifdef NO_FMA
    lh_cur = _mm_add_ps(_mm_mul_ps(ll_cur, ALPHA), lh_cur);
#else
    lh_cur = _mm_fmadd_ps(ll_cur, ALPHA, lh_cur);
#endif

    _mm_store_ps(ll, ll_cur);
    _mm_store_ps(hl, hl_cur);
    _mm_store_ps(hh, hh_cur);
    _mm_store_ps(lh, lh_cur);
}

/*
 * Vector update step for four consecutive positions.
 *
 * Performs aligned loads of ll, hl, lh, hh, lh1 and hh1, plus aligned loads
 * of the preceding groups hl - 4, hh - 4 and hh1 - 4, whose lane 3 supplies
 * the left neighbour of lane 0.  lh, ll and hl are updated in place with
 * aligned stores; hh, lh1 and hh1 are only read.  With NO_FMA defined every
 * multiply-add is a separate multiply and add, otherwise it is fused.
 */
static inline void nsp_update_unpacked_kernel(float * ll, float * lh, float * hl, float * hh, float *lh1, float *hh1, __m128 BETA, __m128 BETA2)
{
    __m128 ll_cur = _mm_load_ps(ll);
    __m128 hl_cur = _mm_load_ps(hl);
    __m128 lh_cur = _mm_load_ps(lh);
    const __m128 hh_cur = _mm_load_ps(hh);

    const __m128 lh_other = _mm_load_ps(lh1);
    const __m128 hh_other = _mm_load_ps(hh1);

    /* Shift one lane up and pull lane 3 of the preceding group into lane 0. */
    const __m128 hl_prev = _mm_insert_ps(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(hl_cur), 4)), _mm_load_ps(hl - 4), 0xC0);
    const __m128 hh_prev = _mm_insert_ps(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(hh_cur), 4)), _mm_load_ps(hh - 4), 0xC0);
    const __m128 hh_other_prev = _mm_insert_ps(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(hh_other), 4)), _mm_load_ps(hh1 - 4), 0xC0);

    const __m128 hh_pair = _mm_add_ps(hh_prev, hh_cur);
#ifdef NO_FMA
    lh_cur = _mm_add_ps(_mm_mul_ps(hh_pair, BETA), lh_cur);
    hl_cur = _mm_add_ps(_mm_mul_ps(hh_other, BETA), hl_cur);
#else
    lh_cur = _mm_fmadd_ps(hh_pair, BETA, lh_cur);
    hl_cur = _mm_fmadd_ps(hh_other, BETA, hl_cur);
#endif

    /* ll correction: BETA * (lh + hl + lh_other + hl_prev) + BETA2 * hh_other_prev */
    const __m128 diagonal = _mm_mul_ps(hh_other_prev, BETA2);
    __m128 ll_delta = _mm_add_ps(_mm_add_ps(lh_cur, hl_cur), _mm_add_ps(lh_other, hl_prev));
#ifdef NO_FMA
    ll_delta = _mm_add_ps(_mm_mul_ps(ll_delta, BETA), diagonal);
#else
    ll_delta = _mm_fmadd_ps(ll_delta, BETA, diagonal);
#endif
    ll_cur = _mm_add_ps(ll_cur, ll_delta);

    /* Second half of the hl update, applied after ll has consumed hl. */
#ifdef NO_FMA
    hl_cur = _mm_add_ps(_mm_mul_ps(hh_cur, BETA), hl_cur);
#else
    hl_cur = _mm_fmadd_ps(hh_cur, BETA, hl_cur);
#endif

    _mm_store_ps(lh, lh_cur);
    _mm_store_ps(ll, ll_cur);
    _mm_store_ps(hl, hl_cur);
}

/*
 * Vector update step for the first four positions of a row.
 *
 * Identical to the regular vector update kernel except that nothing is
 * loaded from before the given pointers: the left neighbour of lane 0 is
 * lane 0 itself.  lh, ll and hl are updated in place with aligned stores;
 * hh, lh1 and hh1 are only read.
 */
static inline void nsp_update_unpacked_symmetric_extension_kernel(float * ll, float * lh, float * hl, float * hh, float *lh1, float *hh1, __m128 BETA, __m128 BETA2)
{
    __m128 ll_cur = _mm_load_ps(ll);
    __m128 hl_cur = _mm_load_ps(hl);
    __m128 lh_cur = _mm_load_ps(lh);
    const __m128 hh_cur = _mm_load_ps(hh);

    const __m128 lh_other = _mm_load_ps(lh1);
    const __m128 hh_other = _mm_load_ps(hh1);

    /* Shift one lane up; lane 0 is refilled with the vector's own lane 0. */
    const __m128 hl_prev = _mm_insert_ps(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(hl_cur), 4)), hl_cur, 0);
    const __m128 hh_prev = _mm_insert_ps(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(hh_cur), 4)), hh_cur, 0);
    const __m128 hh_other_prev = _mm_insert_ps(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(hh_other), 4)), hh_other, 0);

    const __m128 hh_pair = _mm_add_ps(hh_prev, hh_cur);
#ifdef NO_FMA
    lh_cur = _mm_add_ps(_mm_mul_ps(hh_pair, BETA), lh_cur);
    hl_cur = _mm_add_ps(_mm_mul_ps(hh_other, BETA), hl_cur);
#else
    lh_cur = _mm_fmadd_ps(hh_pair, BETA, lh_cur);
    hl_cur = _mm_fmadd_ps(hh_other, BETA, hl_cur);
#endif

    /* ll correction: BETA * (lh + hl + lh_other + hl_prev) + BETA2 * hh_other_prev */
    const __m128 diagonal = _mm_mul_ps(hh_other_prev, BETA2);
    __m128 ll_delta = _mm_add_ps(_mm_add_ps(lh_cur, hl_cur), _mm_add_ps(lh_other, hl_prev));
#ifdef NO_FMA
    ll_delta = _mm_add_ps(_mm_mul_ps(ll_delta, BETA), diagonal);
#else
    ll_delta = _mm_fmadd_ps(ll_delta, BETA, diagonal);
#endif
    ll_cur = _mm_add_ps(ll_cur, ll_delta);

    /* Second half of the hl update, applied after ll has consumed hl. */
#ifdef NO_FMA
    hl_cur = _mm_add_ps(_mm_mul_ps(hh_cur, BETA), hl_cur);
#else
    hl_cur = _mm_fmadd_ps(hh_cur, BETA, hl_cur);
#endif

    _mm_store_ps(lh, lh_cur);
    _mm_store_ps(ll, ll_cur);
    _mm_store_ps(hl, hl_cur);
}
#endif


#if defined(NO_SSE) && defined(NO_BARRIER)
void NO_TREE_VECTORIZE nsp_no_SSE_no_barrier(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info)
{
    if(threading_info->num_sockets > 1){
        #   pragma omp parallel proc_bind(spread) num_threads(threading_info->num_sockets)
        {
            size_t tid = (size_t)omp_get_thread_num();

            Chunk chunk;
            init_chunk(&chunk, &img[tid], &threading_info[tid], tid);

            #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
            {
                for(size_t idx = chunk.start_index; idx < chunk.end_index; ++idx){
                    size_t i = idx / img[tid].tiles_per_width;
                    size_t j = idx % img[tid].tiles_per_width;

                    Tile tile;
                    TileBands tile_bands;

                    get_tile(&img[tid], &tile, j, i);
                    get_tile_bands(&bands[tid], &tile_bands, j, i);

                    nsp_predict_with_unpack_no_SSE_no_barrier(&tile, &tile_bands, &threading_info[tid], &tmp_mem[tid]);
                    nsp_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[tid], &tmp_mem[tid]);
                }
            }
        }
    } else {
        size_t tid = 0;

        Chunk chunk;
        init_chunk(&chunk, &img[tid], &threading_info[tid], tid);

        #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
        {
            for(size_t idx = chunk.start_index; idx < chunk.end_index; ++idx){
                size_t i = idx / img[tid].tiles_per_width;
                size_t j = idx % img[tid].tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(&img[tid], &tile, j, i);
                get_tile_bands(&bands[tid], &tile_bands, j, i);

                nsp_predict_with_unpack_no_SSE_no_barrier(&tile, &tile_bands, &threading_info[tid], &tmp_mem[tid]);
                nsp_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[tid], &tmp_mem[tid]);
            }
        }
    }
}

void NO_TREE_VECTORIZE nsp_predict_with_unpack_no_SSE_no_barrier(const Tile * tile, TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp)
{
    const float ALPHA = -0.5f;
    const float ALPHA2 = 0.25f;

    const size_t size_x = tile->size_x;
    const size_t stride_y = tile->stride_y;
    float * mem = tile->data;

    const size_t band_size_y = bands->size_y;
    const size_t band_stride_y = bands->stride_y;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y);
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t band_start_y = threading_info->band_start_y[tid];
    const size_t band_end_y = threading_info->band_end_y[tid];

    const size_t img_start_y = band_start_y << 1;
    const size_t img_end_y = band_end_y << 1;

    const size_t next_tile_y = 2 * stride_y - size_x + 2;
    const size_t next_band_y = band_stride_y - bands->size_x + 1;

    float * m0 = mem + img_start_y * stride_y;
    float * m1 = mem + (img_start_y+1) * stride_y;
    float * m2 = mem + (img_start_y+2) * stride_y;

    float * ll = bands->LL + band_start_y * band_stride_y;
    float * hl = bands->HL + band_start_y * band_stride_y;
    float * lh = bands->LH + band_start_y * band_stride_y;
    float * hh = bands->HH + band_start_y * band_stride_y;

    for(size_t y = img_start_y; y < img_end_y - 2; y += 2){
        for(size_t x = 0; x < size_x - 2; x += 2){
            nsp_predict_with_unpack_no_SSE_kernel(m0, m1, m2, ll, hl, lh, hh, ALPHA, ALPHA2);

            ll++; hl++; hh++; lh++;
            m0 += 2; m1 += 2; m2 += 2;
        }

        nsp_predict_with_unpack_no_SSE_symmetric_extension_kernel(m0, m1, m2, ll, hl, lh, hh, ALPHA, ALPHA2);

        ll += next_band_y; hl += next_band_y; lh += next_band_y; hh += next_band_y;
        m0 += next_tile_y; m1 += next_tile_y; m2 += next_tile_y;
    }

    m2 = tid == num_threads - 1 ? m2 - 2 * stride_y : m2;
    for(size_t x = 0; x < size_x - 2; x += 2){
        nsp_predict_with_unpack_no_SSE_kernel(m0, m1, m2, ll, hl, lh, hh, ALPHA, ALPHA2);

        ll++; hl++; hh++; lh++;
        m0 += 2; m1 += 2; m2 += 2;
    }

    nsp_predict_with_unpack_no_SSE_symmetric_extension_kernel(m0, m1, m2, ll, hl, lh, hh, ALPHA, ALPHA2);

    // redundant computation
    float LL00, LL01, LL11, LL10, LH00, LH01, HH00, HL00, HL10;

    if(tid != 0){
        size_t y = img_start_y - 2;
        m0 = mem + y * stride_y;
        m1 = mem + (y+1) * stride_y;
        m2 = mem + (y+2) * stride_y;

        lh = tmp->LH + tid * tmp->band_size_x;
        hh = tmp->HH + tid * tmp->band_size_x;

        for(size_t x = 0; x < size_x - 2; x += 2){
            LL00 = *m0; LL01 = *(m0+2); LL10 = *m2; LL11 = *(m2+2);
            LH00 = *m1; LH01 = *(m1+2);
            HH00 = *(m1+1);
            HL00 = *(m0+1); HL10 = *(m2+1);

            HL00 += ALPHA * (LL01 + LL00);

            LH00 += ALPHA * (LL10);
            HH00 += ALPHA2 * LL11 + ALPHA * (HL10 + LH01 + LH00);
            LH00 += ALPHA * LL00;

            HH00 += ALPHA * HL00;

            *hh = HH00; *lh = LH00;

            hh++; lh++;
            m0 += 2; m1 += 2; m2 += 2;
        }

        LL00 = *m0; LL01 = *m0; LL10 = *m2; LL11 = *m2;
        LH00 = *m1; LH01 = *m1;
        HH00 = *(m1+1);
        HL00 = *(m0+1); HL10 = *(m2+1);

        HL00 += ALPHA * (LL01 + LL00);

        LH00 += ALPHA * (LL10);
        HH00 += ALPHA2 * LL11 + ALPHA * (HL10 + LH01 + LH00);
        LH00 += ALPHA * LL00;

        HH00 += ALPHA * HL00;

        *hh = HH00; *lh = LH00;
    }
}

void NO_TREE_VECTORIZE nsp_update_unpacked_no_SSE_no_barrier(TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp)
{
    const float BETA = 0.25f;
    const float BETA2 = 0.0625f;

    const size_t band_size_x = bands->size_x;
    const size_t band_size_y = bands->size_y;
    const size_t band_stride_y = bands->stride_y;
    float * LL = bands->LL;
    float * HL = bands->HL;
    float * LH = bands->LH;
    float * HH = bands->HH;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y);
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t band_start_y = threading_info->band_start_y[tid];
    const size_t band_end_y = threading_info->band_end_y[tid];

    const size_t next_band_y = - band_stride_y + bands->size_x - 1;

    float *tmp_mem_L = tmp->LH + tid * tmp->band_size_x;
    float *tmp_mem_H = tmp->HH + tid * tmp->band_size_x;

    float * ll, * lh, * hl, * hh, *lh1, *hh1;
    ll = LL + (band_end_y - 1) * band_stride_y + band_size_x - 1;
    hl = HL + (band_end_y - 1) * band_stride_y + band_size_x - 1;
    lh = LH + (band_end_y - 1) * band_stride_y + band_size_x - 1;
    hh = HH + (band_end_y - 1) * band_stride_y + band_size_x - 1;
    lh1 = LH + (band_end_y - 2) * band_stride_y + band_size_x - 1;
    hh1 = HH + (band_end_y - 2) * band_stride_y + band_size_x - 1;

    for(size_t y = band_end_y - 1; y > band_start_y; --y){
        for(size_t x = band_size_x - 1; x >= 1; --x){ // main area
            nsp_update_unpacked_no_SSE_kernel(ll, lh, hl, hh, lh1, hh1, BETA, BETA2);
            --lh; --ll; --hl; --hh; --lh1; --hh1;
        }
        // left extension
        nsp_update_unpacked_no_SSE_symmetric_extension_kernel(ll, lh, hl, hh, lh1, hh1, BETA, BETA2);
        ll += next_band_y; hl += next_band_y; lh += next_band_y; hh += next_band_y; lh1 += next_band_y; hh1 += next_band_y;
    }

    if(tid == 0){
        lh1 = LH + band_size_x - 1;
        hh1 = HH + band_size_x - 1;
    } else {
        lh1 = tmp_mem_L + band_size_x - 1;
        hh1 = tmp_mem_H + band_size_x - 1;
    }
    for(size_t x = band_size_x - 1; x >= 1; --x){ // top row
        nsp_update_unpacked_no_SSE_kernel(ll, lh, hl, hh, lh1, hh1, BETA, BETA2);
        --lh; --ll; --hl; --hh; --lh1; --hh1;
    }
    // left extension
    nsp_update_unpacked_no_SSE_symmetric_extension_kernel(ll, lh, hl, hh, lh1, hh1, BETA, BETA2);
}
#elif defined NO_SSE
void NO_TREE_VECTORIZE nsp_no_SSE(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info)
{
    if(threading_info->num_sockets > 1){
        #   pragma omp parallel proc_bind(spread) num_threads(threading_info->num_sockets)
        {
            size_t tid = (size_t)omp_get_thread_num();

            Chunk chunk;
            init_chunk(&chunk, &img[tid], &threading_info[tid], tid);

            #   pragma omp parallel num_threads(threading_info->num_threads)
            {
                for(size_t idx = chunk.start_index; idx < chunk.end_index; ++idx){
                    size_t i = idx / img[tid].tiles_per_width;
                    size_t j = idx % img[tid].tiles_per_width;

                    Tile tile;
                    TileBands tile_bands;

                    get_tile(&img[tid], &tile, j, i);
                    get_tile_bands(&bands[tid], &tile_bands, j, i);

                    nsp_predict_with_unpack_no_SSE(&tile, &tile_bands, &threading_info[tid]);
                    nsp_update_unpacked_no_SSE(&tile_bands, &threading_info[tid], &tmp_mem[tid], (int)(idx % 2));
                }
            }
        }
    } else {
        size_t tid = 0;

        Chunk chunk;
        init_chunk(&chunk, &img[tid], &threading_info[tid], tid);

        #   pragma omp parallel num_threads(threading_info->num_threads)
        {
            for(size_t idx = chunk.start_index; idx < chunk.end_index; ++idx){
                size_t i = idx / img[tid].tiles_per_width;
                size_t j = idx % img[tid].tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(&img[tid], &tile, j, i);
                get_tile_bands(&bands[tid], &tile_bands, j, i);

                nsp_predict_with_unpack_no_SSE(&tile, &tile_bands, &threading_info[tid]);
                nsp_update_unpacked_no_SSE(&tile_bands, &threading_info[tid], &tmp_mem[tid], (int)(idx % 2));
            }
        }
    }
}

void NO_TREE_VECTORIZE nsp_predict_with_unpack_no_SSE(const Tile * tile, TileBands * bands, const BandsThreadingInfo * threading_info)
{
    const float ALPHA = -0.5f;
    const float ALPHA2 = 0.25f;

    const size_t size_x = tile->size_x;
    const size_t stride_y = tile->stride_y;
    float * mem = tile->data;

    const size_t band_size_y = bands->size_y;
    const size_t band_stride_y = bands->stride_y;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y); // nema smysl poustet vice vlaken nez je radku, kdyz jedno vlakno zpracovava jeden radek !!!
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t band_start_y = threading_info->band_start_y[tid];
    const size_t band_end_y = threading_info->band_end_y[tid];

    const size_t img_start_y = band_start_y << 1;
    const size_t img_end_y = band_end_y << 1;

    const size_t next_tile_y = 2 * stride_y - size_x + 2;
    const size_t next_band_y = band_stride_y - bands->size_x + 1;

    float * m0 = mem + img_start_y * stride_y;
    float * m1 = mem + (img_start_y+1) * stride_y;
    float * m2 = mem + (img_start_y+2) * stride_y;

    float * ll = bands->LL + band_start_y * band_stride_y;
    float * hl = bands->HL + band_start_y * band_stride_y;
    float * lh = bands->LH + band_start_y * band_stride_y;
    float * hh = bands->HH + band_start_y * band_stride_y;

    for(size_t y = img_start_y; y < img_end_y - 2; y += 2){
        for(size_t x = 0; x < size_x - 2; x += 2){
            nsp_predict_with_unpack_no_SSE_kernel(m0, m1, m2, ll, hl, lh, hh, ALPHA, ALPHA2);

            ll++; hl++; hh++; lh++;
            m0 += 2; m1 += 2; m2 += 2;
        }

        nsp_predict_with_unpack_no_SSE_symmetric_extension_kernel(m0, m1, m2, ll, hl, lh, hh, ALPHA, ALPHA2);

        ll += next_band_y; hl += next_band_y; lh += next_band_y; hh += next_band_y;
        m0 += next_tile_y; m1 += next_tile_y; m2 += next_tile_y;
    }

    m2 = tid == num_threads - 1 ? m2 - 2 * stride_y : m2;
    for(size_t x = 0; x < size_x - 2; x += 2){
        nsp_predict_with_unpack_no_SSE_kernel(m0, m1, m2, ll, hl, lh, hh, ALPHA, ALPHA2);

        ll++; hl++; hh++; lh++;
        m0 += 2; m1 += 2; m2 += 2;
    }

    nsp_predict_with_unpack_no_SSE_symmetric_extension_kernel(m0, m1, m2, ll, hl, lh, hh, ALPHA, ALPHA2);
}

void NO_TREE_VECTORIZE nsp_update_unpacked_no_SSE(TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp, int mem_flag)
{
    const float BETA = 0.25f;
    const float BETA2 = 0.0625f;

    const size_t band_size_x = bands->size_x;
    const size_t band_size_y = bands->size_y;
    const size_t band_stride_y = bands->stride_y;
    float * LL = bands->LL;
    float * HL = bands->HL;
    float * LH = bands->LH;
    float * HH = bands->HH;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y);
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t band_start_y = threading_info->band_start_y[tid];
    const size_t band_end_y = threading_info->band_end_y[tid];

    const size_t next_band_y = - band_stride_y + bands->size_x - 1;

    if(tid < num_threads - 1){
        if(mem_flag)
            put_tmp_mem_LH_no_SSE(tmp, LH + (band_end_y - 1) * band_stride_y, tid+1);
        else
            put_tmp_mem_HH_no_SSE(tmp, LH + (band_end_y - 1) * band_stride_y, tid+1);
    }

#       pragma omp barrier
    // create private tmp mem
    float * ll = LL + (band_end_y-1) * band_stride_y + band_size_x - 1;
    float * hl = HL + (band_end_y-1) * band_stride_y + band_size_x - 1;
    float * lh = LH + (band_end_y-1) * band_stride_y + band_size_x - 1;
    float * hh = HH + (band_end_y-1) * band_stride_y + band_size_x - 1;
    float * lh1 = LH + (band_end_y-2) * band_stride_y + band_size_x - 1;
    float * hh1 = HH + (band_end_y-2) * band_stride_y + band_size_x - 1;

    for(size_t y = band_end_y-1; y > band_start_y; --y){
        for(size_t x = band_size_x - 1; x >= 1; --x){ // main area
            
            nsp_update_unpacked_no_SSE_kernel(ll, lh, hl, hh, lh1, hh1, BETA, BETA2);

            lh--; ll--; hl--; hh--; lh1--; hh1--;
        }
        
        // left extension
        nsp_update_unpacked_no_SSE_symmetric_extension_kernel(ll, lh, hl, hh, lh1, hh1, BETA, BETA2);
        ll += next_band_y; hl += next_band_y; lh += next_band_y; hh += next_band_y; lh1 += next_band_y; hh1 += next_band_y;
    }

    float *tmp_mem;
    if(mem_flag)
        tmp_mem = tid == 0 ? LH : tmp->LH + tid * tmp->band_size_x;
    else
        tmp_mem = tid == 0 ? LH : tmp->HH + tid * tmp->band_size_x;

    lh1 = tid == 0 ? LH + band_size_x - 1 : tmp_mem + band_size_x - 1;
    hh1 = tid == 0 ? HH + band_start_y * band_stride_y + band_size_x - 1 : HH + (band_start_y-1) * band_stride_y + band_size_x - 1;
    for(size_t x = band_size_x - 1; x >= 1; --x){ // top row
        nsp_update_unpacked_no_SSE_kernel(ll, lh, hl, hh, lh1, hh1, BETA, BETA2);

        lh--; ll--; hl--; hh--; hh1--; lh1--;
    }

    // left extension
    nsp_update_unpacked_no_SSE_symmetric_extension_kernel(ll, lh, hl, hh, lh1, hh1, BETA, BETA2);
}
#elif defined NO_BARRIER
/*
 * Transforms every tile of the chunk that init_chunk() assigns to socket
 * `tid`, using the tid-th element of img, bands, tmp_mem and threading_info.
 *
 * A team of threading_info->num_threads threads (taken from element 0) is
 * started; every thread of the team visits every tile of the chunk, and the
 * predict step selects its own row range from the thread number.
 */
static void nsp_no_barrier_socket(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info, size_t tid)
{
    const Image * const socket_img = &img[tid];
    const BandsThreadingInfo * const socket_info = &threading_info[tid];

    Chunk chunk;
    init_chunk(&chunk, socket_img, socket_info, tid);

    #pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
    {
        for(size_t tile_idx = chunk.start_index; tile_idx < chunk.end_index; ++tile_idx){
            /* Linear tile index -> (row, column) within the image's tile grid. */
            const size_t tile_row = tile_idx / socket_img->tiles_per_width;
            const size_t tile_col = tile_idx % socket_img->tiles_per_width;

            Tile tile;
            TileBands tile_bands;

            get_tile(socket_img, &tile, tile_col, tile_row);
            get_tile_bands(&bands[tid], &tile_bands, tile_col, tile_row);

            nsp_predict_with_unpack_no_barrier(&tile, &tile_bands, socket_info, &tmp_mem[tid]);
            nsp_update_unpacked_no_barrier(&tile_bands, socket_info, &tmp_mem[tid]);
        }
    }
}

/*
 * Driver of the SSE, barrier-free transform.
 *
 * img, bands, tmp_mem and threading_info are arrays with one element per
 * socket.  With more than one socket an outer team with one thread per
 * socket is started and each thread handles the element matching its thread
 * number; otherwise element 0 is processed directly by the calling thread.
 *
 * The branch is selected with `defined NO_BARRIER`, matching the test used
 * for the forward declarations, so it also works when the macro is defined
 * without a value.
 */
void nsp_no_barrier(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info)
{
    if(threading_info->num_sockets <= 1){
        nsp_no_barrier_socket(img, bands, tmp_mem, threading_info, 0);
        return;
    }

    #pragma omp parallel proc_bind(spread) num_threads(threading_info->num_sockets)
    {
        nsp_no_barrier_socket(img, bands, tmp_mem, threading_info, (size_t)omp_get_thread_num());
    }
}

/*
 * Predict step combined with unpacking for one tile, barrier-free variant.
 *
 * Only the band rows [band_start_y[tid], band_end_y[tid]) that threading_info
 * assigns to the calling OpenMP thread are processed, so the function is
 * expected to be called by every thread of a parallel team.
 *
 * tile            source tile; each inner step consumes two image rows and
 *                 eight image columns and yields four values per band
 * bands           destination LL/HL/LH/HH planes of the tile
 * threading_info  per-thread band row ranges
 * tmp             scratch rows; every thread except thread 0 stores
 *                 recomputed LH and HH values of the row pair directly above
 *                 its range at offset tid * tmp->band_size_x
 *
 * All vector accesses use _mm_load_ps/_mm_store_ps, i.e. the addressed rows
 * must be 16-byte aligned. The loop bounds assume size_x >= 8; presumably
 * size_x is also a multiple of 8 -- TODO confirm against the tile setup.
 */
void nsp_predict_with_unpack_no_barrier(const Tile * tile, TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp)
{
    // Lifting weights handed to the kernels and used in the recomputation below.
    const __m128 ALPHA = _mm_set1_ps(-0.5f);
    const __m128 ALPHA2 = _mm_set1_ps(0.25f);

    const size_t size_x = tile->size_x;
    const size_t stride_y = tile->stride_y;
    float * mem = tile->data;

    const size_t band_size_y = bands->size_y;
    const size_t band_stride_y = bands->stride_y;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y);
    const size_t tid = (size_t)omp_get_thread_num();

    // Band row range owned by this thread.
    const size_t band_start_y = threading_info->band_start_y[tid];
    const size_t band_end_y = threading_info->band_end_y[tid];

    // One band row corresponds to two image rows.
    const size_t img_start_y = band_start_y << 1;
    const size_t img_end_y = band_end_y << 1;

    // Pointer increments applied after a row: the inner loop has already
    // advanced the tile pointers by size_x - 8 and the band pointers by
    // bands->size_x - 4 (for widths that are multiples of 8 / 4).
    const size_t next_tile_y = 2 * stride_y - size_x + 8;
    const size_t next_band_y = band_stride_y - bands->size_x + 4;

    // Three consecutive image rows: the current row pair plus the row below.
    float * m0 = mem + img_start_y * stride_y;
    float * m1 = mem + (img_start_y+1) * stride_y;
    float * m2 = mem + (img_start_y+2) * stride_y;

    float * ll = bands->LL + band_start_y * band_stride_y;
    float * hl = bands->HL + band_start_y * band_stride_y;
    float * lh = bands->LH + band_start_y * band_stride_y;
    float * hh = bands->HH + band_start_y * band_stride_y;

    // RrCc = 4 floats of image row r (0..2), column group c (0..2).
    __m128 R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, R2C0, R2C1, R2C2;

    // All row pairs of this thread except the last one.
    for(size_t y = img_start_y; y < img_end_y - 2; y += 2){
        R0C0 = _mm_load_ps(m0); R1C0 = _mm_load_ps(m1); R2C0 = _mm_load_ps(m2);
        for(size_t x = 0; x < size_x - 8; x += 8){
            R0C1 = _mm_load_ps(m0 + 4); R0C2 = _mm_load_ps(m0 + 8);
            R1C1 = _mm_load_ps(m1 + 4);R1C2 = _mm_load_ps(m1 + 8);
            R2C1 = _mm_load_ps(m2 + 4); R2C2 = _mm_load_ps(m2 + 8);

            nsp_predict_with_unpack_kernel(R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, R2C0, R2C1, R2C2, ll, lh, hl, hh, ALPHA, ALPHA2);

            // The right-hand column group becomes the left-hand one of the next step.
            R0C0 = R0C2; R1C0 = R1C2; R2C0 = R2C2;

            ll += 4; hl += 4; hh += 4; lh += 4;
            m0 += 8; m1 += 8; m2 += 8;
        }
        // Last eight columns of the row: no column group to the right exists.
        R0C1 = _mm_load_ps(m0 + 4); R1C1 = _mm_load_ps(m1 + 4); R2C1 = _mm_load_ps(m2 + 4);

        nsp_predict_with_unpack_symmetric_extension_kernel(R0C0, R0C1, R1C0, R1C1, R2C0, R2C1, ll, lh, hl, hh, ALPHA, ALPHA2);

        ll += next_band_y; hl += next_band_y; lh += next_band_y; hh += next_band_y;
        m0 += next_tile_y; m1 += next_tile_y; m2 += next_tile_y;
    }

    // Last row pair of this thread. For the last thread m2 is moved back onto
    // m0's row instead of the row below the range (mirroring at the bottom).
    m2 = tid == num_threads - 1 ? m2 - 2 * stride_y : m2;
    R0C0 = _mm_load_ps(m0); R1C0 = _mm_load_ps(m1); R2C0 = _mm_load_ps(m2);
    for(size_t x = 0; x < size_x - 8; x += 8){ // all column groups but the last
        R0C1 = _mm_load_ps(m0 + 4); R0C2 = _mm_load_ps(m0 + 8);
        R1C1 = _mm_load_ps(m1 + 4);R1C2 = _mm_load_ps(m1 + 8);
        R2C1 = _mm_load_ps(m2 + 4); R2C2 = _mm_load_ps(m2 + 8);

        nsp_predict_with_unpack_kernel(R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, R2C0, R2C1, R2C2, ll, lh, hl, hh, ALPHA, ALPHA2);

        R0C0 = R0C2; R1C0 = R1C2; R2C0 = R2C2;
        ll += 4; hl += 4; hh += 4; lh += 4;
        m0 += 8; m1 += 8; m2 += 8;
    }
    R0C1 = _mm_load_ps(m0 + 4); R1C1 = _mm_load_ps(m1 + 4); R2C1 = _mm_load_ps(m2 + 4);

    nsp_predict_with_unpack_symmetric_extension_kernel(R0C0, R0C1, R1C0, R1C1, R2C0, R2C1, ll, lh, hl, hh, ALPHA, ALPHA2);

    // Redundant computation: every thread except thread 0 recomputes the LH
    // and HH values of the row pair directly above its own range and keeps
    // them in its private slice of tmp. nsp_update_unpacked_no_barrier reads
    // that slice for its top row.
    __m128 LL00, LL01, LL11, LL10, LH00, LH01, HH00, HL00, HL10, RES, RES1, RES2;

    if(tid != 0){
        size_t y = img_start_y - 2;
        m0 = mem + y * stride_y;
        m1 = mem + (y+1) * stride_y;
        m2 = mem + (y+2) * stride_y;

        // Only LH and HH are stored; both go to the scratch rows, not to bands.
        lh = tmp->LH + tid * tmp->band_size_x;
        hh = tmp->HH + tid * tmp->band_size_x;

        R0C0 = _mm_load_ps(m0); R1C0 = _mm_load_ps(m1); R2C0 = _mm_load_ps(m2);
        for(size_t x = 0; x < size_x - 8; x += 8){ // all column groups but the last
            R0C1 = _mm_load_ps(m0 + 4); R0C2 = _mm_load_ps(m0 + 8);
            R1C1 = _mm_load_ps(m1 + 4); R1C2 = _mm_load_ps(m1 + 8);
            R2C1 = _mm_load_ps(m2 + 4); R2C2 = _mm_load_ps(m2 + 8);

            // Unpack: even columns (lanes 0,2) and odd columns (lanes 1,3)
            // of each row are separated into their own vectors.
            LL00 = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(2, 0, 2, 0));
            HL00 = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(3, 1, 3, 1));

            HH00 = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(3, 1, 3, 1));
            LH00 = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(2, 0, 2, 0));

            LL10 = _mm_shuffle_ps(R2C0, R2C1, _MM_SHUFFLE(2, 0, 2, 0));
            HL10 = _mm_shuffle_ps(R2C0, R2C1, _MM_SHUFFLE(3, 1, 3, 1));

            // Right neighbours: shift one float lane down and fill lane 3
            // with lane 0 of the next column group (insert mask 0x30).
            LL01 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(LL00), 4));
            LL01 = _mm_insert_ps(LL01, R0C2, 0x30);

            LH01 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(LH00), 4));
            LH01 = _mm_insert_ps(LH01, R1C2, 0x30);

            LL11 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(LL10), 4));
            LL11 = _mm_insert_ps(LL11, R2C2, 0x30);

            // HL00 += ALPHA * (LL00 + LL01); LH00 += ALPHA * LL10
            RES = _mm_add_ps(LL00, LL01);
#ifdef NO_FMA
            RES = _mm_mul_ps(RES, ALPHA);
            HL00 = _mm_add_ps(RES, HL00);
            LL10 = _mm_mul_ps(LL10, ALPHA);
            LH00 = _mm_add_ps(LL10, LH00);
#else
            HL00 = _mm_fmadd_ps(RES, ALPHA, HL00);
            LH00 = _mm_fmadd_ps(LL10, ALPHA, LH00);
#endif

            // HH00 += ALPHA * (HL00 + LH00 + HL10 + LH01) + ALPHA2 * LL11
            RES1 = _mm_mul_ps(LL11, ALPHA2);
            RES = _mm_add_ps(HL00, LH00);
            RES2 = _mm_add_ps(HL10, LH01);
            RES = _mm_add_ps(RES, RES2);

#ifdef NO_FMA
            RES = _mm_mul_ps(RES, ALPHA);
            RES = _mm_add_ps(RES, RES1);
#else
            RES = _mm_fmadd_ps(RES, ALPHA, RES1);
#endif
            HH00 = _mm_add_ps(HH00, RES);

            // LH00 += ALPHA * LL00 (the LL10 term was already added above)
#ifdef NO_FMA
            RES2 = _mm_mul_ps(LL00, ALPHA);
            LH00 = _mm_add_ps(RES2, LH00);
#else
            LH00 = _mm_fmadd_ps(LL00, ALPHA, LH00);
#endif

            _mm_store_ps(hh, HH00);
            _mm_store_ps(lh, LH00);

            R0C0 = R0C2; R1C0 = R1C2; R2C0 = R2C2;

            hh += 4; lh += 4;

            m0 += 8; m1 += 8; m2 += 8;
        }

        // Last eight columns: same arithmetic, but the missing right
        // neighbour is replaced by the vector's own lane 3 (insert mask 0xF0).
        R0C1 = _mm_load_ps(m0 + 4);
        R1C1 = _mm_load_ps(m1 + 4);
        R2C1 = _mm_load_ps(m2 + 4);

        LL00 = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(2, 0, 2, 0));
        HL00 = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(3, 1, 3, 1));

        HH00 = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(3, 1, 3, 1));
        LH00 = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(2, 0, 2, 0));

        LL10 = _mm_shuffle_ps(R2C0, R2C1, _MM_SHUFFLE(2, 0, 2, 0));
        HL10 = _mm_shuffle_ps(R2C0, R2C1, _MM_SHUFFLE(3, 1, 3, 1));

        LL01 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(LL00), 4));
        LL01 = _mm_insert_ps(LL01, LL00, 0xF0);

        LH01 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(LH00), 4));
        LH01 = _mm_insert_ps(LH01, LH00, 0xF0);

        LL11 = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(LL10), 4));
        LL11 = _mm_insert_ps(LL11, LL10, 0xF0);

        RES = _mm_add_ps(LL00, LL01);
#ifdef NO_FMA
        RES = _mm_mul_ps(RES, ALPHA);
        HL00 = _mm_add_ps(RES, HL00);
        LL10 = _mm_mul_ps(LL10, ALPHA);
        LH00 = _mm_add_ps(LL10, LH00);
#else
        HL00 = _mm_fmadd_ps(RES, ALPHA, HL00);
        LH00 = _mm_fmadd_ps(LL10, ALPHA, LH00);
#endif

        RES1 = _mm_mul_ps(LL11, ALPHA2);
        RES = _mm_add_ps(HL00, LH00);
        RES2 = _mm_add_ps(HL10, LH01);
        RES = _mm_add_ps(RES, RES2);
#ifdef NO_FMA
        RES = _mm_mul_ps(RES, ALPHA);
        RES = _mm_add_ps(RES, RES1);
#else
        RES = _mm_fmadd_ps(RES, ALPHA, RES1);
#endif
        HH00 = _mm_add_ps(HH00, RES);

#ifdef NO_FMA
        RES2 = _mm_mul_ps(LL00, ALPHA);
        LH00 = _mm_add_ps(RES2, LH00);
#else
        LH00 = _mm_fmadd_ps(LL00, ALPHA, LH00);
#endif

        _mm_store_ps(hh, HH00);
        _mm_store_ps(lh, LH00);
    }
}

/*
 * Update step on the unpacked bands of one tile, barrier-free variant.
 *
 * Works on the band rows that threading_info assigns to the calling OpenMP
 * thread, walking them bottom-up and, inside a row, right-to-left in groups
 * of four coefficients. Every row needs the LH/HH row above it; for the
 * thread's top row those values come from band row 0 (thread 0) or from the
 * thread's private slice of tmp (all other threads).
 */
void nsp_update_unpacked_no_barrier(TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp)
{
    const __m128 BETA = _mm_set1_ps(0.25f);
    const __m128 BETA2 = _mm_set1_ps(0.0625f);

    const size_t width = bands->size_x;
    const size_t stride = bands->stride_y;
    const size_t last_col = width - 4;

    assert((size_t)omp_get_num_threads() <= bands->size_y);
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t first_row = threading_info->band_start_y[tid];
    const size_t last_row = threading_info->band_end_y[tid] - 1;

    // Added after a row has been walked down to column 0: moves one row up
    // and back to the last column group (relies on unsigned wrap-around).
    const size_t row_rewind = last_col - stride;

    // Start at the last column group of the thread's bottom row.
    float * ll = bands->LL + last_row * stride + last_col;
    float * hl = bands->HL + last_row * stride + last_col;
    float * lh = bands->LH + last_row * stride + last_col;
    float * hh = bands->HH + last_row * stride + last_col;
    float * lh_up = bands->LH + (last_row - 1) * stride + last_col;
    float * hh_up = bands->HH + (last_row - 1) * stride + last_col;

    // Every row except the thread's top row.
    for(size_t row = last_row; row > first_row; --row){
        size_t col = last_col;
        while(col >= 4){ // main area
            nsp_update_unpacked_kernel(ll, lh, hl, hh, lh_up, hh_up, BETA, BETA2);
            ll -= 4; hl -= 4; lh -= 4; hh -= 4; lh_up -= 4; hh_up -= 4;
            col -= 4;
        }
        // left extension
        nsp_update_unpacked_symmetric_extension_kernel(ll, lh, hl, hh, lh_up, hh_up, BETA, BETA2);

        ll += row_rewind; hl += row_rewind; lh += row_rewind; hh += row_rewind;
        lh_up += row_rewind; hh_up += row_rewind;
    }

    // Source of the "row above" for the thread's top row.
    if(tid != 0){
        lh_up = tmp->LH + tid * tmp->band_size_x + last_col;
        hh_up = tmp->HH + tid * tmp->band_size_x + last_col;
    } else {
        lh_up = bands->LH + last_col;
        hh_up = bands->HH + last_col;
    }

    size_t col = last_col;
    while(col >= 4){ // top row
        nsp_update_unpacked_kernel(ll, lh, hl, hh, lh_up, hh_up, BETA, BETA2);
        ll -= 4; hl -= 4; lh -= 4; hh -= 4; lh_up -= 4; hh_up -= 4;
        col -= 4;
    }
    // left extension
    nsp_update_unpacked_symmetric_extension_kernel(ll, lh, hl, hh, lh_up, hh_up, BETA, BETA2);
}
#else
/*
 * Processes every tile of the chunk that init_chunk() sets up for socket `tid`.
 *
 * A team of threading_info->num_threads threads is started and every thread
 * of that team walks the complete tile range of the chunk. The parity of the
 * tile index is forwarded to nsp_update_unpacked as its mem_flag.
 */
static void nsp_process_chunk(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info, size_t tid)
{
    const Image * socket_img = &img[tid];
    Bands * socket_bands = &bands[tid];
    TmpMem * socket_tmp = &tmp_mem[tid];
    const BandsThreadingInfo * socket_info = &threading_info[tid];

    Chunk chunk;
    init_chunk(&chunk, socket_img, socket_info, tid);

    // The team size is always taken from the first threading_info entry.
    #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
    {
        size_t idx = chunk.start_index;
        while(idx < chunk.end_index){
            // Linear tile index -> (row, column) inside the tile grid.
            const size_t tile_row = idx / socket_img->tiles_per_width;
            const size_t tile_col = idx % socket_img->tiles_per_width;

            Tile tile;
            TileBands tile_bands;

            get_tile(socket_img, &tile, tile_col, tile_row);
            get_tile_bands(socket_bands, &tile_bands, tile_col, tile_row);

            nsp_predict_with_unpack(&tile, &tile_bands, socket_info);
            nsp_update_unpacked(&tile_bands, socket_info, socket_tmp, (int)(idx % 2));

            ++idx;
        }
    }
}

/*
 * Transform of a whole image using the barrier-based tile routines.
 *
 * img, bands, tmp_mem and threading_info are arrays indexed by socket. With
 * more than one socket an outer team (one thread per socket, spread binding)
 * is created and each of its threads handles its own array entries; otherwise
 * entry 0 is processed directly.
 */
void nsp(const Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info)
{
    if(threading_info->num_sockets <= 1){
        nsp_process_chunk(img, bands, tmp_mem, threading_info, 0);
        return;
    }

    #   pragma omp parallel proc_bind(spread) num_threads(threading_info->num_sockets)
    {
        nsp_process_chunk(img, bands, tmp_mem, threading_info, (size_t)omp_get_thread_num());
    }
}

/*
 * Predict step combined with unpacking for one tile.
 *
 * Only the band rows that threading_info assigns to the calling OpenMP thread
 * are processed. Each step reads eight columns of three consecutive image
 * rows (the current row pair and the row below) and lets the kernel write
 * four values to each of the LL/LH/HL/HH planes.
 *
 * Accesses use _mm_load_ps, so the addressed rows must be 16-byte aligned.
 */
void nsp_predict_with_unpack(const Tile * tile, TileBands * bands, const BandsThreadingInfo * threading_info)
{
    const __m128 ALPHA = _mm_set1_ps(-0.5f);
    const __m128 ALPHA2 = _mm_set1_ps(0.25f);

    const size_t width = tile->size_x;
    const size_t row_stride = tile->stride_y;
    const size_t out_stride = bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= bands->size_y);
    const size_t tid = (size_t)omp_get_thread_num();

    // One band row corresponds to two image rows.
    const size_t first_out_row = threading_info->band_start_y[tid];
    const size_t first_in_row = first_out_row << 1;
    const size_t end_in_row = threading_info->band_end_y[tid] << 1;

    // Increments that move from the last column group of a row pair to the
    // first column group of the next one.
    const size_t in_row_step = 2 * row_stride - width + 8;
    const size_t out_row_step = out_stride - bands->size_x + 4;

    float * top = tile->data + first_in_row * row_stride;
    float * mid = top + row_stride;
    float * bot = mid + row_stride;

    const size_t out_offset = first_out_row * out_stride;
    float * ll = bands->LL + out_offset;
    float * hl = bands->HL + out_offset;
    float * lh = bands->LH + out_offset;
    float * hh = bands->HH + out_offset;

    __m128 R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, R2C0, R2C1, R2C2;

    size_t y = first_in_row;
    for(;;){
        const int is_last_pair = !(y < end_in_row - 2);

        // The last thread has no row below its final pair; it reuses the
        // pair's first row instead.
        if(is_last_pair && tid == team_size - 1)
            bot -= 2 * row_stride;

        R0C0 = _mm_load_ps(top);
        R1C0 = _mm_load_ps(mid);
        R2C0 = _mm_load_ps(bot);

        size_t x = 0;
        while(x < width - 8){
            R0C1 = _mm_load_ps(top + 4);
            R0C2 = _mm_load_ps(top + 8);
            R1C1 = _mm_load_ps(mid + 4);
            R1C2 = _mm_load_ps(mid + 8);
            R2C1 = _mm_load_ps(bot + 4);
            R2C2 = _mm_load_ps(bot + 8);

            nsp_predict_with_unpack_kernel(R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, R2C0, R2C1, R2C2, ll, lh, hl, hh, ALPHA, ALPHA2);

            // Carry the right-hand column group over to the next step.
            R0C0 = R0C2;
            R1C0 = R1C2;
            R2C0 = R2C2;

            ll += 4; hl += 4; hh += 4; lh += 4;
            top += 8; mid += 8; bot += 8;
            x += 8;
        }

        // Final eight columns of the row pair.
        R0C1 = _mm_load_ps(top + 4);
        R1C1 = _mm_load_ps(mid + 4);
        R2C1 = _mm_load_ps(bot + 4);

        nsp_predict_with_unpack_symmetric_extension_kernel(R0C0, R0C1, R1C0, R1C1, R2C0, R2C1, ll, lh, hl, hh, ALPHA, ALPHA2);

        if(is_last_pair)
            break;

        ll += out_row_step; hl += out_row_step; lh += out_row_step; hh += out_row_step;
        top += in_row_step; mid += in_row_step; bot += in_row_step;
        y += 2;
    }
}

/*
 * Update step on the unpacked bands of one tile, barrier-based variant.
 *
 * Before the barrier every thread except the last one hands the LH row at the
 * bottom of its range to the next thread (slot tid+1) through tmp. mem_flag
 * selects which scratch buffer carries that row: the LH buffer when non-zero,
 * the HH buffer otherwise. After the barrier the thread walks its rows
 * bottom-up and right-to-left; for its top row the LH values above come from
 * the scratch buffer and the HH values directly from the HH plane.
 *
 * Contains an `omp barrier`, so every thread of the team must call it.
 */
void nsp_update_unpacked(TileBands * bands, const BandsThreadingInfo * threading_info, TmpMem * tmp, int mem_flag)
{
    const __m128 BETA = _mm_set1_ps(0.25f);
    const __m128 BETA2 = _mm_set1_ps(0.0625f);

    const size_t width = bands->size_x;
    const size_t stride = bands->stride_y;
    const size_t last_col = width - 4;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= bands->size_y);
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t first_row = threading_info->band_start_y[tid];
    const size_t last_row = threading_info->band_end_y[tid] - 1;

    // Added after a row has been walked down to column 0: moves one row up
    // and back to the last column group (relies on unsigned wrap-around).
    const size_t row_rewind = last_col - stride;

    if(tid < team_size - 1){
        float * boundary_lh = bands->LH + last_row * stride;
        if(mem_flag)
            put_tmp_mem_LH(tmp, boundary_lh, tid + 1);
        else
            put_tmp_mem_HH(tmp, boundary_lh, tid + 1);
    }

    # pragma omp barrier

    // Start at the last column group of the thread's bottom row.
    float * ll = bands->LL + last_row * stride + last_col;
    float * hl = bands->HL + last_row * stride + last_col;
    float * lh = bands->LH + last_row * stride + last_col;
    float * hh = bands->HH + last_row * stride + last_col;
    float * lh_up = bands->LH + (last_row - 1) * stride + last_col;
    float * hh_up = bands->HH + (last_row - 1) * stride + last_col;

    // Every row except the thread's top row.
    for(size_t row = last_row; row > first_row; --row){
        size_t col = last_col;
        while(col >= 4){ // main area
            nsp_update_unpacked_kernel(ll, lh, hl, hh, lh_up, hh_up, BETA, BETA2);
            ll -= 4; hl -= 4; lh -= 4; hh -= 4; lh_up -= 4; hh_up -= 4;
            col -= 4;
        }
        // left extension
        nsp_update_unpacked_symmetric_extension_kernel(ll, lh, hl, hh, lh_up, hh_up, BETA, BETA2);

        ll += row_rewind; hl += row_rewind; lh += row_rewind; hh += row_rewind;
        lh_up += row_rewind; hh_up += row_rewind;
    }

    // Source of the "row above" for the thread's top row.
    if(tid == 0){
        lh_up = bands->LH + last_col;
        hh_up = bands->HH + last_col;
    } else {
        float * handed_over = mem_flag ? tmp->LH : tmp->HH;
        lh_up = handed_over + tid * tmp->band_size_x + last_col;
        hh_up = bands->HH + (first_row - 1) * stride + last_col;
    }

    size_t col = last_col;
    while(col >= 4){ // top row
        nsp_update_unpacked_kernel(ll, lh, hl, hh, lh_up, hh_up, BETA, BETA2);
        ll -= 4; hl -= 4; lh -= 4; hh -= 4; lh_up -= 4; hh_up -= 4;
        col -= 4;
    }
    // left extension
    nsp_update_unpacked_symmetric_extension_kernel(ll, lh, hl, hh, lh_up, hh_up, BETA, BETA2);
}
#endif

/*
 * Runs the transform variant selected at compile time on `img` and compares
 * the resulting bands with the reference image stored in `filename`.
 *
 * filename        reference image; if it cannot be opened the check is
 *                 skipped silently
 * img             input image(s), transformed into `bands`
 * bands           output bands, indexed by socket
 * tmp_mem         scratch memory forwarded to the transform
 * threading_info  threading layout forwarded to the transform
 *
 * The reference is read in the packed layout: even rows hold LL/HL values in
 * alternating columns, odd rows hold LH/HH values. Every coefficient must lie
 * within EPS of the reference in BOTH directions; a mismatch trips an
 * assert(), so the check is inactive when NDEBUG is defined.
 */
void test_nsp(const char * filename, Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info)
{
    // Probe for the reference file before allocating anything.
    FILE * f = fopen(filename, "r");
    if(!f)
        return;
    fclose(f);

    Image test;
    allocate_image(&test, img->size_x, img->size_y);
    set_tile_size(&test, img->tile_size_x, img->tile_size_y);

    load_image(filename, &test);

#if defined(NO_SSE) && defined(NO_BARRIER)
    nsp_no_SSE_no_barrier(img, bands, tmp_mem, threading_info);
#elif defined NO_SSE
    nsp_no_SSE(img, bands, tmp_mem, threading_info);
#elif defined NO_BARRIER
    nsp_no_barrier(img, bands, tmp_mem, threading_info);
#else
    nsp(img, bands, tmp_mem, threading_info);
#endif

    const float EPS = 1e-5f;

    // Tiles are distributed over the sockets in equally sized chunks; the
    // chunk a tile falls into selects the Bands entry holding its result.
    size_t num_tiles = img->tiles_per_height * img->tiles_per_width;
    size_t chunk = (num_tiles + threading_info->num_sockets - 1) / threading_info->num_sockets;

    Tile tile;
    TileBands tile_bands;

    for (size_t y = 0; y < img->tiles_per_height; ++y) {
        for (size_t x = 0; x < img->tiles_per_width; ++x) {
            get_tile(&test, &tile, x, y);
            size_t idx = (y * img->tiles_per_width + x) / chunk;
            get_tile_bands(&bands[idx], &tile_bands, x, y);

            for (size_t i = 0; i < tile.size_y; ++i) {
                for (size_t j = 0; j < tile.size_x; j += 2) {
                    const float n1 = *(tile.data + i * tile.stride_y + j);
                    const float n2 = *(tile.data + i * tile.stride_y + j + 1);

                    // NOTE(review): the row offset uses bands->stride_y (first
                    // Bands entry) rather than tile_bands.stride_y -- confirm
                    // that both strides are always equal.
                    const size_t offset = (i>>1) * bands->stride_y + (j >> 1);

                    float diff_first, diff_second;
                    if (i % 2 == 0) {
                        diff_first = *(tile_bands.LL + offset) - n1;
                        diff_second = *(tile_bands.HL + offset) - n2;
                    } else {
                        diff_first = *(tile_bands.LH + offset) - n1;
                        diff_second = *(tile_bands.HH + offset) - n2;
                    }

                    // Two-sided tolerance: a result that is too small must
                    // fail just like one that is too large.
                    assert(diff_first < EPS && diff_first > -EPS);
                    assert(diff_second < EPS && diff_second > -EPS);
                }
            }
        }
    }
    free_image(&test);
}

/*
 * Dispatches to the measurement routine that matches `config` and returns
 * its result; -1 is returned when `config` matches none of the known modes.
 */
double measure_nsp(Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info, TestConfig config)
{
    switch(config){
        case IMAGE:
            return measure_nsp_image(img, bands, tmp_mem, threading_info);
        case TILE:
            return measure_nsp_tile(img, bands, tmp_mem, threading_info);
        case TILES_IN_IMAGE:
            return measure_nsp_tiles_in_image(img, bands, tmp_mem, threading_info);
    }
    return -1;
}

/*
 * Times the predict + update pair on the single tile of a single-socket
 * image.
 *
 * ATTEMPTS samples are taken; each sample is the gettimer() difference across
 * REPETITIONS runs, every run in its own parallel region. The median sample,
 * multiplied by 1000 and divided by (tile pixels * repetitions), is returned.
 */
double measure_nsp_tile(Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info)
{
    assert(threading_info->num_sockets == 1 && img->num_tiles == 1);

    const size_t repetitions = REPETITIONS;
    const size_t attempts = ATTEMPTS;

    long long samples[attempts];

    Tile tile;
    TileBands tile_bands;

    get_tile(&img[0], &tile, 0, 0);
    get_tile_bands(&bands[0], &tile_bands, 0, 0);

    size_t attempt = 0;
    while(attempt < attempts){
        const long long started = gettimer(); // ns

        for(size_t rep = 0; rep < repetitions; ++rep){
            // One parallel region per run; the variant is fixed at compile time.
            #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
            {
            #if defined(NO_SSE) && defined(NO_BARRIER)
                nsp_predict_with_unpack_no_SSE_no_barrier(&tile, &tile_bands, &threading_info[0], &tmp_mem[0]);
                nsp_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[0], &tmp_mem[0]);
            #elif defined NO_SSE
                nsp_predict_with_unpack_no_SSE(&tile, &tile_bands, &threading_info[0]);
                nsp_update_unpacked_no_SSE(&tile_bands, &threading_info[0], &tmp_mem[0], 0);
            #elif defined NO_BARRIER
                nsp_predict_with_unpack_no_barrier(&tile, &tile_bands, &threading_info[0], &tmp_mem[0]);
                nsp_update_unpacked_no_barrier(&tile_bands, &threading_info[0], &tmp_mem[0]);
            #else
                nsp_predict_with_unpack(&tile, &tile_bands, &threading_info[0]);
                nsp_update_unpacked(&tile_bands, &threading_info[0], &tmp_mem[0], 0);
            #endif
            }
        }

        samples[attempt] = gettimer() - started;
        ++attempt;
    }

    // Median of the samples.
    qsort(samples, attempts, sizeof(long long), compare_times);
    const long long median = samples[attempts/2];

    return 1000.0*(double)median/(double)((tile.size_x * tile.size_y) * repetitions);
}

/*
 * Times the whole-image transform selected at compile time.
 *
 * ATTEMPTS samples are taken; each sample is the gettimer() difference across
 * REPETITIONS full transforms. The median sample, multiplied by 1000 and
 * divided by (img->size * repetitions), is returned.
 */
double measure_nsp_image(Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info)
{
    const size_t repetitions = REPETITIONS;
    const size_t attempts = ATTEMPTS;

    long long samples[attempts];

    size_t attempt = 0;
    while(attempt < attempts){
        const long long started = gettimer(); // ns

        for(size_t rep = 0; rep < repetitions; ++rep){
        #if defined(NO_SSE) && defined(NO_BARRIER)
            nsp_no_SSE_no_barrier(img, bands, tmp_mem, threading_info);
        #elif defined NO_SSE
            nsp_no_SSE(img, bands, tmp_mem, threading_info);
        #elif defined NO_BARRIER
            nsp_no_barrier(img, bands, tmp_mem, threading_info);
        #else
            nsp(img, bands, tmp_mem, threading_info);
        #endif
        }

        samples[attempt] = gettimer() - started;
        ++attempt;
    }

    // Median of the samples.
    qsort(samples, attempts, sizeof(long long), compare_times);
    const long long median = samples[attempts/2];

    return 1000.0*(double)median/(double)(img->size * repetitions);
}

/*
 * Times the predict + update pair separately for every tile of a
 * single-socket image.
 *
 * For each tile ATTEMPTS samples are taken; each sample is the gettimer()
 * difference across REPETITIONS runs, every run in its own parallel region.
 * The median over the samples of ALL tiles, multiplied by 1000 and divided by
 * (pixels of tile (0,0) * repetitions), is returned.
 *
 * Returns -1 when there is nothing to measure (no tiles or no attempts) or
 * when the sample buffer cannot be allocated.
 */
double measure_nsp_tiles_in_image(Image * img, Bands * bands, TmpMem * tmp_mem, const BandsThreadingInfo * threading_info)
{
    assert(threading_info->num_sockets == 1);

    const size_t repetitions = REPETITIONS;
    const size_t attempts = ATTEMPTS;
    const size_t num_samples = attempts * img[0].num_tiles;

    // Without samples there is no median to report.
    if(num_samples == 0)
        return -1;

    long long * times = malloc(num_samples * sizeof *times);
    if(!times){
        fprintf(stderr, "measure_nsp_tiles_in_image: cannot allocate %zu timing samples\n", num_samples);
        return -1;
    }

    // The result is normalised by the size of tile (0,0).
    Tile first_tile;
    get_tile(&img[0], &first_tile, 0, 0);

    for(size_t i = 0; i < img[0].tiles_per_height; ++i){
        for(size_t j = 0; j < img[0].tiles_per_width; ++j){
            Tile tile;
            TileBands tile_bands;

            get_tile(&img[0], &tile, j, i);
            get_tile_bands(&bands[0], &tile_bands, j, i);

            const size_t idx = i * img[0].tiles_per_width + j;

            for(size_t a = 0; a < attempts; ++a){
                const long long t0 = gettimer(); // ns

                for(size_t rep = 0; rep < repetitions; ++rep){
                #if defined(NO_SSE) && defined(NO_BARRIER)
                    #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
                    {
                        nsp_predict_with_unpack_no_SSE_no_barrier(&tile, &tile_bands, &threading_info[0], &tmp_mem[0]);
                        nsp_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[0], &tmp_mem[0]);
                    }
                #elif defined NO_SSE
                    #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
                    {
                        nsp_predict_with_unpack_no_SSE(&tile, &tile_bands, &threading_info[0]);
                        nsp_update_unpacked_no_SSE(&tile_bands, &threading_info[0], &tmp_mem[0], 0);
                    }
                #elif defined NO_BARRIER
                    #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
                    {
                        nsp_predict_with_unpack_no_barrier(&tile, &tile_bands, &threading_info[0], &tmp_mem[0]);
                        nsp_update_unpacked_no_barrier(&tile_bands, &threading_info[0], &tmp_mem[0]);
                    }
                #else
                    #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
                    {
                        nsp_predict_with_unpack(&tile, &tile_bands, &threading_info[0]);
                        nsp_update_unpacked(&tile_bands, &threading_info[0], &tmp_mem[0], 0);
                    }
                #endif
                }

                const long long t1 = gettimer();

                times[idx * attempts + a] = t1 - t0;
            }
        }
    }

    // Median over the samples of all tiles.
    qsort(times, num_samples, sizeof(long long), compare_times);
    const long long median = times[num_samples/2];

    free(times);
    return 1000.0*(double)median/(double)((first_tile.size_x * first_tile.size_y) * repetitions);
}
