/*
 * File:   main.c
 * Author: Pavel Najman <najman.pavel at gmail.com>
 *
 * Created on April 28, 2017, 9:39 AM
 */

#include <stdio.h>
#include <assert.h>
#include <time.h>
#include <malloc.h>
#include <stdlib.h>

#ifdef _OPENMP
    #include <omp.h>
#endif

#ifndef NO_SSE
    #ifndef NO_FMA
        #include <immintrin.h>
    #endif
#include <xmmintrin.h>
#include <smmintrin.h>
#endif

#include "common.h"

//#include <iacaMarks.h>

/*
 * Forward declarations of the transform entry point and its four per-tile
 * stages (H predict with unpack, V predict, H update, V update).
 * Exactly one family is declared, chosen by the NO_SSE and NO_BARRIER build
 * macros. The NO_BARRIER families additionally take TmpMem scratch buffers.
 */
#if defined NO_SSE && defined NO_BARRIER
    void sep_no_SSE_no_barrier(const Image * img, Bands * bands, TmpMem * tmp0, TmpMem * tmp1, const BandsThreadingInfo * threading_info);
    void sep_H_predict_with_unpack_no_SSE_no_barrier(const Tile * tile, TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0, TmpMem * tmp1);
    void sep_V_predict_unpacked_no_SSE_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0, TmpMem * tmp1);
    void sep_H_update_unpacked_no_SSE_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0);
    void sep_V_update_unpacked_no_SSE_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0);
#elif defined NO_SSE
    void sep_no_SSE(const Image * img, Bands * bands, const BandsThreadingInfo * threading_info);
    void sep_H_predict_with_unpack_no_SSE(const Tile * tile, TileBands * tile_bands, const BandsThreadingInfo * threading_info);
    void sep_V_predict_unpacked_no_SSE(TileBands * tile_bands, const BandsThreadingInfo * threading_info);
    void sep_H_update_unpacked_no_SSE(TileBands * tile_bands, const BandsThreadingInfo * threading_info);
    void sep_V_update_unpacked_no_SSE(TileBands * tile_bands, const BandsThreadingInfo * threading_info);
#elif defined NO_BARRIER
    void sep_no_barrier(const Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info);
    void sep_H_predict_with_unpack_no_barrier(const Tile * tile, TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0, TmpMem * tmp1);
    void sep_V_predict_unpacked_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0, TmpMem * tmp1);
    void sep_H_update_unpacked_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0);
    void sep_V_update_unpacked_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0);
#else
    void sep(const Image * img, Bands * bands, const BandsThreadingInfo * threading_info);
    void sep_H_predict_with_unpack(const Tile * tile, TileBands * tile_bands, const BandsThreadingInfo * threading_info);
    void sep_V_predict_unpacked(TileBands * tile_bands, const BandsThreadingInfo * threading_info);
    void sep_H_update_unpacked(TileBands * tile_bands, const BandsThreadingInfo * threading_info);
    void sep_V_update_unpacked(TileBands * tile_bands, const BandsThreadingInfo * threading_info);
#endif

/*
 * Test driver declared for this file; main() calls it once with the output
 * file name before measuring. Its definition is not in this part of the file.
 */
#ifdef NO_BARRIER
    void test_sep(const char * filename, Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info);
#else
    void test_sep(const char * filename, Image * img, Bands * bands, const BandsThreadingInfo * threading_info);
#endif

/*
 * Timing drivers. main() calls measure_sep() with a TestConfig and prints the
 * returned value divided by 1000; the other three are presumably the
 * per-configuration variants it dispatches to (definitions not visible here).
 */
#ifdef NO_BARRIER
    double measure_sep(Image * tile, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info, TestConfig config);
    double measure_sep_tile(Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info);
    double measure_sep_image(Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info);
    double measure_sep_tiles_in_image(Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info);
#else
    double measure_sep(Image * tile, Bands * bands, const BandsThreadingInfo * threading_info, TestConfig config);
    double measure_sep_tile(Image * img, Bands * bands, const BandsThreadingInfo * threading_info);
    double measure_sep_image(Image * img, Bands * bands, const BandsThreadingInfo * threading_info);
    double measure_sep_tiles_in_image(Image * img, Bands * bands, const BandsThreadingInfo * threading_info);
#endif

/*
 * Parses a strictly positive decimal integer from a command-line argument.
 *
 * text: argument string to parse.
 * what: human-readable parameter name, used only in the error message.
 * out:  receives the parsed value on success; left untouched on failure.
 *
 * Returns 0 on success. Returns -1, after printing a diagnostic to stderr,
 * when the text does not start with a digit, has trailing characters, is
 * zero, or exceeds the range the earlier atoi()-based parsing could hold.
 */
static int main_parse_positive_size(const char * text, const char * what, size_t * out)
{
    enum { MAIN_ARG_LIMIT = 0x7fffffff };
    unsigned long long value = 0;
    char * end = NULL;
    int ok = text[0] >= '0' && text[0] <= '9';

    if (ok) {
        value = strtoull(text, &end, 10);
        ok = *end == '\0' && value != 0 && value <= MAIN_ARG_LIMIT;
    }
    if (!ok) {
        fprintf(stderr, "invalid %s '%s': expected a positive integer\n", what, text);
        return -1;
    }

    *out = (size_t) value;
    return 0;
}

/*
 * Benchmark driver.
 *
 * Usage: prog [num_sockets [num_threads [size [size_y [tile_size [config [tile_size_y]]]]]]]
 *   - size sets both image dimensions; size_y then overrides the height.
 *   - tile_size sets both tile dimensions; tile_size_y overrides the height.
 *   - config is converted to TestConfig without range validation.
 *
 * Allocates one Image/Bands/BandsThreadingInfo/TmpMem set per socket, runs
 * test_sep() and measure_sep(), prints the measurement divided by 1000 and
 * releases everything. Returns EXIT_FAILURE on invalid arguments or when a
 * top-level allocation fails, EXIT_SUCCESS otherwise.
 */
int main(int argc, char** argv) {
    TestConfig config = IMAGE;
    size_t num_sockets = 1;
    size_t num_threads = 1;
    size_t size_x = 512, size_y = 512;
    size_t tile_size_x = 256, tile_size_y = 256;
    char filename[50];
    double result;
    int status = EXIT_FAILURE;

#ifdef _OPENMP
    /* <omp.h> is only included when OpenMP is enabled, so guard the call too. */
    omp_set_nested(1);
#endif

    if (argc > 1 && main_parse_positive_size(argv[1], "socket count", &num_sockets) != 0)
        return EXIT_FAILURE;

    if (argc > 2 && main_parse_positive_size(argv[2], "thread count", &num_threads) != 0)
        return EXIT_FAILURE;

    if (argc > 3) {
        if (main_parse_positive_size(argv[3], "image size", &size_x) != 0)
            return EXIT_FAILURE;
        size_y = size_x;
    }

    if (argc > 4 && main_parse_positive_size(argv[4], "image size y", &size_y) != 0)
        return EXIT_FAILURE;

    if (argc > 5) {
        if (main_parse_positive_size(argv[5], "tile size", &tile_size_x) != 0)
            return EXIT_FAILURE;
        tile_size_y = tile_size_x;
    }

    /* The valid TestConfig range is not visible here, so it is not checked. */
    if (argc > 6)
        config = (TestConfig)atoi(argv[6]);

    if (argc > 7 && main_parse_positive_size(argv[7], "tile size y", &tile_size_y) != 0)
        return EXIT_FAILURE;

    /* Checked at run time so the constraint also holds in NDEBUG builds. */
    if (size_x % 8 != 0 || size_y % 8 != 0) {
        fprintf(stderr, "transform size must be a multiple of 8 (got %zux%zu)\n", size_x, size_y);
        return EXIT_FAILURE;
    }

#ifdef NO_SSE
    Image * img = memalign(16, num_sockets * sizeof *img);
    Bands * bands = memalign(16, num_sockets * sizeof *bands);
    BandsThreadingInfo * threading_info = memalign(16, num_sockets * sizeof *threading_info);
    TmpMem * mem0 = memalign(16, num_sockets * sizeof *mem0);
    TmpMem * mem1 = memalign(16, num_sockets * sizeof *mem1);
#else
    Image * img = _mm_malloc(num_sockets * sizeof *img, 16);
    Bands * bands = _mm_malloc(num_sockets * sizeof *bands, 16);
    BandsThreadingInfo * threading_info = _mm_malloc(num_sockets * sizeof *threading_info, 16);
    TmpMem * mem0 = _mm_malloc(num_sockets * sizeof *mem0, 16);
    TmpMem * mem1 = _mm_malloc(num_sockets * sizeof *mem1, 16);
#endif

    if (img == NULL || bands == NULL || threading_info == NULL || mem0 == NULL || mem1 == NULL) {
        fprintf(stderr, "out of memory while allocating per-socket descriptors\n");
        goto release_arrays;
    }

    /*
     * One iteration per socket. With a full team schedule(static, 1) hands
     * iteration i to thread i, exactly as indexing by omp_get_thread_num()
     * did; unlike that form, every entry is still set up when the runtime
     * grants fewer threads than requested.
     */
    #pragma omp parallel for proc_bind(spread) num_threads(num_sockets) schedule(static, 1)
    for(size_t sid = 0; sid < num_sockets; ++sid){
        allocate_image(&img[sid], size_x, size_y);

        set_tile_size(&img[sid], tile_size_x, tile_size_y);

        allocate_bands(&bands[sid], &img[sid]);

        allocate_bands_threading_info(&threading_info[sid], num_threads);

        allocate_tmp_mem(&mem0[sid], num_threads, tile_size_x);

        allocate_tmp_mem(&mem1[sid], num_threads, tile_size_x);
    }

    for(size_t sid = 0; sid < num_sockets; ++sid){
        /* Re-seed before each image so every socket uses the same rand() sequence. */
        srand(0);
        init_image(&img[sid], tile_size_x, tile_size_y);
        init_bands_threading_info(&threading_info[sid], &bands[sid], num_sockets, num_threads);
    }

    snprintf(filename, sizeof filename, "%zu_%zu.mat", size_x, tile_size_x);

    #ifdef NO_BARRIER
        test_sep(filename, img, bands, mem0, mem1, threading_info);
    #else
        test_sep(filename, img, bands, threading_info);
    #endif

    #ifdef NO_BARRIER
        result = measure_sep(img, bands, mem0, mem1, threading_info, config);
    #else
        result = measure_sep(img, bands, threading_info, config);
    #endif
    printf("%f\n", result/1000.0);

    #pragma omp parallel for proc_bind(spread) num_threads(num_sockets) schedule(static, 1)
    for(size_t sid = 0; sid < num_sockets; ++sid){
        free_image(&img[sid]);
        free_bands(&bands[sid]);
        free_bands_threading_info(&threading_info[sid]);
        free_tmp_mem(&mem0[sid]);
        free_tmp_mem(&mem1[sid]);
    }

    status = EXIT_SUCCESS;

release_arrays:
#ifdef NO_SSE
    free(img);
    free(bands);
    free(threading_info);
    free(mem0);
    free(mem1);
#else
    _mm_free(img);
    _mm_free(bands);
    _mm_free(threading_info);
    _mm_free(mem0);
    _mm_free(mem1);
#endif

    return status;
}

#ifdef NO_SSE
/*
 * Scalar horizontal predict step for one coefficient column of a row pair.
 *
 * m0, m1: current position in the upper / lower interleaved source row;
 *         elements [0], [1] and [2] of each are read.
 * ll, hl: receive m0[0] and m0[1] + ALPHA * (m0[0] + m0[2]).
 * lh, hh: receive m1[0] and m1[1] + ALPHA * (m1[0] + m1[2]).
 *
 * All inputs are read before any output is written.
 */
static inline void NO_TREE_VECTORIZE sep_H_predict_with_unpack_no_SSE_kernel(float * m0, float * m1, float * ll, float * hl, float * lh, float * hh, float ALPHA)
{
    const float top_even = m0[0];
    const float top_next = m0[2];
    const float top_odd  = m0[1];
    const float bot_even = m1[0];
    const float bot_next = m1[2];
    const float bot_odd  = m1[1];

    const float top_pred = top_odd + ALPHA * (top_even + top_next);
    const float bot_pred = bot_odd + ALPHA * (bot_even + bot_next);

    ll[0] = top_even;
    hl[0] = top_pred;
    lh[0] = bot_even;
    hh[0] = bot_pred;
}

/*
 * Scalar horizontal predict step for the last column of a row pair.
 *
 * Same as the regular kernel, except that the element at offset 2 is not
 * read: the even sample at offset 0 stands in for it, so each prediction
 * uses ALPHA * (even + even). Only m0[0], m0[1], m1[0] and m1[1] are read.
 */
static inline void NO_TREE_VECTORIZE sep_H_predict_with_unpack_no_SSE_symmetric_extension_kernel(float * m0, float * m1, float * ll, float * hl, float * lh, float * hh, float ALPHA)
{
    const float top_even = m0[0];
    const float top_odd  = m0[1];
    const float bot_even = m1[0];
    const float bot_odd  = m1[1];

    const float top_pred = top_odd + ALPHA * (top_even + top_even);
    const float bot_pred = bot_odd + ALPHA * (bot_even + bot_even);

    ll[0] = top_even;
    hl[0] = top_pred;
    lh[0] = bot_even;
    hh[0] = bot_pred;
}

/*
 * Scalar vertical predict step for one coefficient.
 *
 * Updates *lh += ALPHA * (*ll + *ll1) and *hh += ALPHA * (*hl + *hl1),
 * where ll1 / hl1 point at the LL / HL sample of the following row.
 * ll, hl, ll1 and hl1 are only read.
 */
static inline void NO_TREE_VECTORIZE sep_V_predict_unpacked_no_SSE_kernel(float * ll, float * hl, float * lh, float * hh, float * ll1, float * hl1, float ALPHA)
{
    const float ll_sum = ll[0] + ll1[0];
    const float hl_sum = hl[0] + hl1[0];

    const float new_lh = lh[0] + ALPHA * ll_sum;
    const float new_hh = hh[0] + ALPHA * hl_sum;

    lh[0] = new_lh;
    hh[0] = new_hh;
}

/*
 * Scalar vertical predict step for a row without a following row.
 *
 * The current LL / HL sample is used twice in place of the missing
 * neighbour: *lh += ALPHA * (*ll + *ll), *hh += ALPHA * (*hl + *hl).
 */
static inline void NO_TREE_VECTORIZE sep_V_predict_unpacked_no_SSE_symmetric_extension_kernel(float * ll, float * hl, float * lh, float * hh, float ALPHA)
{
    const float ll_cur = ll[0];
    const float hl_cur = hl[0];

    const float new_lh = lh[0] + ALPHA * (ll_cur + ll_cur);
    const float new_hh = hh[0] + ALPHA * (hl_cur + hl_cur);

    lh[0] = new_lh;
    hh[0] = new_hh;
}

/*
 * Scalar horizontal update step for one coefficient.
 *
 * Updates *ll += BETA * (hl[0] + hl[-1]) and *lh += BETA * (hh[0] + hh[-1]).
 * The element to the left of hl and hh is read, so the caller must not use
 * this kernel for the first column.
 */
static inline void NO_TREE_VECTORIZE sep_H_update_unpacked_no_SSE_kernel(float * ll, float * hl, float * lh, float * hh, float BETA)
{
    const float hl_pair = hl[0] + hl[-1];
    const float hh_pair = hh[0] + hh[-1];

    const float new_ll = ll[0] + BETA * hl_pair;
    const float new_lh = lh[0] + BETA * hh_pair;

    ll[0] = new_ll;
    lh[0] = new_lh;
}

/*
 * Scalar horizontal update step for a column without a left neighbour.
 *
 * The current HL / HH sample is used twice:
 * *ll += BETA * (*hl + *hl), *lh += BETA * (*hh + *hh).
 */
static inline void NO_TREE_VECTORIZE sep_H_update_unpacked_no_SSE_symmetric_extension_kernel(float * ll, float * hl, float * lh, float * hh, float BETA)
{
    const float hl_cur = hl[0];
    const float hh_cur = hh[0];

    const float new_ll = ll[0] + BETA * (hl_cur + hl_cur);
    const float new_lh = lh[0] + BETA * (hh_cur + hh_cur);

    ll[0] = new_ll;
    lh[0] = new_lh;
}

/*
 * Scalar vertical update step for one coefficient.
 *
 * Updates *ll += BETA * (*lh + *lh1) and *hl += BETA * (*hh + *hh1),
 * where lh1 / hh1 point at the LH / HH sample of the neighbouring row
 * supplied by the caller. lh, hh, lh1 and hh1 are only read.
 */
static inline void NO_TREE_VECTORIZE sep_V_update_unpacked_no_SSE_kernel(float * ll, float * hl, float * lh, float * hh, float * lh1, float * hh1, float BETA)
{
    const float lh_pair = lh[0] + lh1[0];
    const float hh_pair = hh[0] + hh1[0];

    const float new_ll = ll[0] + BETA * lh_pair;
    const float new_hl = hl[0] + BETA * hh_pair;

    ll[0] = new_ll;
    hl[0] = new_hl;
}

/*
 * Scalar vertical update step for a row without a neighbouring row.
 *
 * The current LH / HH sample is used twice:
 * *ll += BETA * (*lh + *lh), *hl += BETA * (*hh + *hh).
 */
static inline void NO_TREE_VECTORIZE sep_V_update_unpacked_no_SSE_symmetric_extension_kernel(float * ll, float * hl, float * lh, float * hh, float BETA)
{
    const float lh_cur = lh[0];
    const float hh_cur = hh[0];

    const float new_ll = ll[0] + BETA * (lh_cur + lh_cur);
    const float new_hl = hl[0] + BETA * (hh_cur + hh_cur);

    ll[0] = new_ll;
    hl[0] = new_hl;
}
#else
/*
 * Vectorised horizontal predict step with de-interleaving.
 *
 * R0C0/R0C1 and R1C0/R1C1 hold eight consecutive samples of the upper and
 * lower source row; R0C2/R1C2 hold the vectors that follow them, of which
 * only lane 0 is used. Even-indexed samples become LL (upper row) and LH
 * (lower row); odd-indexed samples plus ALPHA * (even + next even) become
 * HL and HH. Four floats are written to each of ll, hl, lh, hh with aligned
 * stores.
 */
static inline void sep_H_predict_with_unpack_kernel(__m128 R0C0, __m128 R0C1, __m128 R0C2, __m128 R1C0, __m128 R1C1, __m128 R1C2, float * ll, float * hl, float * lh, float * hh, __m128 ALPHA)
{
    /* Split each row into its even-indexed and odd-indexed samples. */
    const __m128 top_even = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 top_odd  = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 bot_even = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 bot_odd  = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(3, 1, 3, 1));

    /* Right-hand neighbours: move every lane down by one and place lane 0 of
       the following vector into lane 3. */
    const __m128 top_shift = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(top_even), 4));
    const __m128 top_right = _mm_insert_ps(top_shift, R0C2, 0x30);
    const __m128 bot_shift = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(bot_even), 4));
    const __m128 bot_right = _mm_insert_ps(bot_shift, R1C2, 0x30);

    const __m128 top_sum = _mm_add_ps(top_even, top_right);
    const __m128 bot_sum = _mm_add_ps(bot_even, bot_right);

#ifdef NO_FMA
    const __m128 top_pred = _mm_add_ps(_mm_mul_ps(top_sum, ALPHA), top_odd);
    const __m128 bot_pred = _mm_add_ps(_mm_mul_ps(bot_sum, ALPHA), bot_odd);
#else
    const __m128 top_pred = _mm_fmadd_ps(top_sum, ALPHA, top_odd);
    const __m128 bot_pred = _mm_fmadd_ps(bot_sum, ALPHA, bot_odd);
#endif

    _mm_store_ps(ll, top_even);
    _mm_store_ps(hl, top_pred);
    _mm_store_ps(lh, bot_even);
    _mm_store_ps(hh, bot_pred);
}

/*
 * Vectorised horizontal predict step for the last eight samples of a row
 * pair.
 *
 * Works like the regular kernel, but there is no following vector: after the
 * even samples are shifted down one lane, lane 2 of the shifted vector (the
 * last even sample) is copied into lane 3, so the last prediction uses that
 * sample twice.
 */
static inline void sep_H_predict_with_unpack_symmetric_extension_kernel(__m128 R0C0, __m128 R0C1, __m128 R1C0, __m128 R1C1, float * ll, float * hl, float * lh, float * hh, __m128 ALPHA)
{
    /* Split each row into its even-indexed and odd-indexed samples. */
    const __m128 top_even = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 top_odd  = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 bot_even = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 bot_odd  = _mm_shuffle_ps(R1C0, R1C1, _MM_SHUFFLE(3, 1, 3, 1));

    /* 0xB0: take lane 2 of the source, write it to lane 3, zero nothing. */
    const __m128 top_shift = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(top_even), 4));
    const __m128 top_right = _mm_insert_ps(top_shift, top_shift, 0xB0);
    const __m128 bot_shift = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(bot_even), 4));
    const __m128 bot_right = _mm_insert_ps(bot_shift, bot_shift, 0xB0);

    const __m128 top_sum = _mm_add_ps(top_even, top_right);
    const __m128 bot_sum = _mm_add_ps(bot_even, bot_right);

#ifdef NO_FMA
    const __m128 top_pred = _mm_add_ps(_mm_mul_ps(top_sum, ALPHA), top_odd);
    const __m128 bot_pred = _mm_add_ps(_mm_mul_ps(bot_sum, ALPHA), bot_odd);
#else
    const __m128 top_pred = _mm_fmadd_ps(top_sum, ALPHA, top_odd);
    const __m128 bot_pred = _mm_fmadd_ps(bot_sum, ALPHA, bot_odd);
#endif

    _mm_store_ps(ll, top_even);
    _mm_store_ps(hl, top_pred);
    _mm_store_ps(lh, bot_even);
    _mm_store_ps(hh, bot_pred);
}

/*
 * Vectorised vertical predict step for four coefficients.
 *
 * lh[i] += ALPHA * (ll[i] + ll1[i]) and hh[i] += ALPHA * (hl[i] + hl1[i])
 * for i in 0..3. All six pointers are accessed with aligned loads/stores,
 * so they must be 16-byte aligned.
 */
static inline void sep_V_predict_unpacked_kernel(float * ll, float * hl, float * lh, float * hh, float * ll1, float * hl1, __m128 ALPHA)
{
    const __m128 ll_sum = _mm_add_ps(_mm_load_ps(ll), _mm_load_ps(ll1));
    const __m128 hl_sum = _mm_add_ps(_mm_load_ps(hl), _mm_load_ps(hl1));
    const __m128 lh_old = _mm_load_ps(lh);
    const __m128 hh_old = _mm_load_ps(hh);

#ifdef NO_FMA
    const __m128 lh_new = _mm_add_ps(_mm_mul_ps(ll_sum, ALPHA), lh_old);
    const __m128 hh_new = _mm_add_ps(_mm_mul_ps(hl_sum, ALPHA), hh_old);
#else
    const __m128 lh_new = _mm_fmadd_ps(ll_sum, ALPHA, lh_old);
    const __m128 hh_new = _mm_fmadd_ps(hl_sum, ALPHA, hh_old);
#endif

    _mm_store_ps(lh, lh_new);
    _mm_store_ps(hh, hh_new);
}

/*
 * Vectorised vertical predict step for a row without a following row.
 *
 * The current LL / HL vector is added to itself in place of the missing
 * neighbour: lh[i] += ALPHA * (ll[i] + ll[i]),
 * hh[i] += ALPHA * (hl[i] + hl[i]) for i in 0..3. Aligned access only.
 */
static inline void sep_V_predict_unpacked_symmetric_extension_kernel(float * ll, float * hl, float * lh, float * hh, __m128 ALPHA)
{
    const __m128 ll_cur = _mm_load_ps(ll);
    const __m128 hl_cur = _mm_load_ps(hl);
    const __m128 lh_old = _mm_load_ps(lh);
    const __m128 hh_old = _mm_load_ps(hh);

    const __m128 ll_twice = _mm_add_ps(ll_cur, ll_cur);
    const __m128 hl_twice = _mm_add_ps(hl_cur, hl_cur);

#ifdef NO_FMA
    const __m128 lh_new = _mm_add_ps(_mm_mul_ps(ll_twice, ALPHA), lh_old);
    const __m128 hh_new = _mm_add_ps(_mm_mul_ps(hl_twice, ALPHA), hh_old);
#else
    const __m128 lh_new = _mm_fmadd_ps(ll_twice, ALPHA, lh_old);
    const __m128 hh_new = _mm_fmadd_ps(hl_twice, ALPHA, hh_old);
#endif

    _mm_store_ps(lh, lh_new);
    _mm_store_ps(hh, hh_new);
}

/*
 * Vectorised horizontal update step for four coefficients.
 *
 * C0 / X0 are the vectors combined with ll / lh; C1 / X1 supply only their
 * lane 3, which becomes the left neighbour of lane 0. For every lane the
 * result is ll[i] += BETA * (C0[i] + left(C0)[i]) and
 * lh[i] += BETA * (X0[i] + left(X0)[i]). ll and lh need 16-byte alignment.
 */
static inline void sep_H_update_unpacked_kernel(__m128 C0, __m128 X0, __m128 C1, __m128 X1, float * ll, float *lh, __m128 BETA)
{
    const __m128 ll_old = _mm_load_ps(ll);
    const __m128 lh_old = _mm_load_ps(lh);

    /* Move every lane up by one; 0xC0 then copies lane 3 of the second
       operand into the vacated lane 0. */
    const __m128 c_shift = _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(C0), 4));
    const __m128 x_shift = _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(X0), 4));
    const __m128 c_left = _mm_insert_ps(c_shift, C1, 0xC0);
    const __m128 x_left = _mm_insert_ps(x_shift, X1, 0xC0);

    const __m128 c_sum = _mm_add_ps(C0, c_left);
    const __m128 x_sum = _mm_add_ps(X0, x_left);

#ifdef NO_FMA
    const __m128 ll_new = _mm_add_ps(_mm_mul_ps(c_sum, BETA), ll_old);
    const __m128 lh_new = _mm_add_ps(_mm_mul_ps(x_sum, BETA), lh_old);
#else
    const __m128 ll_new = _mm_fmadd_ps(c_sum, BETA, ll_old);
    const __m128 lh_new = _mm_fmadd_ps(x_sum, BETA, lh_old);
#endif

    _mm_store_ps(ll, ll_new);
    _mm_store_ps(lh, lh_new);
}

/*
 * Vectorised horizontal update step for the first four coefficients of a row.
 *
 * There is no vector to the left, so after shifting C0 / X0 up by one lane
 * their own lane 0 is re-inserted into lane 0; the first coefficient thus
 * uses its sample twice. ll and lh need 16-byte alignment.
 */
static inline void sep_H_update_unpacked_symmetric_extension_kernel(__m128 C0, __m128 X0, float * ll, float *lh, __m128 BETA)
{
    const __m128 ll_old = _mm_load_ps(ll);
    const __m128 lh_old = _mm_load_ps(lh);

    /* Immediate 0: lane 0 of the second operand goes to lane 0. */
    const __m128 c_shift = _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(C0), 4));
    const __m128 x_shift = _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(X0), 4));
    const __m128 c_left = _mm_insert_ps(c_shift, C0, 0);
    const __m128 x_left = _mm_insert_ps(x_shift, X0, 0);

    const __m128 c_sum = _mm_add_ps(C0, c_left);
    const __m128 x_sum = _mm_add_ps(X0, x_left);

#ifdef NO_FMA
    const __m128 ll_new = _mm_add_ps(_mm_mul_ps(c_sum, BETA), ll_old);
    const __m128 lh_new = _mm_add_ps(_mm_mul_ps(x_sum, BETA), lh_old);
#else
    const __m128 ll_new = _mm_fmadd_ps(c_sum, BETA, ll_old);
    const __m128 lh_new = _mm_fmadd_ps(x_sum, BETA, lh_old);
#endif

    _mm_store_ps(ll, ll_new);
    _mm_store_ps(lh, lh_new);
}

/*
 * Vectorised vertical update step for four coefficients.
 *
 * ll[i] += BETA * (lh[i] + lh1[i]) and hl[i] += BETA * (hh[i] + hh1[i])
 * for i in 0..3, where lh1 / hh1 point at the neighbouring row chosen by the
 * caller. All pointers are used with aligned loads/stores.
 */
static inline void sep_V_update_unpacked_kernel(float * ll, float * hl, float * lh, float * hh, float * lh1, float * hh1, __m128 BETA)
{
    const __m128 lh_sum = _mm_add_ps(_mm_load_ps(lh), _mm_load_ps(lh1));
    const __m128 hh_sum = _mm_add_ps(_mm_load_ps(hh), _mm_load_ps(hh1));
    const __m128 ll_old = _mm_load_ps(ll);
    const __m128 hl_old = _mm_load_ps(hl);

#ifdef NO_FMA
    const __m128 ll_new = _mm_add_ps(_mm_mul_ps(lh_sum, BETA), ll_old);
    const __m128 hl_new = _mm_add_ps(_mm_mul_ps(hh_sum, BETA), hl_old);
#else
    const __m128 ll_new = _mm_fmadd_ps(lh_sum, BETA, ll_old);
    const __m128 hl_new = _mm_fmadd_ps(hh_sum, BETA, hl_old);
#endif

    _mm_store_ps(ll, ll_new);
    _mm_store_ps(hl, hl_new);
}

/*
 * Vectorised vertical update step for a row without a neighbouring row.
 *
 * The current LH / HH vector is added to itself:
 * ll[i] += BETA * (lh[i] + lh[i]), hl[i] += BETA * (hh[i] + hh[i])
 * for i in 0..3. Aligned access only.
 */
static inline void sep_V_update_unpacked_symmetric_extension_kernel(float * ll, float * hl, float * lh, float * hh, __m128 BETA)
{
    const __m128 lh_cur = _mm_load_ps(lh);
    const __m128 hh_cur = _mm_load_ps(hh);
    const __m128 ll_old = _mm_load_ps(ll);
    const __m128 hl_old = _mm_load_ps(hl);

    const __m128 lh_twice = _mm_add_ps(lh_cur, lh_cur);
    const __m128 hh_twice = _mm_add_ps(hh_cur, hh_cur);

#ifdef NO_FMA
    const __m128 ll_new = _mm_add_ps(_mm_mul_ps(lh_twice, BETA), ll_old);
    const __m128 hl_new = _mm_add_ps(_mm_mul_ps(hh_twice, BETA), hl_old);
#else
    const __m128 ll_new = _mm_fmadd_ps(lh_twice, BETA, ll_old);
    const __m128 hl_new = _mm_fmadd_ps(hh_twice, BETA, hl_old);
#endif

    _mm_store_ps(ll, ll_new);
    _mm_store_ps(hl, hl_new);
}
#endif

#if defined NO_SSE && defined NO_BARRIER
void NO_TREE_VECTORIZE sep_no_SSE_no_barrier(const Image * img, Bands * bands, TmpMem * tmp0, TmpMem * tmp1, const BandsThreadingInfo * threading_info)
{
    if(threading_info->num_sockets > 1){
        #   pragma omp parallel proc_bind(spread) num_threads(threading_info->num_sockets)
        {
            size_t tid = (size_t)omp_get_thread_num();

            Chunk chunk;
            init_chunk(&chunk, &img[tid], &threading_info[tid], tid);

            #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
            {
                for(size_t idx = chunk.start_index; idx < chunk.end_index; ++idx){
                    size_t i = idx / img[tid].tiles_per_width;
                    size_t j = idx % img[tid].tiles_per_width;

                    Tile tile;
                    TileBands tile_bands;

                    get_tile(&img[tid], &tile, j, i);
                    get_tile_bands(&bands[tid], &tile_bands, j, i);

                    sep_H_predict_with_unpack_no_SSE_no_barrier(&tile, &tile_bands, &threading_info[tid], &tmp0[tid], &tmp1[tid]);
                    sep_V_predict_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[tid], &tmp0[tid], &tmp1[tid]);
                    sep_H_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[tid], &tmp0[tid]);
                    sep_V_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[tid], &tmp0[tid]);
                }
            }
        }
    } else {
        size_t tid = 0;

        Chunk chunk;
        init_chunk(&chunk, &img[tid], &threading_info[tid], tid);

        #   pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
        {
            for(size_t idx = chunk.start_index; idx < chunk.end_index; ++idx){
                size_t i = idx / img[tid].tiles_per_width;
                size_t j = idx % img[tid].tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(&img[tid], &tile, j, i);
                get_tile_bands(&bands[tid], &tile_bands, j, i);

                sep_H_predict_with_unpack_no_SSE_no_barrier(&tile, &tile_bands, &threading_info[tid], &tmp0[tid], &tmp1[tid]);
                sep_V_predict_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[tid], &tmp0[tid], &tmp1[tid]);
                sep_H_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[tid], &tmp0[tid]);
                sep_V_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[tid], &tmp0[tid]);
            }
        }
    }
}

void NO_TREE_VECTORIZE sep_H_predict_with_unpack_no_SSE_no_barrier(const Tile * tile, TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0, TmpMem * tmp1)
{
    const float ALPHA = -0.5f;

    const size_t size_x = tile->size_x;
    const size_t stride_y = tile->stride_y;
    float * mem = tile->data;

    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y);
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t band_start_y = threading_info->band_start_y[tid];
    const size_t band_end_y = threading_info->band_end_y[tid];

    const size_t img_start_y = band_start_y << 1;
    const size_t img_end_y = band_end_y << 1;

    const size_t next_band_y = band_stride_y - tile_bands->size_x + 1;
    const size_t next_tile_y = 2 * stride_y - size_x + 2;

    float * m0 = mem + img_start_y * stride_y;
    float * m1 = mem + (img_start_y+1) * stride_y;

    float * ll = tile_bands->LL + band_start_y * band_stride_y;
    float * hl = tile_bands->HL + band_start_y * band_stride_y;
    float * lh = tile_bands->LH + band_start_y * band_stride_y;
    float * hh = tile_bands->HH + band_start_y * band_stride_y;

    for(size_t y = img_start_y; y < img_end_y; y += 2){
        for(size_t x = 0; x < size_x-2; x+=2){
            sep_H_predict_with_unpack_no_SSE_kernel(m0, m1, ll, hl, lh, hh, ALPHA);

            ++ll; ++hl; ++hh; ++lh;
            m0+=2; m1+=2;
        }
        sep_H_predict_with_unpack_no_SSE_symmetric_extension_kernel(m0, m1, ll, hl, lh, hh, ALPHA);

        ll += next_band_y; hl += next_band_y; lh += next_band_y; hh += next_band_y;
        m0 += next_tile_y; m1 += next_tile_y;
    }

    // redundant computation
    lh = tmp0->LH + tid * tmp0->band_size_x;
    hh = tmp0->HH + tid * tmp0->band_size_x;
    ll = tmp0->LL + tid * tmp0->band_size_x;
    hl = tmp0->HL + tid * tmp0->band_size_x;

    if(tid != 0){
        size_t y = img_start_y - 2;
        m0 = mem + (y) * stride_y;
        m1 = mem + (y+1) * stride_y;

        for(size_t x = 0; x < size_x-2; x+=2){
            sep_H_predict_with_unpack_no_SSE_kernel(m0, m1, ll, hl, lh, hh, ALPHA);

            ++ll; ++hl; ++hh; ++lh;
            m0+=2; m1+=2;
        }
        sep_H_predict_with_unpack_no_SSE_symmetric_extension_kernel(m0, m1, ll, hl, lh, hh, ALPHA);
    }

    float LL00, LL01, HL00;

    ll = tmp1->LL + tid * tmp1->band_size_x;
    hl = tmp1->HL + tid * tmp1->band_size_x;
    if(tid != num_threads - 1){
        m0 = mem + img_end_y * stride_y;
        for(size_t x = 0; x < size_x-2; x+=2){
            LL00 = *(m0);
            LL01 = *(m0+2);
            HL00 = *(m0+1);

            HL00 += ALPHA * (LL00 + LL01);

            *ll = LL00; *hl = HL00;

            ++ll; ++hl;
            m0 += 2;
        }

        LL00 = *(m0); LL01 = *(m0);
        HL00 = *(m0+1);

        HL00 += ALPHA * (LL00 + LL01);

        *ll = LL00; *hl = HL00;
    }
}

/*
 * Vertical predict stage (scalar, barrier-free) for the calling thread's
 * rows of one tile.
 *
 * - All rows of the thread except its last are predicted from the LL / HL
 *   row directly below them in the tile bands.
 * - The thread's last row uses the symmetric-extension kernel on the last
 *   thread, and the thread's own row of tmp1 as "row below" otherwise.
 * - Every thread except thread 0 finally predicts its own row of tmp0,
 *   using the first band row of its range as "row below".
 */
void NO_TREE_VECTORIZE sep_V_predict_unpacked_no_SSE_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0, TmpMem * tmp1)
{
    const float alpha = -0.5f;

    const size_t width = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t stride = tile_bands->stride_y;
    float * const band_ll = tile_bands->LL;
    float * const band_hl = tile_bands->HL;
    float * const band_lh = tile_bands->LH;
    float * const band_hh = tile_bands->HH;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y);
    const size_t me = (size_t)omp_get_thread_num();

    const size_t first_row = threading_info->band_start_y[me];
    const size_t end_row = threading_info->band_end_y[me];

    /* Rows that have their successor inside the tile bands. */
    size_t row = first_row;
    for(; row < end_row - 1; ++row){
        const size_t cur = row * stride;
        const size_t below = (row + 1) * stride;

        for(size_t x = 0; x < width; ++x){
            sep_V_predict_unpacked_no_SSE_kernel(band_ll + cur + x, band_hl + cur + x, band_lh + cur + x, band_hh + cur + x, band_ll + below + x, band_hl + below + x, alpha);
        }
    }

    // last row of this thread (`row` is where the loop above stopped)
    {
        float * const ll = band_ll + row * stride;
        float * const hl = band_hl + row * stride;
        float * const lh = band_lh + row * stride;
        float * const hh = band_hh + row * stride;

        if(me == num_threads - 1){
            for(size_t x = 0; x < width; ++x){
                sep_V_predict_unpacked_no_SSE_symmetric_extension_kernel(ll + x, hl + x, lh + x, hh + x, alpha);
            }
        } else {
            float * const below_ll = tmp1->LL + me * tmp1->band_size_x;
            float * const below_hl = tmp1->HL + me * tmp1->band_size_x;

            for(size_t x = 0; x < width; ++x){
                sep_V_predict_unpacked_no_SSE_kernel(ll + x, hl + x, lh + x, hh + x, below_ll + x, below_hl + x, alpha);
            }
        }
    }

    // redundant computation: predict the private copy of the row above
    if(me != 0){
        float * const ll = tmp0->LL + me * tmp0->band_size_x;
        float * const hl = tmp0->HL + me * tmp0->band_size_x;
        float * const lh = tmp0->LH + me * tmp0->band_size_x;
        float * const hh = tmp0->HH + me * tmp0->band_size_x;

        float * const below_ll = band_ll + first_row * stride;
        float * const below_hl = band_hl + first_row * stride;

        for(size_t x = 0; x < width; ++x){
            sep_V_predict_unpacked_no_SSE_kernel(ll + x, hl + x, lh + x, hh + x, below_ll + x, below_hl + x, alpha);
        }
    }
}

/*
 * Horizontal update stage (scalar, barrier-free) for the calling thread's
 * rows of one tile.
 *
 * In every row the first column goes through the symmetric-extension kernel
 * and the remaining columns through the regular kernel, which reads the
 * HL / HH sample one column to the left. Every thread except thread 0 then
 * applies the same update to its own row of tmp0.
 */
void NO_TREE_VECTORIZE sep_H_update_unpacked_no_SSE_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0)
{
    const float beta = 0.25f;

    const size_t width = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t stride = tile_bands->stride_y;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y);
    const size_t me = (size_t)omp_get_thread_num();

    const size_t first_row = threading_info->band_start_y[me];
    const size_t end_row = threading_info->band_end_y[me];

    /* Distance from the end of one row to the start of the next. */
    const size_t row_gap = stride - width;

    float * const band_ll = tile_bands->LL;
    float * const band_hl = tile_bands->HL;
    float * const band_lh = tile_bands->LH;
    float * const band_hh = tile_bands->HH;

    size_t off = first_row * stride;

    for(size_t y = first_row; y < end_row; ++y){
        size_t col;

        // first column
        sep_H_update_unpacked_no_SSE_symmetric_extension_kernel(band_ll + off, band_hl + off, band_lh + off, band_hh + off, beta);

        // main area
        for(col = 1; col < width; ++col){
            sep_H_update_unpacked_no_SSE_kernel(band_ll + off + col, band_hl + off + col, band_lh + off + col, band_hh + off + col, beta);
        }

        /* `col` is the number of columns just processed. */
        off += col + row_gap;
    }

    // redundant computation: private copy of the row above
    if(me != 0){
        float * const ll = tmp0->LL + me * tmp0->band_size_x;
        float * const hl = tmp0->HL + me * tmp0->band_size_x;
        float * const lh = tmp0->LH + me * tmp0->band_size_x;
        float * const hh = tmp0->HH + me * tmp0->band_size_x;

        // first column
        sep_H_update_unpacked_no_SSE_symmetric_extension_kernel(ll, hl, lh, hh, beta);

        // main area
        for(size_t col = 1; col < width; ++col){
            sep_H_update_unpacked_no_SSE_kernel(ll + col, hl + col, lh + col, hh + col, beta);
        }
    }
}

/*
 * Vertical update stage (scalar, barrier-free) for the calling thread's rows
 * of one tile.
 *
 * The first row of the thread's range takes its "row above" from the
 * thread's own row of tmp0; on thread 0 the symmetric-extension kernel is
 * used instead. Every further row is updated from the LH / HH band row
 * directly above it.
 */
void NO_TREE_VECTORIZE sep_V_update_unpacked_no_SSE_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0)
{
    const float beta = 0.25f;

    const size_t width = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t stride = tile_bands->stride_y;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= band_size_y);
    const size_t me = (size_t)omp_get_thread_num();

    const size_t first_row = threading_info->band_start_y[me];
    const size_t end_row = threading_info->band_end_y[me];

    float * const band_ll = tile_bands->LL;
    float * const band_hl = tile_bands->HL;
    float * const band_lh = tile_bands->LH;
    float * const band_hh = tile_bands->HH;

    // first row of this thread
    {
        float * const ll = band_ll + first_row * stride;
        float * const hl = band_hl + first_row * stride;
        float * const lh = band_lh + first_row * stride;
        float * const hh = band_hh + first_row * stride;

        if(me != 0){
            float * const above_lh = tmp0->LH + me * tmp0->band_size_x;
            float * const above_hh = tmp0->HH + me * tmp0->band_size_x;

            for(size_t x = 0; x < width; ++x){
                sep_V_update_unpacked_no_SSE_kernel(ll + x, hl + x, lh + x, hh + x, above_lh + x, above_hh + x, beta);
            }
        } else {
            for(size_t x = 0; x < width; ++x){
                sep_V_update_unpacked_no_SSE_symmetric_extension_kernel(ll + x, hl + x, lh + x, hh + x, beta);
            }
        }
    }

    // remaining rows: the row above lives in the tile bands
    for(size_t y = first_row + 1; y < end_row; ++y){
        const size_t cur = y * stride;
        const size_t above = (y - 1) * stride;

        for(size_t x = 0; x < width; ++x){
            sep_V_update_unpacked_no_SSE_kernel(band_ll + cur + x, band_hl + cur + x, band_lh + cur + x, band_hh + cur + x, band_lh + above + x, band_hh + above + x, beta);
        }
    }
}
#elif defined NO_SSE
/*
 * Scalar (NO_SSE) transform driver, barrier-synchronised variant.
 *
 * Single socket (num_sockets <= 1): only element 0 of img, bands and
 * threading_info is used.  Several sockets: an outer parallel region with
 * num_sockets threads (proc_bind(spread)) is opened and outer thread s works
 * on img[s], bands[s] and threading_info[s].
 *
 * In both cases a chunk of tile indices is obtained from init_chunk() and an
 * inner parallel region of threading_info->num_threads threads is started.
 * Every inner thread walks all tile indices of the chunk and calls the four
 * passes in order (H predict with unpack, V predict, H update, V update);
 * the passes pick their rows by OpenMP thread number.
 *
 * NOTE(review): the inner team size is read from threading_info[0] on the
 * multi-socket path as well -- confirm that all sockets use the same count.
 */
void NO_TREE_VECTORIZE sep_no_SSE(const Image * img, Bands * bands, const BandsThreadingInfo * threading_info)
{
    if(threading_info->num_sockets <= 1){
        // single-socket path: everything comes from element 0
        const size_t socket_id = 0;
        const Image * const socket_img = &img[socket_id];
        Bands * const socket_bands = &bands[socket_id];
        const BandsThreadingInfo * const socket_info = &threading_info[socket_id];

        Chunk work;
        init_chunk(&work, socket_img, socket_info, socket_id);

        #pragma omp parallel num_threads(threading_info->num_threads)
        {
            for(size_t t = work.start_index; t < work.end_index; ++t){
                // linear tile index -> (row, column) of the tile grid
                const size_t tile_row = t / socket_img->tiles_per_width;
                const size_t tile_col = t % socket_img->tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(socket_img, &tile, tile_col, tile_row);
                get_tile_bands(socket_bands, &tile_bands, tile_col, tile_row);

                sep_H_predict_with_unpack_no_SSE(&tile, &tile_bands, socket_info);
                sep_V_predict_unpacked_no_SSE(&tile_bands, socket_info);
                sep_H_update_unpacked_no_SSE(&tile_bands, socket_info);
                sep_V_update_unpacked_no_SSE(&tile_bands, socket_info);
            }
        }
        return;
    }

    // multi-socket path: one outer thread per socket
    #pragma omp parallel proc_bind(spread) num_threads(threading_info->num_sockets)
    {
        const size_t socket_id = (size_t)omp_get_thread_num();
        const Image * const socket_img = &img[socket_id];
        Bands * const socket_bands = &bands[socket_id];
        const BandsThreadingInfo * const socket_info = &threading_info[socket_id];

        Chunk work;
        init_chunk(&work, socket_img, socket_info, socket_id);

        #pragma omp parallel num_threads(threading_info->num_threads)
        {
            for(size_t t = work.start_index; t < work.end_index; ++t){
                // linear tile index -> (row, column) of the tile grid
                const size_t tile_row = t / socket_img->tiles_per_width;
                const size_t tile_col = t % socket_img->tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(socket_img, &tile, tile_col, tile_row);
                get_tile_bands(socket_bands, &tile_bands, tile_col, tile_row);

                sep_H_predict_with_unpack_no_SSE(&tile, &tile_bands, socket_info);
                sep_V_predict_unpacked_no_SSE(&tile_bands, socket_info);
                sep_H_update_unpacked_no_SSE(&tile_bands, socket_info);
                sep_V_update_unpacked_no_SSE(&tile_bands, socket_info);
            }
        }
    }
}

/*
 * Scalar horizontal predict pass that reads the tile and writes the four
 * bands (LL, HL, LH, HH).
 *
 * Must be called from inside an OpenMP parallel region: the calling thread
 * handles band rows [band_start_y[tid], band_end_y[tid]) taken from
 * threading_info, i.e. tile rows 2*start .. 2*end.  Two tile rows (m0, m1)
 * are consumed per band row; each kernel call advances the tile pointers by
 * two floats and the band pointers by one.  The last step of every row is
 * done by the symmetric-extension kernel.  The function ends with an
 * OpenMP barrier.
 *
 * ALPHA = -0.5 is handed to the kernels unchanged (presumably the predict
 * lifting coefficient -- the kernels are defined elsewhere).
 */
void NO_TREE_VECTORIZE sep_H_predict_with_unpack_no_SSE(const Tile * tile, TileBands * tile_bands, const BandsThreadingInfo * threading_info)
{
    const float ALPHA = -0.5f;

    // tile geometry
    float * mem = tile->data;
    const size_t size_x = tile->size_x;
    const size_t stride_y = tile->stride_y;

    // band geometry
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    // The symmetric-extension step does not advance the pointers, so the
    // end-of-row skips include that step (+1 band float, +2 tile floats).
    const size_t band_row_skip = band_stride_y - tile_bands->size_x + 1;
    const size_t tile_row_skip = 2 * stride_y - size_x + 2;

    // two consecutive tile rows feed one band row
    float * m0 = mem + (2 * row_begin) * stride_y;
    float * m1 = m0 + stride_y;

    const size_t first_row_offset = row_begin * band_stride_y;
    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    for(size_t row = row_begin; row < row_end; ++row){
        for(size_t col = 0; col < size_x-2; col += 2){
            sep_H_predict_with_unpack_no_SSE_kernel(m0, m1, ll, hl, lh, hh, ALPHA);

            m0 += 2; m1 += 2;
            ll += 1; hl += 1; lh += 1; hh += 1;
        }

        // last step of the row
        sep_H_predict_with_unpack_no_SSE_symmetric_extension_kernel(m0, m1, ll, hl, lh, hh, ALPHA);

        m0 += tile_row_skip; m1 += tile_row_skip;
        ll += band_row_skip; hl += band_row_skip; lh += band_row_skip; hh += band_row_skip;
    }

    #pragma omp barrier
}

/*
 * Scalar vertical predict pass over the already unpacked bands.
 *
 * Must be called from inside an OpenMP parallel region.  The calling thread
 * handles band rows [band_start_y[tid], band_end_y[tid]).  For every element
 * the regular kernel receives the current LL/HL/LH/HH positions plus the
 * LL/HL positions one row further down.  The last thread of the team runs
 * the regular kernel only up to its second-to-last row and finishes its
 * final row with the symmetric-extension kernel, which takes no row below.
 * All other threads run the regular kernel on their whole range, so their
 * last row reads LL/HL from the row at band_end_y.  Ends with a barrier.
 */
void NO_TREE_VECTORIZE sep_V_predict_unpacked_no_SSE(TileBands * tile_bands, const BandsThreadingInfo * threading_info)
{
    const float ALPHA = -0.5f;

    const size_t band_size_x = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    // only the last thread treats its final row specially
    const int is_last_thread = (thread_id == team_size - 1);
    const size_t regular_end = is_last_thread ? row_end - 1 : row_end;

    const size_t row_skip = band_stride_y - band_size_x;
    const size_t first_row_offset = row_begin * band_stride_y;

    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    // same columns, one row further down
    float * ll_below = ll + band_stride_y;
    float * hl_below = hl + band_stride_y;

    for(size_t row = row_begin; row < regular_end; ++row){
        for(size_t col = 0; col < band_size_x; ++col){
            sep_V_predict_unpacked_no_SSE_kernel(ll, hl, lh, hh, ll_below, hl_below, ALPHA);

            ll += 1; hl += 1; lh += 1; hh += 1;
            ll_below += 1; hl_below += 1;
        }

        ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;
        ll_below += row_skip; hl_below += row_skip;
    }

    if(is_last_thread){
        // final row of the last thread
        for(size_t col = 0; col < band_size_x; ++col){
            sep_V_predict_unpacked_no_SSE_symmetric_extension_kernel(ll, hl, lh, hh, ALPHA);

            ll += 1; hl += 1; lh += 1; hh += 1;
        }
    }

    #pragma omp barrier
}

/*
 * Scalar horizontal update pass over the unpacked bands.
 *
 * Must be called from inside an OpenMP parallel region.  The calling thread
 * handles band rows [band_start_y[tid], band_end_y[tid]).  In every row the
 * first column is handled by the symmetric-extension kernel and the
 * remaining band_size_x - 1 columns by the regular kernel; both receive the
 * current LL/HL/LH/HH positions and BETA = 0.25 (presumably the update
 * lifting coefficient -- the kernels are defined elsewhere).  Ends with a
 * barrier.
 */
void NO_TREE_VECTORIZE sep_H_update_unpacked_no_SSE(TileBands * tile_bands, const BandsThreadingInfo * threading_info)
{
    const float BETA = 0.25f;

    const size_t band_size_x = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    const size_t row_skip = band_stride_y - band_size_x;
    const size_t first_row_offset = row_begin * band_stride_y;

    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    size_t row = row_begin;
    while(row < row_end){
        // first column of the row
        sep_H_update_unpacked_no_SSE_symmetric_extension_kernel(ll, hl, lh, hh, BETA);
        ll += 1; hl += 1; lh += 1; hh += 1;

        // remaining columns
        for(size_t cols_left = band_size_x; cols_left > 1; --cols_left){
            sep_H_update_unpacked_no_SSE_kernel(ll, hl, lh, hh, BETA);
            ll += 1; hl += 1; lh += 1; hh += 1;
        }

        ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;
        ++row;
    }

    #pragma omp barrier
}

/*
 * Scalar vertical update pass over the unpacked bands.
 *
 * Must be called from inside an OpenMP parallel region.  The calling thread
 * handles band rows [band_start_y[tid], band_end_y[tid]).  The regular
 * kernel receives the current LL/HL/LH/HH positions plus the LH/HH positions
 * one row further up.  Thread 0 has no row above its first row, so that row
 * is processed by the symmetric-extension kernel; all other threads read the
 * row at band_start_y - 1 from the bands.  Ends with a barrier.
 */
void NO_TREE_VECTORIZE sep_V_update_unpacked_no_SSE(TileBands * tile_bands, const BandsThreadingInfo * threading_info)
{
    const float BETA = 0.25f;

    const size_t band_size_x = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    const size_t row_skip = band_stride_y - band_size_x;
    const size_t first_row_offset = row_begin * band_stride_y;

    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    // LH/HH one row above the row being updated
    float * lh_above;
    float * hh_above;

    size_t row = row_begin;

    if(thread_id == 0){
        // first row of thread 0: no row above
        for(size_t col = 0; col < band_size_x; ++col){
            sep_V_update_unpacked_no_SSE_symmetric_extension_kernel(ll, hl, lh, hh, BETA);
            ll += 1; hl += 1; lh += 1; hh += 1;
        }

        ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;

        // the row just processed is the "above" row of the next one
        lh_above = tile_bands->LH + first_row_offset;
        hh_above = tile_bands->HH + first_row_offset;
        ++row;
    }else{
        lh_above = tile_bands->LH + (row_begin-1) * band_stride_y;
        hh_above = tile_bands->HH + (row_begin-1) * band_stride_y;
    }

    for(; row < row_end; ++row){
        for(size_t col = 0; col < band_size_x; ++col){
            sep_V_update_unpacked_no_SSE_kernel(ll, hl, lh, hh, lh_above, hh_above, BETA);

            ll += 1; hl += 1; lh += 1; hh += 1;
            lh_above += 1; hh_above += 1;
        }

        ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;
        lh_above += row_skip; hh_above += row_skip;
    }

    #pragma omp barrier
}
#elif NO_BARRIER
/*
 * SSE transform driver, barrier-free variant.
 *
 * Single socket (num_sockets <= 1): only element 0 of img, bands, tmp_mem0,
 * tmp_mem1 and threading_info is used.  Several sockets: an outer parallel
 * region with num_sockets threads (proc_bind(spread)) is opened and outer
 * thread s works on element s of each array.
 *
 * In both cases a chunk of tile indices is obtained from init_chunk() and an
 * inner parallel region (proc_bind(close)) of threading_info->num_threads
 * threads is started.  Every inner thread walks all tile indices of the
 * chunk and calls the four *_no_barrier passes in order; the predict passes
 * get both temporary buffers, the update passes only tmp_mem0.
 *
 * NOTE(review): the inner team size is read from threading_info[0] on the
 * multi-socket path as well -- confirm that all sockets use the same count.
 */
void sep_no_barrier(const Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info)
{
    if(threading_info->num_sockets <= 1){
        // single-socket path: everything comes from element 0
        const size_t socket_id = 0;
        const Image * const socket_img = &img[socket_id];
        Bands * const socket_bands = &bands[socket_id];
        TmpMem * const socket_tmp0 = &tmp_mem0[socket_id];
        TmpMem * const socket_tmp1 = &tmp_mem1[socket_id];
        const BandsThreadingInfo * const socket_info = &threading_info[socket_id];

        Chunk work;
        init_chunk(&work, socket_img, socket_info, socket_id);

        #pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
        {
            for(size_t t = work.start_index; t < work.end_index; ++t){
                // linear tile index -> (row, column) of the tile grid
                const size_t tile_row = t / socket_img->tiles_per_width;
                const size_t tile_col = t % socket_img->tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(socket_img, &tile, tile_col, tile_row);
                get_tile_bands(socket_bands, &tile_bands, tile_col, tile_row);

                sep_H_predict_with_unpack_no_barrier(&tile, &tile_bands, socket_info, socket_tmp0, socket_tmp1);
                sep_V_predict_unpacked_no_barrier(&tile_bands, socket_info, socket_tmp0, socket_tmp1);
                sep_H_update_unpacked_no_barrier(&tile_bands, socket_info, socket_tmp0);
                sep_V_update_unpacked_no_barrier(&tile_bands, socket_info, socket_tmp0);
            }
        }
        return;
    }

    // multi-socket path: one outer thread per socket
    #pragma omp parallel proc_bind(spread) num_threads(threading_info->num_sockets)
    {
        const size_t socket_id = (size_t)omp_get_thread_num();
        const Image * const socket_img = &img[socket_id];
        Bands * const socket_bands = &bands[socket_id];
        TmpMem * const socket_tmp0 = &tmp_mem0[socket_id];
        TmpMem * const socket_tmp1 = &tmp_mem1[socket_id];
        const BandsThreadingInfo * const socket_info = &threading_info[socket_id];

        Chunk work;
        init_chunk(&work, socket_img, socket_info, socket_id);

        #pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
        {
            for(size_t t = work.start_index; t < work.end_index; ++t){
                // linear tile index -> (row, column) of the tile grid
                const size_t tile_row = t / socket_img->tiles_per_width;
                const size_t tile_col = t % socket_img->tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(socket_img, &tile, tile_col, tile_row);
                get_tile_bands(socket_bands, &tile_bands, tile_col, tile_row);

                sep_H_predict_with_unpack_no_barrier(&tile, &tile_bands, socket_info, socket_tmp0, socket_tmp1);
                sep_V_predict_unpacked_no_barrier(&tile_bands, socket_info, socket_tmp0, socket_tmp1);
                sep_H_update_unpacked_no_barrier(&tile_bands, socket_info, socket_tmp0);
                sep_V_update_unpacked_no_barrier(&tile_bands, socket_info, socket_tmp0);
            }
        }
    }
}

/*
 * SSE horizontal predict pass with unpacking, barrier-free variant.
 *
 * Must be called from inside an OpenMP parallel region.  For the calling
 * thread's band rows [band_start_y[tid], band_end_y[tid]) it does three
 * things:
 *   1. Each pair of tile rows is run through
 *      sep_H_predict_with_unpack_kernel (8 tile floats / 4 band floats per
 *      step) and finished with the symmetric-extension kernel; the results
 *      go to the LL/HL/LH/HH bands.
 *   2. Threads other than 0 repeat the same computation for the tile row
 *      pair just above their range and store it in their slot of tmp0
 *      (offset tid * tmp0->band_size_x).
 *   3. Threads other than the last one process the single tile row just
 *      below their range inline and store its LL and HL values in their
 *      slot of tmp1 (offset tid * tmp1->band_size_x).
 * Steps 2 and 3 recompute rows that belong to neighbouring threads,
 * presumably so that the following passes need no barrier.
 *
 * All loads and stores are aligned (_mm_load_ps / _mm_store_ps).
 */
void sep_H_predict_with_unpack_no_barrier(const Tile * tile, TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0, TmpMem * tmp1)
{
    const __m128 ALPHA = _mm_set_ps1(-0.5f);

    // tile geometry
    float * mem = tile->data;
    const size_t size_x = tile->size_x;
    const size_t stride_y = tile->stride_y;

    // band geometry
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    // tile rows covered by this thread: [img_row_begin, img_row_end)
    const size_t img_row_begin = 2 * row_begin;
    const size_t img_row_end = 2 * row_end;

    // The symmetric-extension step does not advance the pointers, so the
    // end-of-row skips include that step (+4 band floats, +8 tile floats).
    const size_t band_row_skip = band_stride_y - tile_bands->size_x + 4;
    const size_t tile_row_skip = 2 * stride_y - size_x + 8;

    float * m0 = mem + img_row_begin * stride_y;
    float * m1 = m0 + stride_y;

    const size_t first_row_offset = row_begin * band_stride_y;
    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    __m128 R0C0, R0C1, R0C2, R1C0, R1C1, R1C2;

    /* ---- 1. rows owned by this thread ---- */
    for(size_t row = row_begin; row < row_end; ++row){
        // first vector of each tile row; later ones are carried over
        R0C0 = _mm_load_ps(m0);
        R1C0 = _mm_load_ps(m1);

        for(size_t col = 0; col < size_x-8; col += 8){
            R0C1 = _mm_load_ps(m0+4); R0C2 = _mm_load_ps(m0+8);
            R1C1 = _mm_load_ps(m1+4); R1C2 = _mm_load_ps(m1+8);

            sep_H_predict_with_unpack_kernel(R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, ll, hl, lh, hh, ALPHA);

            R0C0 = R0C2; R1C0 = R1C2;

            m0 += 8; m1 += 8;
            ll += 4; hl += 4; lh += 4; hh += 4;
        }

        // last 8 tile columns of the row
        R0C1 = _mm_load_ps(m0+4); R1C1 = _mm_load_ps(m1+4);

        sep_H_predict_with_unpack_symmetric_extension_kernel(R0C0, R0C1, R1C0, R1C1, ll, hl, lh, hh, ALPHA);

        m0 += tile_row_skip; m1 += tile_row_skip;
        ll += band_row_skip; hl += band_row_skip; lh += band_row_skip; hh += band_row_skip;
    }

    /* ---- 2. redundant computation: row pair above the range -> tmp0 ---- */
    if(thread_id != 0){
        const size_t tmp0_offset = thread_id * tmp0->band_size_x;
        ll = tmp0->LL + tmp0_offset;
        hl = tmp0->HL + tmp0_offset;
        lh = tmp0->LH + tmp0_offset;
        hh = tmp0->HH + tmp0_offset;

        m0 = mem + (img_row_begin - 2) * stride_y;
        m1 = m0 + stride_y;

        R0C0 = _mm_load_ps(m0);
        R1C0 = _mm_load_ps(m1);

        for(size_t col = 0; col < size_x-8; col += 8){
            R0C1 = _mm_load_ps(m0+4); R0C2 = _mm_load_ps(m0+8);
            R1C1 = _mm_load_ps(m1+4); R1C2 = _mm_load_ps(m1+8);

            sep_H_predict_with_unpack_kernel(R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, ll, hl, lh, hh, ALPHA);

            R0C0 = R0C2; R1C0 = R1C2;

            m0 += 8; m1 += 8;
            ll += 4; hl += 4; lh += 4; hh += 4;
        }

        R0C1 = _mm_load_ps(m0+4); R1C1 = _mm_load_ps(m1+4);

        sep_H_predict_with_unpack_symmetric_extension_kernel(R0C0, R0C1, R1C0, R1C1, ll, hl, lh, hh, ALPHA);
    }

    /* ---- 3. redundant computation: tile row below the range -> tmp1 ---- */
    if(thread_id == team_size - 1){
        return; // the last thread has no row below
    }

    ll = tmp1->LL + thread_id * tmp1->band_size_x;
    hl = tmp1->HL + thread_id * tmp1->band_size_x;
    m0 = mem + img_row_end * stride_y;

    __m128 even, odd, next_even, pair_sum;

    for(size_t col = 0; col < size_x-8; col += 8){
        R0C0 = _mm_load_ps(m0);
        R0C1 = _mm_load_ps(m0+4);
        R0C2 = _mm_load_ps(m0+8);

        // de-interleave 8 floats into even-indexed and odd-indexed samples
        even = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(2, 0, 2, 0));
        odd = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(3, 1, 3, 1));

        // even samples moved down one lane; top lane filled with the first
        // float of the following vector
        next_even = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(even), 4));
        next_even = _mm_insert_ps(next_even, R0C2, 0x30);

        // odd += ALPHA * (even + next_even)
        pair_sum = _mm_add_ps(even, next_even);
#ifndef NO_FMA
        odd = _mm_fmadd_ps(pair_sum, ALPHA, odd);
#else
        pair_sum = _mm_mul_ps(pair_sum, ALPHA);
        odd = _mm_add_ps(pair_sum, odd);
#endif

        _mm_store_ps(hl, odd);
        _mm_store_ps(ll, even);

        ll += 4; hl += 4;
        m0 += 8;
    }

    // last 8 tile columns: there is no following vector
    R0C0 = _mm_load_ps(m0);
    R0C1 = _mm_load_ps(m0+4);

    even = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(2, 0, 2, 0));
    odd = _mm_shuffle_ps(R0C0, R0C1, _MM_SHUFFLE(3, 1, 3, 1));

    // top lane is filled with a copy of lane 2 of the shifted vector,
    // i.e. the last even sample is used as its own right neighbour
    next_even = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(even), 4));
    next_even = _mm_insert_ps(next_even, next_even, 0xB0);

    pair_sum = _mm_add_ps(even, next_even);
#ifndef NO_FMA
    odd = _mm_fmadd_ps(pair_sum, ALPHA, odd);
#else
    pair_sum = _mm_mul_ps(pair_sum, ALPHA);
    odd = _mm_add_ps(pair_sum, odd);
#endif

    _mm_store_ps(hl, odd);
    _mm_store_ps(ll, even);
}

/*
 * SSE vertical predict pass over the unpacked bands, barrier-free variant.
 *
 * Must be called from inside an OpenMP parallel region.  For the calling
 * thread's band rows [band_start_y[tid], band_end_y[tid]):
 *   - all rows except the last are processed four columns at a time with
 *     sep_V_predict_unpacked_kernel, which receives the current
 *     LL/HL/LH/HH positions plus the LL/HL positions one row further down;
 *   - the last row takes its "row below" from this thread's slot of tmp1,
 *     or, on the last thread, is processed by the symmetric-extension
 *     kernel, which takes no row below;
 *   - threads other than 0 additionally run the regular kernel on their
 *     slot of tmp0, with the thread's own first band row acting as the row
 *     below.
 */
void sep_V_predict_unpacked_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0, TmpMem * tmp1)
{
    const __m128 ALPHA = _mm_set1_ps(-0.5f);

    const size_t band_size_x = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    const size_t row_skip = band_stride_y - band_size_x;
    const size_t first_row_offset = row_begin * band_stride_y;

    // start of this thread's first row in LL and HL (needed again at the end)
    float * const ll_first = tile_bands->LL + first_row_offset;
    float * const hl_first = tile_bands->HL + first_row_offset;

    float * ll = ll_first;
    float * hl = hl_first;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    // same columns, one row further down
    float * ll_below = ll_first + band_stride_y;
    float * hl_below = hl_first + band_stride_y;

    // every row but the last one
    for(size_t row = row_begin; row < row_end-1; ++row){
        for(size_t col = 0; col < band_size_x; col += 4){
            sep_V_predict_unpacked_kernel(ll, hl, lh, hh, ll_below, hl_below, ALPHA);

            ll += 4; hl += 4; lh += 4; hh += 4;
            ll_below += 4; hl_below += 4;
        }

        ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;
        ll_below += row_skip; hl_below += row_skip;
    }

    // last row
    if(thread_id != team_size - 1){
        // the row below comes from this thread's slot of tmp1
        ll_below = tmp1->LL + thread_id * tmp1->band_size_x;
        hl_below = tmp1->HL + thread_id * tmp1->band_size_x;

        for(size_t col = 0; col < band_size_x; col += 4){
            sep_V_predict_unpacked_kernel(ll, hl, lh, hh, ll_below, hl_below, ALPHA);

            ll += 4; hl += 4; lh += 4; hh += 4;
            ll_below += 4; hl_below += 4;
        }
    } else {
        for(size_t col = 0; col < band_size_x; col += 4){
            sep_V_predict_unpacked_symmetric_extension_kernel(ll, hl, lh, hh, ALPHA);

            ll += 4; hl += 4; lh += 4; hh += 4;
        }
    }

    // redundant computation: thread 0 has no tmp0 row to process
    if(thread_id == 0){
        return;
    }

    const size_t tmp0_offset = thread_id * tmp0->band_size_x;
    ll = tmp0->LL + tmp0_offset;
    hl = tmp0->HL + tmp0_offset;
    lh = tmp0->LH + tmp0_offset;
    hh = tmp0->HH + tmp0_offset;

    // the thread's own first band row acts as the row below
    ll_below = ll_first;
    hl_below = hl_first;

    for(size_t col = 0; col < band_size_x; col += 4){
        sep_V_predict_unpacked_kernel(ll, hl, lh, hh, ll_below, hl_below, ALPHA);

        ll += 4; hl += 4; lh += 4; hh += 4;
        ll_below += 4; hl_below += 4;
    }
}

/*
 * SSE horizontal update pass over the unpacked bands, barrier-free variant.
 *
 * Must be called from inside an OpenMP parallel region.  For the calling
 * thread's band rows [band_start_y[tid], band_end_y[tid]) each row is
 * processed four columns at a time: the HL and HH vectors of the current
 * block are loaded (aligned) and passed to the kernel together with the
 * vectors of the previous block and the LL/LH positions.  The first block of
 * a row has no previous block and uses the symmetric-extension kernel.
 *
 * Threads other than 0 then apply the same row procedure to their slot of
 * tmp0 (offset tid * tmp0->band_size_x).
 */
void sep_H_update_unpacked_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0)
{
    const __m128 BETA = _mm_set1_ps(0.25f);

    const size_t band_size_x = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    const size_t row_skip = band_stride_y - band_size_x;
    const size_t first_row_offset = row_begin * band_stride_y;

    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    // HL/HH vectors of the current and of the previous 4-column block
    __m128 hl_cur, hh_cur, hl_prev, hh_prev;

    for(size_t row = row_begin; row < row_end; ++row){
        // first 4 columns
        hl_cur = _mm_load_ps(hl);
        hh_cur = _mm_load_ps(hh);

        sep_H_update_unpacked_symmetric_extension_kernel(hl_cur, hh_cur, ll, lh, BETA);

        hl_prev = hl_cur;
        hh_prev = hh_cur;

        ll += 4; hl += 4; lh += 4; hh += 4;

        // remaining blocks of the row
        for(size_t col = 4; col < band_size_x; col += 4){
            hl_cur = _mm_load_ps(hl);
            hh_cur = _mm_load_ps(hh);

            sep_H_update_unpacked_kernel(hl_cur, hh_cur, hl_prev, hh_prev, ll, lh, BETA);

            hl_prev = hl_cur;
            hh_prev = hh_cur;

            ll += 4; hl += 4; lh += 4; hh += 4;
        }

        ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;
    }

    // redundant computation: thread 0 has no tmp0 row to process
    if(thread_id == 0){
        return;
    }

    const size_t tmp0_offset = thread_id * tmp0->band_size_x;
    ll = tmp0->LL + tmp0_offset;
    hl = tmp0->HL + tmp0_offset;
    lh = tmp0->LH + tmp0_offset;
    hh = tmp0->HH + tmp0_offset;

    // first 4 columns
    hl_cur = _mm_load_ps(hl);
    hh_cur = _mm_load_ps(hh);

    sep_H_update_unpacked_symmetric_extension_kernel(hl_cur, hh_cur, ll, lh, BETA);

    hl_prev = hl_cur;
    hh_prev = hh_cur;

    ll += 4; hl += 4; lh += 4; hh += 4;

    // remaining blocks of the row
    for(size_t col = 4; col < band_size_x; col += 4){
        hl_cur = _mm_load_ps(hl);
        hh_cur = _mm_load_ps(hh);

        sep_H_update_unpacked_kernel(hl_cur, hh_cur, hl_prev, hh_prev, ll, lh, BETA);

        hl_prev = hl_cur;
        hh_prev = hh_cur;

        ll += 4; hl += 4; lh += 4; hh += 4;
    }
}

/*
 * SSE vertical update pass over the unpacked bands, barrier-free variant.
 *
 * Must be called from inside an OpenMP parallel region.  For the calling
 * thread's band rows [band_start_y[tid], band_end_y[tid]) the regular kernel
 * receives the current LL/HL/LH/HH positions plus the LH/HH positions of the
 * row above, four columns at a time.  The first row is special:
 *   - thread 0 uses the symmetric-extension kernel, which takes no row above;
 *   - every other thread takes the row above from its slot of tmp0
 *     (offset tid * tmp0->band_size_x) instead of from the bands.
 * From the second row on, the row above is read from the LH/HH bands.
 */
void sep_V_update_unpacked_no_barrier(TileBands * tile_bands, const BandsThreadingInfo * threading_info, TmpMem * tmp0)
{
    const __m128 BETA = _mm_set1_ps(0.25f);

    const size_t band_size_x = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    const size_t row_skip = band_stride_y - band_size_x;
    const size_t first_row_offset = row_begin * band_stride_y;

    // start of this thread's first row in LH and HH
    float * const lh_first = tile_bands->LH + first_row_offset;
    float * const hh_first = tile_bands->HH + first_row_offset;

    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = lh_first;
    float * hh = hh_first;

    // LH/HH one row above the row being updated
    float * lh_above;
    float * hh_above;

    // first row
    if(thread_id == 0){
        for(size_t col = 0; col < band_size_x; col += 4){
            sep_V_update_unpacked_symmetric_extension_kernel(ll, hl, lh, hh, BETA);
            ll += 4; hl += 4; lh += 4; hh += 4;
        }
    }else{
        lh_above = tmp0->LH + thread_id * tmp0->band_size_x;
        hh_above = tmp0->HH + thread_id * tmp0->band_size_x;

        for(size_t col = 0; col < band_size_x; col += 4){
            sep_V_update_unpacked_kernel(ll, hl, lh, hh, lh_above, hh_above, BETA);
            ll += 4; hl += 4; lh += 4; hh += 4;
            lh_above += 4; hh_above += 4;
        }
    }

    ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;

    // from here on the row above lives in the bands themselves
    lh_above = lh_first;
    hh_above = hh_first;

    for(size_t row = row_begin + 1; row < row_end; ++row){
        for(size_t col = 0; col < band_size_x; col += 4){
            sep_V_update_unpacked_kernel(ll, hl, lh, hh, lh_above, hh_above, BETA);

            ll += 4; hl += 4; lh += 4; hh += 4;
            lh_above += 4; hh_above += 4;
        }

        ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;
        lh_above += row_skip; hh_above += row_skip;
    }
}
#else
/*
 * SSE transform driver, barrier-synchronised variant.
 *
 * Single socket (num_sockets <= 1): only element 0 of img, bands and
 * threading_info is used.  Several sockets: an outer parallel region with
 * num_sockets threads (proc_bind(spread)) is opened and outer thread s works
 * on img[s], bands[s] and threading_info[s].
 *
 * In both cases a chunk of tile indices is obtained from init_chunk() and an
 * inner parallel region (proc_bind(close)) of threading_info->num_threads
 * threads is started.  Every inner thread walks all tile indices of the
 * chunk and calls the four passes in order (H predict with unpack,
 * V predict, H update, V update); the passes pick their rows by OpenMP
 * thread number.
 *
 * NOTE(review): the inner team size is read from threading_info[0] on the
 * multi-socket path as well -- confirm that all sockets use the same count.
 */
void sep(const Image * img, Bands * bands, const BandsThreadingInfo * threading_info)
{
    if(threading_info->num_sockets <= 1){
        // single-socket path: everything comes from element 0
        const size_t socket_id = 0;
        const Image * const socket_img = &img[socket_id];
        Bands * const socket_bands = &bands[socket_id];
        const BandsThreadingInfo * const socket_info = &threading_info[socket_id];

        Chunk work;
        init_chunk(&work, socket_img, socket_info, socket_id);

        #pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
        {
            for(size_t t = work.start_index; t < work.end_index; ++t){
                // linear tile index -> (row, column) of the tile grid
                const size_t tile_row = t / socket_img->tiles_per_width;
                const size_t tile_col = t % socket_img->tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(socket_img, &tile, tile_col, tile_row);
                get_tile_bands(socket_bands, &tile_bands, tile_col, tile_row);

                sep_H_predict_with_unpack(&tile, &tile_bands, socket_info);
                sep_V_predict_unpacked(&tile_bands, socket_info);
                sep_H_update_unpacked(&tile_bands, socket_info);
                sep_V_update_unpacked(&tile_bands, socket_info);
            }
        }
        return;
    }

    // multi-socket path: one outer thread per socket
    #pragma omp parallel proc_bind(spread) num_threads(threading_info->num_sockets)
    {
        const size_t socket_id = (size_t)omp_get_thread_num();
        const Image * const socket_img = &img[socket_id];
        Bands * const socket_bands = &bands[socket_id];
        const BandsThreadingInfo * const socket_info = &threading_info[socket_id];

        Chunk work;
        init_chunk(&work, socket_img, socket_info, socket_id);

        #pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
        {
            for(size_t t = work.start_index; t < work.end_index; ++t){
                // linear tile index -> (row, column) of the tile grid
                const size_t tile_row = t / socket_img->tiles_per_width;
                const size_t tile_col = t % socket_img->tiles_per_width;

                Tile tile;
                TileBands tile_bands;

                get_tile(socket_img, &tile, tile_col, tile_row);
                get_tile_bands(socket_bands, &tile_bands, tile_col, tile_row);

                sep_H_predict_with_unpack(&tile, &tile_bands, socket_info);
                sep_V_predict_unpacked(&tile_bands, socket_info);
                sep_H_update_unpacked(&tile_bands, socket_info);
                sep_V_update_unpacked(&tile_bands, socket_info);
            }
        }
    }
}

/*
 * SSE horizontal predict pass that reads the tile and writes the four bands
 * (LL, HL, LH, HH).
 *
 * Must be called from inside an OpenMP parallel region: the calling thread
 * handles band rows [band_start_y[tid], band_end_y[tid]) taken from
 * threading_info, i.e. tile rows 2*start .. 2*end.  Two tile rows (m0, m1)
 * are consumed per band row.  Each kernel call gets three consecutive
 * 4-float vectors of both tile rows, advances the tile pointers by 8 floats
 * and the band pointers by 4; the third vector is carried over as the first
 * one of the next step.  The last 8 tile columns of a row are handled by the
 * symmetric-extension kernel, which takes only two vectors per tile row.
 * All loads are aligned (_mm_load_ps).  Ends with an OpenMP barrier.
 */
void sep_H_predict_with_unpack(const Tile * tile, TileBands * tile_bands, const BandsThreadingInfo * threading_info)
{
    const __m128 ALPHA = _mm_set_ps1(-0.5f);

    // tile geometry
    float * mem = tile->data;
    const size_t size_x = tile->size_x;
    const size_t stride_y = tile->stride_y;

    // band geometry
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    // The symmetric-extension step does not advance the pointers, so the
    // end-of-row skips include that step (+4 band floats, +8 tile floats).
    const size_t band_row_skip = band_stride_y - tile_bands->size_x + 4;
    const size_t tile_row_skip = 2 * stride_y - size_x + 8;

    // two consecutive tile rows feed one band row
    float * m0 = mem + (2 * row_begin) * stride_y;
    float * m1 = m0 + stride_y;

    const size_t first_row_offset = row_begin * band_stride_y;
    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    for(size_t row = row_begin; row < row_end; ++row){
        // first vector of each tile row; later ones are carried over
        __m128 R0C0 = _mm_load_ps(m0);
        __m128 R1C0 = _mm_load_ps(m1);
        __m128 R0C1, R1C1;

        for(size_t col = 0; col < size_x-8; col += 8){
            R0C1 = _mm_load_ps(m0+4);
            __m128 R0C2 = _mm_load_ps(m0+8);
            R1C1 = _mm_load_ps(m1+4);
            __m128 R1C2 = _mm_load_ps(m1+8);

            sep_H_predict_with_unpack_kernel(R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, ll, hl, lh, hh, ALPHA);

            R0C0 = R0C2;
            R1C0 = R1C2;

            m0 += 8; m1 += 8;
            ll += 4; hl += 4; lh += 4; hh += 4;
        }

        // last 8 tile columns of the row
        R0C1 = _mm_load_ps(m0+4);
        R1C1 = _mm_load_ps(m1+4);

        sep_H_predict_with_unpack_symmetric_extension_kernel(R0C0, R0C1, R1C0, R1C1, ll, hl, lh, hh, ALPHA);

        m0 += tile_row_skip; m1 += tile_row_skip;
        ll += band_row_skip; hl += band_row_skip; lh += band_row_skip; hh += band_row_skip;
    }

    #pragma omp barrier
}

/*
 * SSE vertical predict pass over the already unpacked bands.
 *
 * Must be called from inside an OpenMP parallel region.  The calling thread
 * handles band rows [band_start_y[tid], band_end_y[tid]), four columns per
 * kernel call.  The regular kernel receives the current LL/HL/LH/HH
 * positions plus the LL/HL positions one row further down.  The last thread
 * of the team runs the regular kernel only up to its second-to-last row and
 * finishes its final row with the symmetric-extension kernel, which takes no
 * row below.  All other threads run the regular kernel on their whole range,
 * so their last row reads LL/HL from the row at band_end_y.  Ends with a
 * barrier.
 */
void sep_V_predict_unpacked(TileBands * tile_bands, const BandsThreadingInfo * threading_info)
{
    const __m128 ALPHA = _mm_set1_ps(-0.5f);

    const size_t band_size_x = tile_bands->size_x;
    const size_t band_size_y = tile_bands->size_y;
    const size_t band_stride_y = tile_bands->stride_y;

    const size_t team_size = (size_t)omp_get_num_threads();
    assert(team_size <= band_size_y);
    const size_t thread_id = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[thread_id];
    const size_t row_end = threading_info->band_end_y[thread_id];

    // only the last thread treats its final row specially
    const int is_last_thread = (thread_id == team_size - 1);
    const size_t regular_end = is_last_thread ? row_end - 1 : row_end;

    const size_t row_skip = band_stride_y - band_size_x;
    const size_t first_row_offset = row_begin * band_stride_y;

    float * ll = tile_bands->LL + first_row_offset;
    float * hl = tile_bands->HL + first_row_offset;
    float * lh = tile_bands->LH + first_row_offset;
    float * hh = tile_bands->HH + first_row_offset;

    // same columns, one row further down
    float * ll_below = ll + band_stride_y;
    float * hl_below = hl + band_stride_y;

    for(size_t row = row_begin; row < regular_end; ++row){
        for(size_t col = 0; col < band_size_x; col += 4){
            sep_V_predict_unpacked_kernel(ll, hl, lh, hh, ll_below, hl_below, ALPHA);

            ll += 4; hl += 4; lh += 4; hh += 4;
            ll_below += 4; hl_below += 4;
        }

        ll += row_skip; hl += row_skip; lh += row_skip; hh += row_skip;
        ll_below += row_skip; hl_below += row_skip;
    }

    if(is_last_thread){
        // final row of the last thread
        for(size_t col = 0; col < band_size_x; col += 4){
            sep_V_predict_unpacked_symmetric_extension_kernel(ll, hl, lh, hh, ALPHA);

            ll += 4; hl += 4; lh += 4; hh += 4;
        }
    }

    #pragma omp barrier
}

/*
 * Horizontal update step over the band rows assigned to the calling thread.
 *
 * The row range [band_start_y[tid], band_end_y[tid]) is looked up with the
 * caller's OpenMP thread number and the function ends with an orphaned
 * OpenMP barrier, so it is meant to be called by every thread of a parallel
 * region.
 *
 * Each row is walked four floats at a time. The first group of HL/HH values
 * goes through the symmetric-extension kernel; every following group goes
 * through the regular kernel together with the group loaded just before it.
 * Both kernels receive the LL/LH pointers of the current group and BETA.
 */
void sep_H_update_unpacked(TileBands * tile_bands, const BandsThreadingInfo * threading_info)
{
    const __m128 BETA = _mm_set1_ps(0.25f);

    float * const LL = tile_bands->LL;
    float * const HL = tile_bands->HL;
    float * const LH = tile_bands->LH;
    float * const HH = tile_bands->HH;

    const size_t width = tile_bands->size_x;
    const size_t height = tile_bands->size_y;
    const size_t stride = tile_bands->stride_y;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= height);
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[tid];
    const size_t row_end = threading_info->band_end_y[tid];

    // floats to skip after the last processed group to reach the next row
    const size_t row_gap = stride - width;

    // one running offset shared by all four bands (they advance in lock-step)
    size_t off = row_begin * stride;

    for(size_t row = row_begin; row < row_end; ++row){
        __m128 cur_hl, cur_hh, prev_hl, prev_hh;
        float * ll = LL + off;
        float * lh = LH + off;

        // first 4 columns: no left neighbour available
        cur_hl = _mm_load_ps(HL + off);
        cur_hh = _mm_load_ps(HH + off);

        sep_H_update_unpacked_symmetric_extension_kernel(cur_hl, cur_hh, ll, lh, BETA);

        prev_hl = cur_hl;
        prev_hh = cur_hh;
        off += 4;

        // remaining columns: current group plus the previously loaded group
        for(size_t col = 4; col < width; col += 4){
            ll = LL + off;
            lh = LH + off;

            cur_hl = _mm_load_ps(HL + off);
            cur_hh = _mm_load_ps(HH + off);

            sep_H_update_unpacked_kernel(cur_hl, cur_hh, prev_hl, prev_hh, ll, lh, BETA);

            prev_hl = cur_hl;
            prev_hh = cur_hh;
            off += 4;
        }

        off += row_gap;
    }
    #pragma omp barrier
}

/*
 * Vertical update step over the band rows assigned to the calling thread.
 *
 * The row range [band_start_y[tid], band_end_y[tid]) is looked up with the
 * caller's OpenMP thread number and the function ends with an orphaned
 * OpenMP barrier, so it is meant to be called by every thread of a parallel
 * region.
 *
 * Thread 0 handles its first row with the symmetric-extension kernel, which
 * takes no previous-row pointers. Every other row (and every row of the
 * other threads) goes through the regular kernel, which additionally gets
 * the LH/HH pointers of the row directly above.
 */
void sep_V_update_unpacked(TileBands * tile_bands, const BandsThreadingInfo * threading_info)
{
    const __m128 BETA = _mm_set1_ps(0.25f);

    float * const LL = tile_bands->LL;
    float * const HL = tile_bands->HL;
    float * const LH = tile_bands->LH;
    float * const HH = tile_bands->HH;

    const size_t width = tile_bands->size_x;
    const size_t height = tile_bands->size_y;
    const size_t stride = tile_bands->stride_y;

    const size_t num_threads = (size_t)omp_get_num_threads();
    assert(num_threads <= height);
    const size_t tid = (size_t)omp_get_thread_num();

    const size_t row_begin = threading_info->band_start_y[tid];
    const size_t row_end = threading_info->band_end_y[tid];

    // floats to skip after the last processed group to reach the next row
    const size_t row_gap = stride - width;

    size_t cur = row_begin * stride;   // offset of the row being updated
    size_t prev;                       // offset of the row above it
    size_t row = row_begin;

    if(tid == 0){
        // first row: symmetric extension instead of a previous row
        for(size_t col = 0; col < width; col += 4){
            float * ll = LL + cur;
            float * hl = HL + cur;
            float * lh = LH + cur;
            float * hh = HH + cur;

            sep_V_update_unpacked_symmetric_extension_kernel(ll, hl, lh, hh, BETA);

            cur += 4;
        }
        cur += row_gap;

        prev = row_begin * stride;
        ++row;
    }else{
        prev = (row_begin - 1) * stride;
    }

    // rows that have a predecessor
    for(; row < row_end; ++row){
        for(size_t col = 0; col < width; col += 4){
            float * ll = LL + cur;
            float * hl = HL + cur;
            float * lh = LH + cur;
            float * hh = HH + cur;
            float * lh1 = LH + prev;
            float * hh1 = HH + prev;

            sep_V_update_unpacked_kernel(ll, hl, lh, hh, lh1, hh1, BETA);

            cur += 4;
            prev += 4;
        }
        cur += row_gap;
        prev += row_gap;
    }

    #pragma omp barrier
}
#endif

/* Returns nonzero when actual and expected differ by less than eps in either
 * direction; any NaN operand yields zero. */
static inline int test_sep_within_eps(float actual, float expected, float eps)
{
    const float diff = actual - expected;
    return diff < eps && -diff < eps;
}

/*
 * Runs the separable transform on img and compares the produced bands with
 * the reference image stored in filename.
 *
 * The reference is read tile by tile: on even tile rows the even/odd columns
 * are compared against LL/HL, on odd tile rows against LH/HH, each at band
 * position (row/2, column/2).
 *
 * If filename cannot be opened the function returns without testing.
 * Mismatches are reported through assert(), so nothing is checked when the
 * file is compiled with NDEBUG.
 */
#ifdef NO_BARRIER
void test_sep(const char * filename, Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info)
#else
void test_sep(const char * filename, Image * img, Bands * bands, const BandsThreadingInfo * threading_info)
#endif
{
    // Probe for the reference file before allocating anything.
    FILE * f = fopen(filename, "r");
    if(!f)
        return;
    fclose(f);

    Image test;
    allocate_image(&test, img->size_x, img->size_y);
    set_tile_size(&test, img->tile_size_x, img->tile_size_y);

    load_image(filename, &test);
#if defined NO_SSE && defined NO_BARRIER
    sep_no_SSE_no_barrier(img, bands, tmp_mem0, tmp_mem1, threading_info);
#elif defined NO_SSE
    sep_no_SSE(img, bands, threading_info);
#elif defined NO_BARRIER
    sep_no_barrier(img, bands, tmp_mem0, tmp_mem1, threading_info);
#else
    sep(img, bands, threading_info);
#endif

    const float EPS = 1e-5f;

    // chunk = number of consecutive tiles mapped to one Bands entry
    // (ceil(num_tiles / num_sockets)).
    size_t num_tiles = img->tiles_per_height * img->tiles_per_width;
    size_t chunk = (num_tiles + threading_info->num_sockets - 1) / threading_info->num_sockets;

    for (size_t y = 0; y < img->tiles_per_height; ++y) {
        for (size_t x = 0; x < img->tiles_per_width; ++x) {
            Tile tile;
            TileBands tile_bands;

            get_tile(&test, &tile, x, y);
            size_t idx = (y * img->tiles_per_width + x) / chunk;
            get_tile_bands(&bands[idx], &tile_bands, x, y);

            for (size_t i = 0; i < tile.size_y; ++i) {
                // Use the stride of the tile bands actually being read; the
                // transform steps index these pointers with the same field.
                const size_t band_row = (i >> 1) * tile_bands.stride_y;

                for (size_t j = 0; j < tile.size_x; j += 2) {
                    const float n1 = *(tile.data + i * tile.stride_y + j);
                    const float n2 = *(tile.data + i * tile.stride_y + j + 1);
                    const size_t band_idx = band_row + (j >> 1);

                    // Two-sided comparison: a one-sided "a - b < EPS" would
                    // accept any band value smaller than the reference.
                    if (i % 2 == 0) {
                        assert(test_sep_within_eps(*(tile_bands.LL + band_idx), n1, EPS));
                        assert(test_sep_within_eps(*(tile_bands.HL + band_idx), n2, EPS));
                    } else {
                        assert(test_sep_within_eps(*(tile_bands.LH + band_idx), n1, EPS));
                        assert(test_sep_within_eps(*(tile_bands.HH + band_idx), n2, EPS));
                    }
                }
            }
        }
    }
    free_image(&test);
}

/*
 * Dispatches to the measurement routine selected by config and returns its
 * result. A config value that matches none of the handled cases yields -1.
 */
#ifdef NO_BARRIER
double measure_sep(Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info, TestConfig config)
{
    switch(config){
        case IMAGE:
            return measure_sep_image(img, bands, tmp_mem0, tmp_mem1, threading_info);
        case TILE:
            return measure_sep_tile(img, bands, tmp_mem0, tmp_mem1, threading_info);
        case TILES_IN_IMAGE:
            return measure_sep_tiles_in_image(img, bands, tmp_mem0, tmp_mem1, threading_info);
    }

    // no matching configuration
    return -1;
}
#else
double measure_sep(Image * img, Bands * bands, const BandsThreadingInfo * threading_info, TestConfig config)
{
    switch(config){
        case IMAGE:
            return measure_sep_image(img, bands, threading_info);
        case TILE:
            return measure_sep_tile(img, bands, threading_info);
        case TILES_IN_IMAGE:
            return measure_sep_tiles_in_image(img, bands, threading_info);
    }

    // no matching configuration
    return -1;
}
#endif

/*
 * Times the four transform steps on tile (0,0) of a single-tile image.
 *
 * Asserts that there is exactly one socket and one tile. Each of ATTEMPTS
 * attempts runs the four steps REPETITIONS times, opening one parallel region
 * of threading_info->num_threads threads per repetition. The attempt
 * durations (differences of gettimer() values) are sorted with compare_times
 * and the middle element is taken.
 *
 * Returns 1000 * middle duration / (tile.size_x * tile.size_y * REPETITIONS).
 */
#ifdef NO_BARRIER
double measure_sep_tile(Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info)
#else
double measure_sep_tile(Image * img, Bands * bands, const BandsThreadingInfo * threading_info)
#endif
{
    assert(threading_info->num_sockets == 1 && img->num_tiles == 1);

    const size_t repetitions = REPETITIONS;
    const size_t attempts = ATTEMPTS;

    long long times[attempts];

    Tile tile;
    TileBands tile_bands;

    get_tile(&img[0], &tile, 0, 0);
    get_tile_bands(&bands[0], &tile_bands, 0, 0);

    // Note: no cache flush is performed between attempts.
    for(size_t a = 0; a < attempts; ++a){
        const long long started = gettimer(); // ns

        for(size_t r = 0; r < repetitions; ++r){
            #pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
            {
            #if defined NO_SSE && defined NO_BARRIER
                sep_H_predict_with_unpack_no_SSE_no_barrier(&tile, &tile_bands, &threading_info[0], &tmp_mem0[0], &tmp_mem1[0]);
                sep_V_predict_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0], &tmp_mem1[0]);
                sep_H_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0]);
                sep_V_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0]);
            #elif defined NO_SSE
                sep_H_predict_with_unpack_no_SSE(&tile, &tile_bands, &threading_info[0]);
                sep_V_predict_unpacked_no_SSE(&tile_bands, &threading_info[0]);
                sep_H_update_unpacked_no_SSE(&tile_bands, &threading_info[0]);
                sep_V_update_unpacked_no_SSE(&tile_bands, &threading_info[0]);
            #elif defined NO_BARRIER
                sep_H_predict_with_unpack_no_barrier(&tile, &tile_bands, &threading_info[0], &tmp_mem0[0], &tmp_mem1[0]);
                sep_V_predict_unpacked_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0], &tmp_mem1[0]);
                sep_H_update_unpacked_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0]);
                sep_V_update_unpacked_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0]);
            #else
                sep_H_predict_with_unpack(&tile, &tile_bands, &threading_info[0]);
                sep_V_predict_unpacked(&tile_bands, &threading_info[0]);
                sep_H_update_unpacked(&tile_bands, &threading_info[0]);
                sep_V_update_unpacked(&tile_bands, &threading_info[0]);
            #endif
            }
        }

        const long long finished = gettimer();

        times[a] = finished - started;
    }

    qsort(times, attempts, sizeof(long long), compare_times);

    // middle element of the sorted attempt durations
    const long long middle = times[attempts/2];

    return 1000.0*(double)middle/(double)((tile.size_x * tile.size_y) * repetitions);
}

/*
 * Times the whole-image transform.
 *
 * Each of ATTEMPTS attempts calls the build-specific sep*() entry point
 * REPETITIONS times. The attempt durations (differences of gettimer()
 * values) are sorted with compare_times and the middle element is taken.
 *
 * Returns 1000 * middle duration / (img->size * REPETITIONS).
 */
#ifdef NO_BARRIER
double measure_sep_image(Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info)
#else
double measure_sep_image(Image * img, Bands * bands, const BandsThreadingInfo * threading_info)
#endif
{
    const size_t repetitions = REPETITIONS;
    const size_t attempts = ATTEMPTS;

    long long times[attempts];

    // Note: no cache flush is performed between attempts.
    for(size_t a = 0; a < attempts; ++a){
        const long long started = gettimer(); // ns

        for(size_t r = 0; r < repetitions; ++r){
        #if defined NO_SSE && defined NO_BARRIER
            sep_no_SSE_no_barrier(img, bands, tmp_mem0, tmp_mem1, threading_info);
        #elif defined NO_SSE
            sep_no_SSE(img, bands, threading_info);
        #elif defined NO_BARRIER
            sep_no_barrier(img, bands, tmp_mem0, tmp_mem1, threading_info);
        #else
            sep(img, bands, threading_info);
        #endif
        }

        const long long finished = gettimer();

        times[a] = finished - started;
    }

    qsort(times, attempts, sizeof(long long), compare_times);

    // middle element of the sorted attempt durations
    const long long middle = times[attempts/2];

    return 1000.0*(double)middle/(double)(img->size * repetitions);
}

/*
 * Times the four transform steps separately on every tile of img[0].
 *
 * Asserts that there is exactly one socket. For each tile, ATTEMPTS attempts
 * are timed; each attempt runs the four steps REPETITIONS times, opening one
 * parallel region of threading_info->num_threads threads per repetition. All
 * attempt durations of all tiles (differences of gettimer() values) are
 * sorted together with compare_times and the middle element is taken.
 *
 * Returns 1000 * middle duration / (size_x * size_y * REPETITIONS), where
 * size_x/size_y are those of tile (0,0). Returns -1 when there is nothing to
 * measure or the sample buffer cannot be allocated.
 */
#ifdef NO_BARRIER
double measure_sep_tiles_in_image(Image * img, Bands * bands, TmpMem * tmp_mem0, TmpMem * tmp_mem1, const BandsThreadingInfo * threading_info)
#else
double measure_sep_tiles_in_image(Image * img, Bands * bands, const BandsThreadingInfo * threading_info)
#endif
{
    assert(threading_info->num_sockets == 1);

    const size_t repetitions = REPETITIONS;
    const size_t attempts = ATTEMPTS;

    const size_t tiles_x = img[0].tiles_per_width;
    const size_t tiles_y = img[0].tiles_per_height;

    // Sized from the same grid dimensions the loops below iterate over, so
    // every slot is written exactly once before sorting.
    const size_t num_samples = attempts * tiles_x * tiles_y;
    if(num_samples == 0)
        return -1;

    long long * times = malloc(num_samples * sizeof *times);
    if(!times)
        return -1;

    // Tile (0,0) supplies the pixel count used to normalise the result.
    Tile first_tile;
    get_tile(&img[0], &first_tile, 0, 0);

    for(size_t i = 0; i < tiles_y; ++i){
        for(size_t j = 0; j < tiles_x; ++j){
            Tile tile;
            TileBands tile_bands;

            get_tile(&img[0], &tile, j, i);
            get_tile_bands(&bands[0], &tile_bands, j, i);

            const size_t idx = i * tiles_x + j;

            // Note: no cache flush is performed between attempts.
            for(size_t a = 0; a < attempts; ++a){
                const long long started = gettimer(); // ns

                for(size_t r = 0; r < repetitions; ++r){
                    #pragma omp parallel proc_bind(close) num_threads(threading_info->num_threads)
                    {
                    #if defined NO_SSE && defined NO_BARRIER
                        sep_H_predict_with_unpack_no_SSE_no_barrier(&tile, &tile_bands, &threading_info[0], &tmp_mem0[0], &tmp_mem1[0]);
                        sep_V_predict_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0], &tmp_mem1[0]);
                        sep_H_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0]);
                        sep_V_update_unpacked_no_SSE_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0]);
                    #elif defined NO_SSE
                        sep_H_predict_with_unpack_no_SSE(&tile, &tile_bands, &threading_info[0]);
                        sep_V_predict_unpacked_no_SSE(&tile_bands, &threading_info[0]);
                        sep_H_update_unpacked_no_SSE(&tile_bands, &threading_info[0]);
                        sep_V_update_unpacked_no_SSE(&tile_bands, &threading_info[0]);
                    #elif defined NO_BARRIER
                        sep_H_predict_with_unpack_no_barrier(&tile, &tile_bands, &threading_info[0], &tmp_mem0[0], &tmp_mem1[0]);
                        sep_V_predict_unpacked_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0], &tmp_mem1[0]);
                        sep_H_update_unpacked_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0]);
                        sep_V_update_unpacked_no_barrier(&tile_bands, &threading_info[0], &tmp_mem0[0]);
                    #else
                        sep_H_predict_with_unpack(&tile, &tile_bands, &threading_info[0]);
                        sep_V_predict_unpacked(&tile_bands, &threading_info[0]);
                        sep_H_update_unpacked(&tile_bands, &threading_info[0]);
                        sep_V_update_unpacked(&tile_bands, &threading_info[0]);
                    #endif
                    }
                }

                const long long finished = gettimer();

                times[idx * attempts + a] = finished - started;
            }
        }
    }

    qsort(times, num_samples, sizeof *times, compare_times);

    // middle element of all sorted attempt durations
    const long long middle = times[num_samples/2];

    free(times);
    return 1000.0*(double)middle/(double)((first_tile.size_x * first_tile.size_y) * repetitions);
}
