/*
 * File:   common.h
 * Author: Pavel Najman <najman.pavel at gmail.com>
 *
 * Created on April 28, 2017, 8:08 AM
 */

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>

// Build switch: when defined, .mat files are created (per the original note).
// NOTE(review): the code that tests this macro lives outside this header;
// presumably it is checked with #ifdef, so comment the line out to disable it.
#define DUMP_MAT

// Assumed cache line size: 64 B.
// The padding macros in this header (LOCA_BAND_PADDING, LOCA_ROW_PADDING,
// IMAGE_ROW_PADDING, BANDS_ROW_PADDING) are expressed in terms of this value.
#define CACHE_LINE_SIZE 64

// Local memory layout:
//  - 4 memory areas (one for each subband), needed because of the next
//    transform level
//  - a coefficient is a float32 (thus 4 B),
//    i.e. 16 coefficients fit in one cache line
//
// Layout switches for the local memory:
//  - LOCA_SPARSE_BANDS selects a non-zero LOCA_BAND_PADDING
//  - LOCA_SPARSE_ROWS  selects a non-zero LOCA_ROW_PADDING (currently disabled)

#define LOCA_SPARSE_BANDS
//#define LOCA_SPARSE_ROWS

// LOCA_BAND_PADDING: CACHE_LINE_SIZE when LOCA_SPARSE_BANDS is defined,
// otherwise 0.
// NOTE: a padding of +1 would break the mapping to cache lines.
// NOTE: a padding of +CACHE_LINE_SIZE skews the mapping to cache sets.
// NOTE(review): presumably inserted per band in the local memory buffers;
// the unit (bytes vs. coefficients) is not visible here -- confirm at the use site.
#if !defined(LOCA_SPARSE_BANDS)
#	define LOCA_BAND_PADDING 0
#else
#	define LOCA_BAND_PADDING CACHE_LINE_SIZE
#endif

// LOCA_ROW_PADDING: CACHE_LINE_SIZE when LOCA_SPARSE_ROWS is defined,
// otherwise 0.
// NOTE: a padding of +1 would break the mapping to cache lines.
// NOTE: a padding of +CACHE_LINE_SIZE skews the mapping to cache sets.
// NOTE(review): presumably inserted per row in the local memory buffers;
// the unit (bytes vs. coefficients) is not visible here -- confirm at the use site.
#if !defined(LOCA_SPARSE_ROWS)
#	define LOCA_ROW_PADDING 0
#else
#	define LOCA_ROW_PADDING CACHE_LINE_SIZE
#endif

// Ceiling division of band_size_y by num_threads.
// NOTE: the macro reads an identifier named `band_size_y` from the scope in
// which it is expanded; it is not a macro parameter. num_threads is
// evaluated twice, so pass an expression without side effects.
#define BAND_CHUNK_Y(num_threads) (((num_threads) + band_size_y - 1) / (num_threads))

// NO_TREE_VECTORIZE: function attribute placed on the *_no_SSE prototypes.
//  - Intel compiler: expands to nothing; when building for MIC (__MIC__),
//    NO_SSE is defined as well, which hides the put_tmp_mem_* declarations
//    guarded by #ifndef NO_SSE.
//  - GCC: expands to the optimize("no-tree-vectorize") function attribute.
//  - Any other compiler (including clang, which defines __GNUC__ but does not
//    implement the `optimize` attribute): expands to nothing, so the header
//    compiles without unknown-attribute warnings or errors.
#if defined(__INTEL_COMPILER)
    #define NO_TREE_VECTORIZE
    #ifdef __MIC__
        #define NO_SSE
    #endif
#elif defined(__GNUC__) && !defined(__clang__)
    #define NO_TREE_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
#else
    #define NO_TREE_VECTORIZE
#endif

// Benchmark loop counts.
// NOTE(review): the use sites are outside this header; presumably ATTEMPTS is
// the number of timed measurements and REPETITIONS the number of runs per
// measurement -- confirm against the benchmark driver.
#define ATTEMPTS 500
#define REPETITIONS 10

// Selects the test configuration.
// NOTE(review): the exact meaning of each mode is defined by the code that
// switches on it, which is not visible in this header.
typedef enum {
    IMAGE          = 0,
    TILE           = 1,
    TILES_IN_IMAGE = 2
} TestConfig;

// Temporary (local) memory: one float buffer per subband plus the band width.
// NOTE(review): buffer sizes and ownership are set by allocate_tmp_mem() /
// free_tmp_mem(), whose implementations are not visible here.
typedef struct {
    size_t band_size_x; // presumably the row length of each buffer -- confirm
    float *LH;
    float *HH;
    float *LL;
    float *HL;
} TmpMem;

// Sets up *tmp_mem for the given thread count and row size.
// NOTE(review): implementation not visible in this header; presumably it
// allocates the LH/HH/LL/HL buffers that free_tmp_mem() later releases.
void allocate_tmp_mem(TmpMem *tmp_mem, size_t num_threads, size_t size_x);

// Per-subband "put" helpers (LH, HH, LL, HL). They are declared only when
// NO_SSE is not defined, so these are presumably the SSE implementations.
// NOTE(review): judging by the names, each copies data from `src` into row `y`
// of the matching TmpMem buffer -- confirm in the implementation file.
#if !defined(NO_SSE)
void put_tmp_mem_LH(TmpMem *tmp_mem, float *src, size_t y);
void put_tmp_mem_HH(TmpMem *tmp_mem, float *src, size_t y);
void put_tmp_mem_LL(TmpMem *tmp_mem, float *src, size_t y);
void put_tmp_mem_HL(TmpMem *tmp_mem, float *src, size_t y);
#endif

// Non-SSE counterparts of put_tmp_mem_LH / put_tmp_mem_HH, tagged with
// NO_TREE_VECTORIZE. Always declared, regardless of NO_SSE.
// NOTE(review): only the LH and HH variants are declared in this header.
void NO_TREE_VECTORIZE put_tmp_mem_LH_no_SSE(TmpMem *tmp_mem, float *src, size_t y);
void NO_TREE_VECTORIZE put_tmp_mem_HH_no_SSE(TmpMem *tmp_mem, float *src, size_t y);

// Counterpart of allocate_tmp_mem().
// NOTE(review): implementation not visible here; presumably it releases the
// buffers referenced by *tmp_mem.
void free_tmp_mem(TmpMem *tmp_mem);

// Threading configuration: socket count and thread count.
// Filled in by init_threading_info().
typedef struct {
    size_t num_sockets;
    size_t num_threads;
} ThreadingInfo;

// Initializes *info.
// NOTE(review): how the socket/thread counts are determined is not visible
// in this header.
void init_threading_info(ThreadingInfo *info);

// One tile: extent, row stride and a pointer to its float samples.
// NOTE(review): get_tile() fills a Tile from an Image, so `data` presumably
// points into the image buffer rather than owning memory -- confirm.
typedef struct {
    size_t size_x;
    size_t size_y;
    size_t stride_y;
    float *data;
} Tile;

// Row padding for Image data; equal to CACHE_LINE_SIZE.
// NOTE(review): the unit (bytes vs. floats) and how it enters Image.stride_y
// are decided in the implementation, not visible here -- confirm.
#define IMAGE_ROW_PADDING CACHE_LINE_SIZE

// An image together with its tiling parameters.
// NOTE(review): the field meanings below are inferred from the names; the code
// that fills them (allocate_image / init_image / set_tile_size) is not visible.
typedef struct {
    size_t size_x;
    size_t size_y;
    size_t size;             // presumably the total sample count -- confirm
    size_t stride_y;         // presumably the distance between rows -- confirm
    size_t tile_size_x;
    size_t tile_size_y;
    size_t tiles_per_width;
    size_t tiles_per_height;
    size_t num_tiles;
    float *data;
} Image;

// Image API. Implementations are not visible in this header; the one-line
// notes below are inferred from the names and signatures -- confirm.

// Presumably allocates img->data for a size_x by size_y image.
void allocate_image(Image *img, size_t size_x, size_t size_y);

// Presumably initializes the image for the given tile size.
void init_image(Image *img, size_t tile_size_x, size_t tile_size_y);

// Presumably loads the image contents from `filename` into *img.
void load_image(const char *filename, Image *img);

// Presumably updates the tiling fields of *img for a new tile size.
void set_tile_size(Image *img, size_t tile_size_x, size_t tile_size_y);

// Fills *tile for the tile addressed by (x, y); *img is not modified (const).
void get_tile(const Image *img, Tile *tile, size_t x, size_t y);

// Counterpart of allocate_image().
void free_image(Image *img);

// Per-tile view of the four subbands (LL, HL, LH, HH).
// NOTE(review): filled by get_tile_bands(); the pointers presumably reference
// memory owned by a Bands object -- confirm.
typedef struct {
    size_t size_x;
    size_t size_y;
    size_t stride_y;
    float *LL;
    float *HL;
    float *LH;
    float *HH;
} TileBands;

// Padding constants for Bands storage:
//  - BANDS_ROW_PADDING equals CACHE_LINE_SIZE
//  - BANDS_PADDING is 0 (no extra padding)
// NOTE(review): units and exact use are decided in the implementation
// (presumably allocate_bands) -- confirm.
#define BANDS_ROW_PADDING CACHE_LINE_SIZE
#define BANDS_PADDING 0

// Subband storage: overall extent, per-band extent and one float pointer per
// subband (LL, HL, LH, HH).
// NOTE(review): field meanings are inferred from the names; allocate_bands()
// fills them and is not visible here.
typedef struct {
    size_t size_x;
    size_t size_y;
    size_t stride_y;
    size_t band_size_x;
    size_t band_size_y;
    size_t band_stride_y;
    float *LL;
    float *HL;
    float *LH;
    float *HH;
} Bands;

// Bands API. Implementations are not visible in this header; the one-line
// notes below are inferred from the names and signatures -- confirm.

// Presumably sizes and allocates *bands to match *img (img is read-only).
void allocate_bands(Bands *bands, const Image *img);

// Presumably resets the band contents.
void clear_bands(Bands *bands);

// Fills *tile_bands for the tile addressed by (x, y); *bands is not modified (const).
void get_tile_bands(const Bands *bands, TileBands *tile_bands, size_t x, size_t y);

// Counterpart of allocate_bands().
void free_bands(Bands *bands);

// Threading configuration plus two size_t arrays with band start/end rows.
// NOTE(review): the arrays presumably hold one entry per thread (they are set
// up by allocate_bands_threading_info(info, num_threads)) -- confirm.
typedef struct {
    size_t num_sockets;
    size_t num_threads;

    size_t *band_start_y;
    size_t *band_end_y;
} BandsThreadingInfo;

// BandsThreadingInfo API. Implementations are not visible in this header; the
// one-line notes below are inferred from the names and signatures -- confirm.

// Presumably allocates the band_start_y / band_end_y arrays for num_threads.
void allocate_bands_threading_info(BandsThreadingInfo *info, size_t num_threads);

// Presumably computes the per-thread row ranges from *bands (read-only).
void init_bands_threading_info(BandsThreadingInfo *info, const Bands *bands,
                               size_t num_sockets, size_t num_threads);

// Counterpart of allocate_bands_threading_info().
void free_bands_threading_info(BandsThreadingInfo *info);

// A range of tiles: a tile count plus start and end indices.
// NOTE(review): whether end_index is inclusive or exclusive is not visible
// here -- confirm in init_chunk().
typedef struct {
    size_t num_tiles;
    size_t start_index;
    size_t end_index;
} Chunk;

// Fills *chunk for thread `tid`; *img and *info are read-only inputs.
// NOTE(review): the partitioning rule is implemented elsewhere -- confirm.
void init_chunk(Chunk *chunk, const Image *img, const BandsThreadingInfo *info,
                size_t tid);

// Returns a timer reading as a long long.
// NOTE(review): the unit and the clock source are not visible in this header.
// Declared with (void) so that this is a real prototype: before C23 an empty
// parameter list `()` leaves the arguments unchecked.
long long gettimer(void);

// Flushes the cache (per the name).
// NOTE(review): the mechanism is implemented elsewhere and not visible here.
// Declared with (void) so that this is a real prototype: before C23 an empty
// parameter list `()` leaves the arguments unchecked.
void flush_cache(void);

// Comparison callback for two time values passed by pointer.
// The signature matches the comparator type taken by qsort()/bsearch().
// NOTE(review): the pointed-to element type and the sort order are defined by
// the implementation, not visible here.
int compare_times(const void *p1, const void *p2);

#endif	// COMMON_H

