/*
								+--------------------------------+
								|                                |
								| *** OCL temp buffer alloc. *** |
								|                                |
								|  Copyright  -tHE SWINe- 2016  |
								|                                |
								|          TempBuffer.h          |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __OPENCL_TEMP_BUFFER_STACK_INCLUDED
#define __OPENCL_TEMP_BUFFER_STACK_INCLUDED

/**
 *	@file gpgpu/TempBuffer.h
 *	@date 2016
 *	@author -tHE SWINe-
 *	@brief OpenCL temporary buffer allocators
 */

#include <vector>
#include "ClUtils.h"

/**
 *	@def OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING
 *	@brief if defined, CCLTempBufferStack collects usage information (the numbers
 *		of requests, hits, new allocations and reallocations) which is printed
 *		using printf() upon destruction
 *	@note The same counters are also collected and printed if _DEBUG is defined,
 *		regardless of this macro.
 */
#define OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING

/**
 *	@def OPENCL_TEMP_BUFFER_GREEDY_ALLOC
 *	@brief if defined, CCLTempBufferStack returns the first unused buffer
 *		with sufficient size, otherwise it returns the unused buffer with
 *		the smallest slack (the smallest one that is still large enough;
 *		an exact size match ends the search immediately)
 */
//#define OPENCL_TEMP_BUFFER_GREEDY_ALLOC

/**
 *	@def OPENCL_TEMP_BUFFER_MEMORY_ALIGN_BYTES
 *	@brief CCLTempBufferStack rounds the sizes of the buffers it allocates up
 *		to an integer multiple of this value (128 B is good on NVIDIA platforms)
 *	@note This needs to be always defined; CCLTempBufferStack uses it unconditionally.
 *	@note If an allocation of the rounded-up size fails, CCLTempBufferStack retries
 *		with the exact requested size, so buffers with misaligned sizes may still
 *		exist (their number is reported when the stack is destroyed).
 */
#define OPENCL_TEMP_BUFFER_MEMORY_ALIGN_BYTES (32 * sizeof(uint32_t))

/**
 *	@brief stack of temporary buffers for recursive calls which e.g. use autotuned
 *		reasoning and require simple means of allocating small temp buffers
 *	@note The buffers are supposed to be returned in reverse order.
 */
class CCLTempBufferStack {
	friend class CCLTempBufferReservation;
protected:
	std::vector<std::pair<size_t, cl_mem> > m_buffer_list;
	size_t m_n_reserved_buffer_num;
	CCLContextWrapper m_context; // nonmanaged!

#if defined(_DEBUG) || defined(OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING)
	size_t m_n_request_num, m_n_hit_num, m_n_allocation_num, m_n_reallocation_num;
#endif // _DEBUG || OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING
public:
	CCLTempBufferStack(cl_context h_context)
		:m_n_reserved_buffer_num(0), m_context(h_context)
#if defined(_DEBUG) || defined(OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING)
		, m_n_request_num(0), m_n_hit_num(0), m_n_allocation_num(0), m_n_reallocation_num(0)
#endif // _DEBUG || OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING
	{}

	~CCLTempBufferStack()
	{
		_ASSERTE(!m_n_reserved_buffer_num);
		uint64_t n_capacity = 0;
		size_t n_misaligned_num = 0;
		for(size_t i = 0, n = m_buffer_list.size(); i < n; ++ i) {
			n_capacity += min(uint64_t(m_buffer_list[i].first),
				UINT64_MAX - m_buffer_list[i].first); // saturated arithmetics
			if(m_buffer_list[i].first % OPENCL_TEMP_BUFFER_MEMORY_ALIGN_BYTES)
				++ n_misaligned_num;
			clReleaseMemObject(m_buffer_list[i].second);
		}
		// free the memory

#if defined(_DEBUG) || defined(OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING)
		printf("debug: CCLTempBufferStack::~CCLTempBufferStack(): had " PRIsize " requests, " PRIsize
			" hits, " PRIsize " new allocs, " PRIsize " reallocs, " PRIsize " now (" PRIsizeB "B)\n",
			m_n_request_num, m_n_hit_num, m_n_allocation_num, m_n_reallocation_num,
			m_buffer_list.size(), PRIsizeBparams(n_capacity));
		if(n_misaligned_num) {
			printf("debug: CCLTempBufferStack::~CCLTempBufferStack(): there are " PRIsize
				" misaligned allocations\n", n_misaligned_num);
		}
#endif // _DEBUG || OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING
	}

	cl_context h_Context() const // gets the context all the buffers are allocated with (if there are multiple contexts, the banks must be matched -- the buffers will only reside on the devides in the given context)
	{
		return m_context;
	}

	bool b_Have_UnusedBuffers() const
	{
		_ASSERTE(m_n_reserved_buffer_num <= m_buffer_list.size());
		return m_n_reserved_buffer_num < m_buffer_list.size();
	}

	void Free_UnusedBuffers()
	{
		for(size_t i = m_n_reserved_buffer_num, n = m_buffer_list.size(); i < n; ++ i)
			clReleaseMemObject(m_buffer_list[i].second);
		m_buffer_list.resize(m_n_reserved_buffer_num);
	}

private:
	void FreeBuffer(cl_mem UNUSED(h_buffer))
	{
		// too bad one needs a command queue for reading the buffer contents, could allocate
		// slightly more and check array bounds getting overwritten in debug (still, could specify
		// that to the allocation object (or a checked allocation object) or to this, with the possibility
		// to specify a null handle)

		// note that the buffers are deleted asynchronously (OpenCL objects are refcounted so deleting the
		// buffer while the kernel accessing it is still running (or even only queued) is ok), thus making
		// an extra command queue for checking the buffer contents would not work unless synchronizing the
		// entire context (cuCtxSynchronize() in CUDA, probably no function like that in OpenCL (yet)).

		_ASSERTE(m_n_reserved_buffer_num); // make sure there are buffers allocated here
		_ASSERTE(m_buffer_list[m_n_reserved_buffer_num - 1].second == h_buffer); // make sure the next buffer to be deleted is this one
		if(m_buffer_list[m_n_reserved_buffer_num - 1].second == h_buffer)
			-- m_n_reserved_buffer_num; // delete the buffer from the allocation
		else {
			// note that this should not be used, the objects in C++ are supposed
			// to be deallocated in reverse allocation order but especially in release,
			// one could expect the compiler
			for(size_t i = m_n_reserved_buffer_num; i > 0;) {
				-- i; // here
				if(m_buffer_list[i].second == h_buffer) {
					std::pair<size_t, cl_mem> t_info = m_buffer_list[i]; // get the buffer
					m_buffer_list.erase(m_buffer_list.begin() + i); // remove it from the list of reservations
					-- m_n_reserved_buffer_num; // one less is reserved now
					m_buffer_list.push_back(t_info); // put it at the end (won't throw)
					return;
				}
			}
			_ASSERTE(0); // should not happen; perhaps the buffer was not allocated by this object or was since reallocated
		}
	}

	cl_mem h_GetBuffer(size_t n_size_bytes) // only accessible via CCLTempBufferReservation(), not reentrant (!)
	{
		const size_t n_old_reservation_num = m_n_reserved_buffer_num;

		cl_mem h_buffer = h_GetBuffer_Internal(n_size_bytes);

		_ASSERTE(m_buffer_list.size() >= m_n_reserved_buffer_num); // make sure the number of reservations is sane
		_ASSERTE(!h_buffer || m_n_reserved_buffer_num == n_old_reservation_num + 1); // make sure that we reserved a buffer
		_ASSERTE(!h_buffer || h_buffer == m_buffer_list[m_n_reserved_buffer_num - 1].second); // make sure that the returned buffer is indeed on top of the stack now
		_ASSERTE(!h_buffer || m_buffer_list[m_n_reserved_buffer_num - 1].first >= n_size_bytes); // make sure that the returned buffer is large enough
		// just a wrapper to debug things

		return h_buffer;
	}

protected:
	cl_mem h_GetBuffer_Internal(size_t n_size_bytes)
	{
#if defined(_DEBUG) || defined(OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING)
		++ m_n_request_num;
#endif // _DEBUG || OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING
		size_t n_largest = 0, n_largest_size = 0;
#ifndef OPENCL_TEMP_BUFFER_GREEDY_ALLOC
		size_t n_smallest_fitting = size_t(-1), n_smallest_fitting_size = SIZE_MAX;
#endif // !OPENCL_TEMP_BUFFER_GREEDY_ALLOC
		for(size_t i = m_n_reserved_buffer_num, n = m_buffer_list.size(); i < n; ++ i) {
			if(m_buffer_list[i].first >= n_size_bytes) { // todo - impose a max slack limit? will it make things worse?
#ifndef OPENCL_TEMP_BUFFER_GREEDY_ALLOC
				if(m_buffer_list[i].first == n_size_bytes) { // also solves the problem with n_size_bytes == SIZE_MAX
					n_smallest_fitting_size = m_buffer_list[i].first;
					n_smallest_fitting = i;
					break; // perfect fit, look no more
				} else if(n_smallest_fitting_size > m_buffer_list[i].first) { // note that if n_size_bytes == SIZE_MAX, this never enters; don't want to use >= though since that would lead to finding the right-most buffer if there are more
					n_smallest_fitting_size = m_buffer_list[i].first;
					n_smallest_fitting = i;
				}
#else // !OPENCL_TEMP_BUFFER_GREEDY_ALLOC
				if(i != m_n_reserved_buffer_num)
					std::swap(m_buffer_list[i], m_buffer_list[m_n_reserved_buffer_num]);
				++ m_n_reserved_buffer_num;
#if defined(_DEBUG) || defined(OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING)
				++ m_n_hit_num;
#endif // _DEBUG || OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING
				return m_buffer_list[m_n_reserved_buffer_num - 1].second;
#endif // !OPENCL_TEMP_BUFFER_GREEDY_ALLOC
			} else if(m_buffer_list[i].first > n_largest_size) {
				n_largest_size = m_buffer_list[i].first;
				n_largest = i;
			}
		}
#ifndef OPENCL_TEMP_BUFFER_GREEDY_ALLOC
		if(n_smallest_fitting != size_t(-1)) { // todo - impose a max slack limit? will it make things worse?
			_ASSERTE(n_smallest_fitting_size >= n_size_bytes);
			const size_t i = n_smallest_fitting;
			if(i != m_n_reserved_buffer_num)
				std::swap(m_buffer_list[i], m_buffer_list[m_n_reserved_buffer_num]);
			++ m_n_reserved_buffer_num;
#if defined(_DEBUG) || defined(OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING)
			++ m_n_hit_num;
#endif // _DEBUG || OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING
			return m_buffer_list[m_n_reserved_buffer_num - 1].second;
		}
#endif // !OPENCL_TEMP_BUFFER_GREEDY_ALLOC
		// see if there already is a buffer of required size

		if(n_largest_size) { // in case there are buffers allocated, take the largest one and reallocate it
#if defined(_DEBUG) || defined(OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING)
			++ m_n_reallocation_num;
#endif // _DEBUG || OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING

			if(n_largest != m_n_reserved_buffer_num)
				std::swap(m_buffer_list[n_largest], m_buffer_list[m_n_reserved_buffer_num]);
			size_t i = m_n_reserved_buffer_num;
			clReleaseMemObject(m_buffer_list[i].second); // delete the buffer first
			size_t n_new_size = std::max(n_Align_Up(std::max(2 * n_largest_size, n_size_bytes),
				size_t(OPENCL_TEMP_BUFFER_MEMORY_ALIGN_BYTES)), n_size_bytes); // the outer max guards against overflow
			CLresult n_result;
			if((n_result = m_context.n_CreateBuffer(m_buffer_list[i].second, n_new_size)) != cl_Success) {
				n_new_size = n_size_bytes; // try the exact size if the double is over the device limit
				if((n_result = m_context.n_CreateBuffer(m_buffer_list[i].second, n_new_size)) != cl_Success) {
					m_buffer_list.erase(m_buffer_list.begin() + i); // this does not have any memory associated with it now
					if(b_Have_UnusedBuffers()) {
						Free_UnusedBuffers(); // last ditch effort; free everything not in use and try again
						_ASSERTE(!b_Have_UnusedBuffers());
						return h_GetBuffer(n_size_bytes);
					}
					return 0;
				}
			}
			m_buffer_list[i].first = n_new_size;
			++ m_n_reserved_buffer_num;
			_ASSERTE(i == m_n_reserved_buffer_num - 1);
			return m_buffer_list[i].second;
		} else { // there are no unused buffers; allocate a new one
#if defined(_DEBUG) || defined(OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING)
			++ m_n_allocation_num;
#endif // _DEBUG || OPENCL_TEMP_BUFFER_STACK_USAGE_PROFILING

			size_t n_new_size = std::max(n_Align_Up(n_size_bytes,
				size_t(OPENCL_TEMP_BUFFER_MEMORY_ALIGN_BYTES)), n_size_bytes); // the max guards against overflow
			if(!stl_ut::Resize_Add_1More(m_buffer_list, std::make_pair(n_new_size, CCLUniqueMem())))
				return 0;
			CLresult n_result;
			if((n_result = m_context.n_CreateBuffer(m_buffer_list.back().second, n_new_size)) != cl_Success) {
				m_buffer_list.back().first = n_new_size = n_size_bytes;
				if((n_result = m_context.n_CreateBuffer(m_buffer_list.back().second, n_new_size)) != cl_Success) {
					m_buffer_list.erase(m_buffer_list.end() - 1); // this does not have any memory associated with it now
					if(b_Have_UnusedBuffers()) {
						Free_UnusedBuffers(); // last ditch effort; free everything not in use and try again
						_ASSERTE(!b_Have_UnusedBuffers());
						return h_GetBuffer(n_size_bytes);
					}
					return 0;
				}
			}
			++ m_n_reserved_buffer_num;
			_ASSERTE(m_buffer_list.size() == m_n_reserved_buffer_num);
			return m_buffer_list.back().second;
		}
	}

protected:
	CCLTempBufferStack(const CCLTempBufferStack &r_other); // no-copy
	CCLTempBufferStack &operator =(const CCLTempBufferStack &r_other); // no-copy
};

/**
 *	@brief temporary buffer reservation
 */
class CCLTempBufferReservation {
protected:
	CCLTempBufferStack &m_r_owner; /**< @brief the buffer stack this reservation was made in */
	cl_mem m_h_buffer; /**< @brief handle to the reserved buffer (null if the reservation failed) */

public:
	/**
	 *	@brief constructor; reserves a temporary buffer in the given stack
	 *
	 *	@param[in] n_size_bytes is the required size of the buffer, in bytes
	 *	@param[in,out] r_bank is the buffer stack to reserve the buffer in (it is
	 *		referenced by this object and needs to outlive it)
	 *
	 *	@note In case the reservation fails, the cl_mem conversion yields a null handle.
	 */
	CCLTempBufferReservation(size_t n_size_bytes, CCLTempBufferStack &r_bank)
		:m_r_owner(r_bank), m_h_buffer(0)
	{
		m_h_buffer = m_r_owner.h_GetBuffer(n_size_bytes);
	}

	/**
	 *	@brief destructor; hands the buffer back to the owner (if one was reserved)
	 */
	~CCLTempBufferReservation()
	{
		if(!m_h_buffer)
			return; // the reservation failed, there is nothing to give back
		m_r_owner.FreeBuffer(m_h_buffer);
	}

	/**
	 *	@brief conversion to the OpenCL memory object handle
	 *	@return Returns the handle of the reserved buffer.
	 */
	operator cl_mem() const
	{
		return m_h_buffer;
	}

	// there is intentionally no Delete() function; it would allow freeing
	// the buffers in a different order than the one they were allocated in

protected:
	CCLTempBufferReservation(const CCLTempBufferReservation &r_other); // no-copy
	CCLTempBufferReservation &operator =(const CCLTempBufferReservation &r_other); // no-copy
};

#endif // !__OPENCL_TEMP_BUFFER_STACK_INCLUDED
