/*
								+--------------------------------+
								|                                |
								|    ***  CUDA utilities  ***    |
								|                                |
								|  Copyright  -tHE SWINe- 2010  |
								|                                |
								|          CuUtils.cpp           |
								|                                |
								+--------------------------------+
*/

#include "../NewFix.h"
#include "../CallStack.h"
#include <stdio.h>
#include <cuda.h>
#include "../Dir.h" // PRIsizeB
#include "CuUtils.h"

/*
 *								=== CCuDeviceParams ===
 */

/**
 *	@brief prints a single-line summary of this device
 *
 *	The line contains the device index, name, memory size, clock frequency,
 *	multiprocessor count, the per-block thread limit and the maximal block dimensions.
 *
 *	@param[in] p_fw is the output stream (passed directly to fprintf())
 */
void CCuDeviceParams::Dump(FILE *p_fw)
{
	const float f_clock_MHz = t_Properties().clockRate / 1e3f;
	// the clockRate field is divided by 1000 to get the value printed as MHz

	const int n_max_block_threads = t_Properties().maxThreadsPerBlock;
	const int n_max_block_x = t_Properties().maxThreadsDim[0];
	const int n_max_block_y = t_Properties().maxThreadsDim[1];
	const int n_max_block_z = t_Properties().maxThreadsDim[2];
	// copy the block limits to locals to keep the argument list short

	fprintf(p_fw, "device(id: %d, \'%s\', " PRIsizeB
		"B RAM, %.2f MHz, multiproc: %d, max-threads-block: %d, max-block-size: %dx%dx%d)\n",
		n_DeviceIndex(), p_s_Name(),
		PRIsizeBparams(n_Memory_Size()),
		f_clock_MHz,
		n_Multiprocessor_Num(),
		n_max_block_threads,
		n_max_block_x, n_max_block_y, n_max_block_z);
	// print it all at once
}

/**
 *	@brief finds the device with the highest estimated throughput
 *
 *	Each device is scored as clock rate times multiprocessor count. If two devices
 *	score the same, the one reporting more memory wins.
 *
 *	@return Returns zero-based index of the selected device, or -1 if any of the driver
 *		queries fails or if no device was selected (e.g. there are no devices).
 *
 *	@note This presumably requires the driver API to be initialized by the caller
 *		beforehand (no initialization is performed here) - TODO confirm.
 */
int CCuDeviceParams::n_Get_MaxGFlops_DeviceId()
{
	int n_device_num;
	if(cuDeviceGetCount(&n_device_num) != CUDA_SUCCESS)
		return -1;
	// get device count

	int n_best_device = -1;
	double f_best_gflops = 0;
	size_t n_best_memory_size = 0;
	for(int i = 0; i < n_device_num; ++ i) {
		CUdevice h_device;
		if(cuDeviceGet(&h_device, i) != CUDA_SUCCESS)
			return -1;
		// translate device ordinal to device handle

		CUdevprop t_devprop;
		if(cuDeviceGetProperties(&t_devprop, h_device) != CUDA_SUCCESS)
			return -1;
		// get device properties

		int n_multiproc_num;
		unsigned int n_memory_size; // NOTE(review): type kept as in the original; newer CUDA headers may expect size_t here - confirm against the targeted CUDA version
		if(cuDeviceGetAttribute(&n_multiproc_num,
		   CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, h_device) != CUDA_SUCCESS ||
		   cuDeviceTotalMem(&n_memory_size, h_device) != CUDA_SUCCESS)
			return -1;
		// get device multiprocessor count and memory size; note that these
		// functions take the device handle, not the ordinal used by cuDeviceGet()

		double f_gflops = double(t_devprop.clockRate) * n_multiproc_num/*t_devprop.maxThreadsPerBlock*/;
		// multiply in double, so that the product can not overflow int

		//size_t n_memory_size = t_devprop.totalConstantMemory;
		if(f_best_gflops < f_gflops || (f_best_gflops == f_gflops && n_best_memory_size < n_memory_size)) {
			f_best_gflops = f_gflops;
			n_best_memory_size = n_memory_size;
			n_best_device = i;
		}
		// calculate device power, find maximum (more memory wins a tie)
	}
	// select the meanest device

	return n_best_device;
}

/**
 *	@brief queries the parameters of a device
 *
 *	Reads the device handle, name, compute capability, multiprocessor count,
 *	kernel execution timeout flag, memory size and device properties.
 *
 *	On success, the stored index equals n_device_index. If any of the queries
 *	fails (or if there is not enough memory for the name), the constructor returns
 *	early and the stored index remains -1; fields which were not read yet
 *	remain zero / false / empty.
 *
 *	@param[in] n_device_index is zero-based device ordinal (passed to cuDeviceGet())
 */
CCuDeviceParams::CCuDeviceParams(int n_device_index)
	:m_n_index(-1), // mark error
	m_h_device(0), m_n_memory_size(0)
{
	memset(m_p_device_caps, 0, 2 * sizeof(int));
	memset(&m_t_devprop, 0, sizeof(CUdevprop));
	m_n_multiprocessor_num = 0;
	m_b_kernel_exec_timeout = false;
	// give every member a defined value, so that the object can be safely
	// read even if one of the queries below fails and the constructor returns early

	if(cuDeviceGet(&m_h_device, n_device_index) != CUDA_SUCCESS)
		return;
	// get device handle

	try {
		const int n_max_name_buffer = 1 << 20;
		// sanity limit of the name buffer size; keeps n_length from overflowing
		// in case the driver never returns a name that fits

		std::string s_name;
		for(int n_length = 1;; n_length *= 2) {
			if(n_length > n_max_name_buffer)
				return;
			// give up rather than growing the buffer forever

			s_name.resize(n_length + 1);
			// (re)allocate string

			if(cuDeviceGetName(&s_name[0], n_length, m_h_device) != CUDA_SUCCESS)
				return;
			s_name[n_length] = 0; // terminating null (just make sure it's terminated)
			// get device name

			const size_t n_name_length = strlen(s_name.c_str());
			if(n_name_length + 1 < size_t(n_length)) {
				s_name.erase(n_name_length);
				m_s_name.assign(s_name);
				break;
			}
			// accept the name only if at least one byte of the buffer remained unused
			// (a name that fills the buffer exactly could have been truncated),
			// and make sure the string has the correct length
		}
	} catch(std::bad_alloc&) {
		return;
	}
	// get device name

	if(cuDeviceComputeCapability(&m_p_device_caps[0],
	   &m_p_device_caps[1], m_h_device) != CUDA_SUCCESS)
		return;
	// get device caps

	int n_kernel_exec_timeout;
	if(cuDeviceGetAttribute(&m_n_multiprocessor_num,
	   CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, m_h_device) != CUDA_SUCCESS ||
	   cuDeviceGetAttribute(&n_kernel_exec_timeout,
	   CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, m_h_device) != CUDA_SUCCESS)
		return;
	m_b_kernel_exec_timeout = n_kernel_exec_timeout != 0;
	// get some interesting device attributes (using the device handle,
	// the same as all the other queries in this constructor)

	if(cuDeviceTotalMem(&m_n_memory_size, m_h_device) != CUDA_SUCCESS)
		return;
	// get device memory size

	if(cuDeviceGetProperties(&m_t_devprop, m_h_device) != CUDA_SUCCESS)
		return;
	// get device properties

	m_n_index = n_device_index; // mark success
}

/**
 *	@brief decides whether a problem of the given size can be covered by a single grid
 *
 *	For each dimension, the problem size is divided by the maximal grid size (rounding up)
 *	to get the smallest block size which covers the problem. The problem fits if none
 *	of those block sizes exceeds the maximal block dimensions and if their product
 *	does not exceed the maximal number of threads per block.
 *
 *	@param[in] n_width is problem size in the first dimension
 *	@param[in] n_height is problem size in the second dimension
 *	@param[in] n_depth is problem size in the third dimension
 *
 *	@return Returns true if the problem fits, otherwise returns false. Also returns false
 *		if the stored device properties contain a non-positive maximal grid size
 *		(they are zeroed when the constructor fails).
 *
 *	@note Non-positive problem sizes are treated as requiring a block size of zero.
 */
bool CCuDeviceParams::b_ProblemFitsAtOnce(int n_width, int n_height, int n_depth) const
{
	const int p_problem_size[3] = {n_width, n_height, n_depth};

	int n_block_thread_num = 1;
	for(int i = 0; i < 3; ++ i) {
		const int n_max_grid_size = m_t_devprop.maxGridSize[i];
		if(n_max_grid_size < 1)
			return false;
		// device properties not loaded; would divide by zero below

		const int n_block_size = (p_problem_size[i] > 0)?
			(p_problem_size[i] - 1) / n_max_grid_size + 1 : 0;
		// calculate dimensions, relative to maximal grid size (round up); this form
		// does not add the (possibly very large) grid size to the problem size
		// and therefore can not overflow

		if(n_block_size > m_t_devprop.maxThreadsDim[i])
			return false;
		// those dimensions must not exceed block size

		n_block_thread_num *= n_block_size;
		// each factor is bounded by the corresponding maxThreadsDim here
	}

	if(n_block_thread_num > m_t_devprop.maxThreadsPerBlock)
		return false;
	// number of threads in the block must be below limit

	return true;
}

/**
 *	@brief calculates block and grid dimensions which cover a problem of the given size
 *
 *	The block size in each dimension is the problem size divided by the maximal grid size
 *	(rounded up), the grid size is then the problem size divided by the block size
 *	(rounded up as well).
 *
 *	@param[out] p_block_size is filled with three block dimensions
 *	@param[out] p_grid_size is filled with three grid dimensions
 *	@param[in] n_width is problem size in the first dimension
 *	@param[in] n_height is problem size in the second dimension
 *	@param[in] n_depth is problem size in the third dimension
 *
 *	@return Returns true on success. Returns false if any of the problem dimensions
 *		is not positive, if the stored device properties contain a non-positive maximal
 *		grid size, or if b_ProblemFitsAtOnce() rejects the problem; the output arrays
 *		are not written in that case.
 */
bool CCuDeviceParams::CalculateGridParams(int *p_block_size, int *p_grid_size,
	int n_width, int n_height, int n_depth) const
{
	if(n_width < 1 || n_height < 1 || n_depth < 1)
		return false;
	// an empty problem would yield zero block size and the grid size
	// calculation below would divide by zero

	if(m_t_devprop.maxGridSize[0] < 1 ||
	   m_t_devprop.maxGridSize[1] < 1 ||
	   m_t_devprop.maxGridSize[2] < 1)
		return false;
	// device properties not loaded; would divide by zero below

	if(!b_ProblemFitsAtOnce(n_width, n_height, n_depth))
		return false;
	// @todo - handle subdivided problems too

	// @todo - optimize block dimensions to approach m_t_devprop.maxThreadsPerBlock as closely as possible (that is good thing to do, right?)

	const int p_problem_size[3] = {n_width, n_height, n_depth};
	for(int i = 0; i < 3; ++ i) {
		const int n_block_size = (p_problem_size[i] - 1) / m_t_devprop.maxGridSize[i] + 1;
		// calculate block dimensions (lower bound); rounds up without adding
		// the (possibly very large) grid size to the problem size, so it can not overflow

		p_block_size[i] = n_block_size;
		p_grid_size[i] = (p_problem_size[i] - 1) / n_block_size + 1;
		// store block sizes, and grid size (rounded up; block size is at least 1 here)
	}

	return true;
}

/*
 *								=== ~CCuDeviceParams ===
 */

/*
 *								=== CCuArgLoaderHelper ===
 */

/**
 *	@brief checks a driver result code, terminates the process on failure
 *
 *	If the code is not CUDA_SUCCESS, a message with the code, source file and line
 *	is written to stderr and the process exits with status -1.
 *
 *	@param[in] n_error_code is the result code to be checked
 *	@param[in] p_s_file is the source file name to be printed in the message
 *	@param[in] n_line is the source line number to be printed in the message
 *
 *	@return Returns n_error_code (only ever returns on CUDA_SUCCESS).
 */
CUresult CCuArgLoaderHelper::__SafeCall(CUresult n_error_code, const char *p_s_file, int n_line)
{
	if(n_error_code == CUDA_SUCCESS)
		return n_error_code;
	// success, just pass the code through

	fprintf(stderr, "error: cuParamSet*() failed : error code %04d : "
		"file \'%s\', line %d.\n", n_error_code, p_s_file, n_line);
	exit(-1);
	// report the failure and terminate

	return n_error_code; // not reached, exit() does not return
}

/*
 *								=== ~CCuArgLoaderHelper ===
 */
