/**
 *	@file GPU_Allocator.h
 *	@brief GPU term vector allocator implementation
 *	@author -tHE SWINe-
 *	@date 2010-12-06
 *	@note This is not a standalone, reusable file; it is an include-once file for this particular application,
 *		intended to reduce clutter in ProcessTermVectors/Main.cpp.
 */

/**
 *	@def __GPU_ALLOCATOR_SANITY_CHECKS
 *	@brief if defined, each chunk allocation plan is verified: all the terms of the chunk
 *		must be allocated and no two of them may share a GPU slot
 *	@note This is more of a debug check; the allocator reports errors without it as well.
 *		Enabling it slows the planning down and takes some extra memory (one table
 *		the size of the GPU pool per planned chunk). Violations are only printed
 *		to stderr, they do not change the result of the planning.
 */
#define __GPU_ALLOCATOR_SANITY_CHECKS

/**
 *	@brief term vector allocator for GPU
 *
 *	Keeps a per-term table of remaining reference counts and GPU slot assignments
 *	and, for every chunk of work, plans which terms need to be placed to which GPU slots
 *	and which terms need to leave them. The planning functions only update the internal
 *	tables and fill the allocation / vacation lists; acting on those lists is up to the caller.
 *
 *	@note Unallocated terms have GPU id of uint32_t(-1) in both of the tables.
 */
class CGPUTermVectorAllocator {
public:
	/**
	 *	@brief structure, containing term vector GPU / CPU allocation information
	 *	@note this structure doesn't store term id, it is meant to be allocated in a vector and indexed by the term id's
	 */
	struct TTermVectorInfo {
		//bool b_nonzero; /**< @brief does it contain anything else than nulls? */
		size_t n_remaining_refs; /**< @brief number of times this term will occur yet, or SIZE_MAX for 'infinite' */
		uint32_t n_gpu_id; /**< @brief id of the vector on the gpu, or uint32_t(-1) if it's not allocated on the GPU */
		//int32_t *p_vector; /**< @brief pointer to term vector data, or null if it's not alocated yet @note This vector is not normalized. */

		/**
		 *	@brief default constructor; has no effect
		 *	@note The members are left uninitialized, they must be assigned before use.
		 */
		inline TTermVectorInfo()
		{}

		/**
		 *	@brief constructor; initializes vector information for an unallocated and unreferenced term
		 *	@param[in] n_ref_count is number of remaining references, or SIZE_MAX if this value is unknown
		 */
		inline TTermVectorInfo(size_t n_ref_count)
			:/*b_nonzero(false),*/ n_remaining_refs(n_ref_count), n_gpu_id(uint32_t(-1))/*, p_vector(0)*/
		{}

		/**
		 *	@brief utility function for constructing list of TTermVectorInfo from list of CLuceneIndexReaderInterface terms
		 *	@param[in] r_term_freq is pair of term name and it's sum of frequencies in all the documents to be processed
		 *	@return Returns term vector info for the specified term.
		 *	@note The space in "< ::wstring" is intentional, "<::" would be read
		 *		as a digraph by pre-C++11 compilers.
		 */
		static inline TTermVectorInfo t_TermToVecInfo(const std::pair< ::wstring, size_t> &r_term_freq)
		{
			_ASSERTE(r_term_freq.second > 0); // this really shouldn't be zero
			return TTermVectorInfo(r_term_freq.second);
		}
	};

	/**
	 *	@brief structure with information about term allocation to the GPU
	 */
	struct TTermAllocation {
		uint32_t n_GPU_slot; /**< index of the GPU slot allocated to hold the term */
		term_id_t n_term_id; /**< id of the term, being allocated to that slot */

		/**
		 *	@brief default constructor; has no effect
		 *	@note The members are left uninitialized.
		 */
		inline TTermAllocation()
		{}

		/**
		 *	@brief constructor
		 *
		 *	@param[in] _n_GPU_slot is zero-based index of the GPU slot allocated to hold the term
		 *	@param[in] _n_term_id is id of the term, being allocated to that slot
		 */
		inline TTermAllocation(uint32_t _n_GPU_slot, term_id_t _n_term_id)
			:n_GPU_slot(_n_GPU_slot), n_term_id(_n_term_id)
		{}
	};

	/**
	 *	@brief structure with information about term de-allocation from the GPU
	 */
	struct TTermVacation {
		uint32_t n_GPU_slot; /**< index of the GPU slot to vacate */
		term_id_t n_term_id; /**< id of the term, currently residing in that slot */
		bool b_scrub; /**< set if the term is not going to be referenced again */

		/**
		 *	@brief default constructor; has no effect
		 *	@note The members are left uninitialized.
		 */
		inline TTermVacation()
		{}

		/**
		 *	@brief constructor
		 *
		 *	@param[in] _n_GPU_slot is index of the GPU slot to vacate
		 *	@param[in] _n_term_id is id of the term, currently residing in that slot
		 *	@param[in] _b_scrub is set if the term is not going to be referenced again
		 */
		inline TTermVacation(uint32_t _n_GPU_slot, term_id_t _n_term_id, bool _b_scrub)
			:n_GPU_slot(_n_GPU_slot), n_term_id(_n_term_id), b_scrub(_b_scrub)
		{}
	};

protected:
	CLRU_K_AllocationPolicy<2, term_id_t, /*CLuceneIndexReaderInterface::*/term_Dummy> gpu_vec_allocator; /**< @brief caching policy, deciding which GPU slots to (re)use */
	// 2 is slightly better than 3 and even better than 1

	std::vector<TTermVectorInfo> term_vector_allocation_table; /**< @brief allocation information, indexed by term id */
	std::vector<uint32_t> GPU_mapping_table; // table, mapping term id's to GPU addresses (or -1 if not allocated) // @note this is just a copy of TTermVectorInfo::n_gpu_id

	size_t m_n_chunk_size; /**< @brief chunk size, as passed to the constructor */
	size_t m_n_gpu_pool_size; /**< @brief number of slots in the GPU pool, as passed to the constructor */
	size_t m_n_chunk; /**< @brief number of chunks planned so far */

public:
	/**
	 *	@brief constructor; initializes the GPU pool allocator and builds the term tables
	 *
	 *	@param[in] n_gpu_pool_size is number of term vector slots in the GPU pool
	 *	@param[in] n_chunk_size is size of a single chunk (also used to advance allocator time between chunks)
	 *	@param[in] r_term_list is list of (term name, total frequency) pairs; position in the list is the term id
	 *
	 *	@note This reports failure by leaving the GPU mapping table empty, call b_Status()
	 *		to see whether the constructor succeeded. As a consequence, an empty term list
	 *		is indistinguishable from a failure.
	 */
	CGPUTermVectorAllocator(size_t n_gpu_pool_size, size_t n_chunk_size,
		const std::vector<std::pair< ::wstring, size_t> > &r_term_list)
		:gpu_vec_allocator(n_gpu_pool_size, n_chunk_size), m_n_chunk_size(n_chunk_size),
		m_n_gpu_pool_size(n_gpu_pool_size), m_n_chunk(0)
	{
		// initialize the GPU pool allocator

		if(!stl_ut::Resize_To_N(term_vector_allocation_table, r_term_list.size()) ||
		   !stl_ut::Resize_To_N(GPU_mapping_table, r_term_list.size(), -1)) {
			GPU_mapping_table.clear(); // to mark error
			return;
		}
		std::transform(r_term_list.begin(), r_term_list.end(),
			term_vector_allocation_table.begin(), TTermVectorInfo::t_TermToVecInfo);
		// build the term vector allocation table, allocate the mapping table
	}

	/**
	 *	@brief gets constructor status
	 *	@return Returns true if the GPU mapping table is not empty, otherwise returns false
	 *		(the constructor failed to allocate the tables, or the term list was empty).
	 */
	bool b_Status() const
	{
		return !GPU_mapping_table.empty();
	}

	/**
	 *	@brief gets the table, mapping term id's to GPU slots
	 *	@return Returns const reference to the mapping table; entries
	 *		of unallocated terms contain uint32_t(-1).
	 */
	inline const std::vector<uint32_t> &r_GPU_Mapping_Table() const
	{
		return GPU_mapping_table;
	}

	/**
	 *	@brief plans GPU slot allocations (and the vacations they cause) for one chunk
	 *
	 *	Decrements reference counters of the terms in the chunk, re-references the terms
	 *	that are already on the GPU so they stay there, and then allocates slots for the remaining
	 *	terms, recording each evicted term in the vacation list.
	 *
	 *	@param[in] r_occ_list is list of terms occuring in the chunk, each with a non-empty list of its occurences
	 *	@param[in,out] GPU_vacation_list is list of slot vacations; new vacations are appended to it
	 *	@param[in,out] GPU_allocation_list is list of slot allocations; new allocations are appended to it
	 *
	 *	@return Returns true on success, false on failure (allocator time overflow, more terms
	 *		in the chunk than GPU slots, caching policy failure or not enough memory for the lists).
	 *
	 *	@note On failure, the internal tables and the output lists may be left partially updated.
	 */
	bool Plan_ChunkAllocations(const std::vector<CDocStreamSplitter::TTermOccurence> &r_occ_list,
		std::vector<TTermVacation> &GPU_vacation_list, std::vector<TTermAllocation> &GPU_allocation_list)
	{
		if(m_n_chunk == SIZE_MAX || (m_n_chunk_size && m_n_chunk >= SIZE_MAX / m_n_chunk_size))
			return false; // allocator time overflow // @todo - handle this (how? throw away all the history informaion and start from zero? use threshold?)
		// note the chunk size is tested for zero, to avoid division by zero (zero-sized chunks can't overflow)
		size_t n_term_off = 1 + m_n_chunk * m_n_chunk_size;
		++ m_n_chunk;
		// calclate number of processed terms

		if(r_occ_list.size() > m_n_gpu_pool_size) {
			fprintf(stderr, "fatal error: term occurence list size exceeds gpu pool size (%lu)\n",
				static_cast<unsigned long>(r_occ_list.size())); // size_t must not be passed as %d
			return false;
		}
		// this can't be possibly ever allocated

		for(size_t i = 0, n = r_occ_list.size(); i < n; ++ i) {
			const CDocStreamSplitter::TTermOccurence &r_occ = r_occ_list[i];
			term_id_t n_term = r_occ.first;
			size_t n_freq = r_occ.second.size();
			_ASSERTE(!r_occ.second.empty());
			size_t n_first_occ = r_occ.second.front();
			// get term and it's frequency in the chunk

			_ASSERTE(n_term >= 0 && n_term < term_vector_allocation_table.size());
			TTermVectorInfo &r_t_alloc = term_vector_allocation_table[n_term];
			// get term information

			_ASSERTE(r_t_alloc.n_remaining_refs >= n_freq); // make sure the reference counting works
			if(r_t_alloc.n_remaining_refs != SIZE_MAX)
				r_t_alloc.n_remaining_refs -= n_freq;
			// decrement term reference counter (SIZE_MAX means 'infinite' and is never decremented)

			if(r_t_alloc.n_gpu_id != uint32_t(-1)) {
				_ASSERTE(GPU_mapping_table[n_term] == r_t_alloc.n_gpu_id); // this must match (note 0 is no longer reserved for the dummy term)
				if(!gpu_vec_allocator.ReReferencePage(n_term_off + n_first_occ, n_term, size_t(r_t_alloc.n_gpu_id)))
					return false;
				// make sure the term stays allocated for this round
			}
		}
		// decrement term frequency counters first, notify the caching policy of the vectors that need to remain allocated

		for(size_t i = 0, n = r_occ_list.size(); i < n; ++ i) {
			const CDocStreamSplitter::TTermOccurence &r_occ = r_occ_list[i];
			term_id_t n_term = r_occ.first;
			_ASSERTE(!r_occ.second.empty());
			size_t n_first_occ = r_occ.second.front();
			// get term and it's first occurence in the chunk

			_ASSERTE(n_term >= 0 && n_term < term_vector_allocation_table.size());
			TTermVectorInfo &r_t_alloc = term_vector_allocation_table[n_term];
			// get term information

			if(r_t_alloc.n_gpu_id != uint32_t(-1))
				continue;
			// the term already resides on the GPU, nothing to plan

			size_t n_allocated_GPU_slot = size_t(-1);
			term_id_t n_vacated_term;
			if(!gpu_vec_allocator.AllocatePageEx(n_term_off + n_first_occ, n_term,
			   n_allocated_GPU_slot, n_term_off, n_vacated_term))
				return false;
			_ASSERTE(n_allocated_GPU_slot < UINT32_MAX); // "<", not "<=" !!
			// the term needs to be allocated on the GPU side

			if(n_vacated_term != term_Dummy) {
				const bool b_final_vacation = !term_vector_allocation_table[n_vacated_term].n_remaining_refs;
				// is it final deallocation?

				if(b_final_vacation) {
					// NOTE(review): the other DropPage() call sites pass the slot the term
					// resides in, this one passes -1 - confirm against the DropPage() contract
					gpu_vec_allocator.DropPage(n_vacated_term, -1, true);
					// tell the allocator drop the page permanently
				}

				if(!stl_ut::Resize_Add_1More(GPU_vacation_list,
				   TTermVacation(uint32_t(n_allocated_GPU_slot), n_vacated_term, b_final_vacation)))
					return false;
				// add it to the vacation list

				GPU_mapping_table[n_vacated_term] = uint32_t(-1);
				term_vector_allocation_table[n_vacated_term].n_gpu_id = uint32_t(-1);
				// mark it as unallocated
			}
			// in case a vacation occured because of this, add it to the list

			if(!stl_ut::Resize_Add_1More(GPU_allocation_list,
			   TTermAllocation(uint32_t(n_allocated_GPU_slot), n_term)))
				return false;
			// add this item to the allocation list

			r_t_alloc.n_gpu_id = uint32_t(n_allocated_GPU_slot);
			GPU_mapping_table[n_term] = uint32_t(n_allocated_GPU_slot); // 0 is no longer reserved for the dummy term
			// mark the allocation in the tables
		}
		// go trough the terms and allocate them on the GPU as needed

#ifdef __GPU_ALLOCATOR_SANITY_CHECKS
		{
			std::vector<term_id_t> slot_usage_vector;
			slot_usage_vector.resize(m_n_gpu_pool_size, term_Dummy);
			// for each GPU slot, the term of this chunk that occupies it (or term_Dummy)

			for(size_t i = 0, n = r_occ_list.size(); i < n; ++ i) {
				const CDocStreamSplitter::TTermOccurence &r_occ = r_occ_list[i];
				term_id_t n_term = r_occ.first;
				_ASSERTE(!r_occ.second.empty());
				// get term

				_ASSERTE(n_term >= 0 && n_term < term_vector_allocation_table.size());
				const TTermVectorInfo &r_t_alloc = term_vector_allocation_table[n_term];
				// get term information

				if(r_t_alloc.n_gpu_id == uint32_t(-1)) {
					fprintf(stderr, "error: allocator failure: term %lu not allocated\n",
						static_cast<unsigned long>(n_term));
				} else if(r_t_alloc.n_gpu_id >= slot_usage_vector.size()) {
					fprintf(stderr, "error: allocator failure: term %lu has GPU slot %lu outside of the pool\n",
						static_cast<unsigned long>(n_term), static_cast<unsigned long>(r_t_alloc.n_gpu_id));
					// don't index the usage vector out of bounds
				} else if(slot_usage_vector[r_t_alloc.n_gpu_id] != term_Dummy) {
					fprintf(stderr, "error: allocator failure: term %lu colliding with term %lu\n",
						static_cast<unsigned long>(n_term),
						static_cast<unsigned long>(slot_usage_vector[r_t_alloc.n_gpu_id]));
				} else
					slot_usage_vector[r_t_alloc.n_gpu_id] = n_term;
				// check allocation, look for collisions
			}
		}
		// make sure all the required terms are allocated and there are no collisions
		// (the problems are only reported to stderr, the function still succeeds)
#endif //__GPU_ALLOCATOR_SANITY_CHECKS

		return true;
	}

	/**
	 *	@brief plans GPU slot deallocations after a chunk was processed
	 *
	 *	@param[in] r_occ_list is list of terms that occured in the chunk
	 *	@param[in,out] GPU_leave_list is list of slot vacations; new vacations are appended to it
	 *	@param[in] b_lazy_gpu_deallocation is lazy deallocation flag; if not set, the terms
	 *		of the chunk with no remaining references are scheduled to leave the GPU
	 *	@param[in] b_force_flush is flush flag; if set, all the terms that are still
	 *		allocated on the GPU are scheduled to leave it
	 *
	 *	@return Returns true on success, false on failure (not enough memory for the list).
	 *
	 *	@note On failure, the internal tables and the output list may be left partially updated.
	 */
	bool Plan_PostChunkDeallocations(const std::vector<CDocStreamSplitter::TTermOccurence> &r_occ_list,
		std::vector<TTermVacation> &GPU_leave_list, bool b_lazy_gpu_deallocation, bool b_force_flush)
	{
		if(!b_lazy_gpu_deallocation) {
			for(size_t i = 0, n = r_occ_list.size(); i < n; ++ i) {
				const CDocStreamSplitter::TTermOccurence &r_occ = r_occ_list[i];
				term_id_t n_term = r_occ.first;
				// get term

				_ASSERTE(n_term >= 0 && n_term < term_vector_allocation_table.size());
				TTermVectorInfo &r_t_alloc = term_vector_allocation_table[n_term];
				// get term information

				if(!r_t_alloc.n_remaining_refs) {
					_ASSERTE(r_t_alloc.n_gpu_id != uint32_t(-1)); // it better be allocated
					gpu_vec_allocator.DropPage(n_term, r_t_alloc.n_gpu_id, true);
					// tell the allocator drop the page permanently

					if(!stl_ut::Resize_Add_1More(GPU_leave_list,
					   TTermVacation(r_t_alloc.n_gpu_id, n_term, !r_t_alloc.n_remaining_refs)))
						return false;
					// add this term to the GPU leave list

					GPU_mapping_table[n_term] = uint32_t(-1);
					r_t_alloc.n_gpu_id = uint32_t(-1);
					// mark it as unallocated
				}
			}
		}
		// schedule terms referenced for the last time to deallocate (if not lazy)

		if(b_force_flush) {
			for(size_t i = 0, n = term_vector_allocation_table.size(); i < n; ++ i) {
				TTermVectorInfo &r_t_alloc = term_vector_allocation_table[i];
				term_id_t n_term = term_id_t(i);
				// get term information

				if(r_t_alloc.n_gpu_id != uint32_t(-1)) {
					_ASSERTE(!r_t_alloc.n_remaining_refs || r_t_alloc.n_remaining_refs == SIZE_MAX); // reference counters should be null at the end
					gpu_vec_allocator.DropPage(n_term, r_t_alloc.n_gpu_id, true);
					// tell the allocator drop the page permanently

					if(!stl_ut::Resize_Add_1More(GPU_leave_list,
					   TTermVacation(r_t_alloc.n_gpu_id, n_term, !r_t_alloc.n_remaining_refs)))
						return false;
					// add this term to the GPU leave list (terms with 'infinite' reference count are not scrubbed)

					GPU_mapping_table[n_term] = uint32_t(-1);
					r_t_alloc.n_gpu_id = uint32_t(-1);
					// mark it as unallocated
				}
			}
		}
		// schedule terms left in the pool after the last chunk to deallocate

		return true;
	}
};
