/*
								+----------------------------------+
								|                                  |
								|   ***  Unicode conversion  ***   |
								|                                  |
								|   Copyright  -tHE SWINe- 2008   |
								|                                  |
								|            UniConv.h             |
								|                                  |
								+----------------------------------+
*/

/*
 *	2009-07-09
 *
 *	fixed unicode mappings url (http://www.unicode.org/Public/MAPPINGS/)
 *
 *	added more complete list of conversions between 8-bit, UTF-8, UTF-16
 *	(LE or BE) and UTF-32 to CUnicodeConversion
 *
 *	added alias CUniConv for CUnicodeConversion
 *
 *	2009-09-13
 *
 *	changed CUnicodeMapping::TCharacterMapping::n_character to unsigned (signed
 *	caused most encodings to fail working with character codes above 128,
 *	CUnicodeMapping::n_FromUnicode() and CUnicodeMapping::FromUnicode() functions
 *	were affected by this change)
 *
 *	2009-10-11
 *
 *	changed type of input data from const uint8_t* to const void* in some of
 *	CUniConv routines (convenience, do not have to type-cast anymore). functionality
 *	remains unchanged.
 *
 *	2009-10-20
 *
 *	fixed some warnings when compiling under VC 2005, implemented "Security
 *	Enhancements in the CRT" for VC 2008. compare against MyProjects_2009-10-19_
 *
 *	@date 2010-11-05
 *
 *	Changed string size parameters type in CUniConv functions from int to size_t
 *	(64-bit compatibility).
 *
 */

#ifndef __UNICODE_CONVERSION_INCLUDED
#define __UNICODE_CONVERSION_INCLUDED

#include "Integer.h"
#include <string>

class CUnicodeMapping;
// forward declaration, needed by the n_Decode_*() overloads taking a charset mapping

/*
 *	class CUniConv
 *		- offers conversion between different unicode representations,
 *		  specifically between UTF-8, UTF-16 and UTF-32, and conversion
 *		  between those and 8-bit charsets
 *		- all member functions are static, there is no per-object state
 *		- note string lengths for UTF-8 and UTF-16 are in bytes, as these
 *		  encodings have variable code size. UTF-32 string lengths are
 *		  in *characters*, every UTF-32 character being exactly four bytes.
 *		  lengths of 8-bit charset strings are in characters as well, but there
 *		  the length in characters equals the length in bytes, so the terms
 *		  are interchangeable.
 *		  (in the end, this only concerns UTF32_to_UTF8 and UTF32_to_UTF16
 *		  as those are the only functions to get UTF-32 string length as input)
 *		- NOTE(review): the decoding functions return the number of bytes read
 *		  as int while taking size_t sizes; presumably inputs larger than
 *		  INT_MAX bytes are not supported - confirm against the implementation
 */
class CUniConv {
public:
	/*
	 *	static int CUniConv::n_Decode_UTF16(const void *p_data, size_t n_size,
	 *		std::string &r_s_string, bool b_allow_bom, bool b_expect_little_endian = true)
	 *		- decodes n_size bytes of UTF-16 encoded string in buffer p_data
	 *		  to us-ascii (codes above 255 are replaced by '?'), the result is
	 *		  stored in r_s_string
	 *		- if b_allow_bom is set, BOM (byte-order-mark) is expected
	 *		- in case BOM is disabled or not present, the value of
	 *		  b_expect_little_endian decides whether little or big endian is used
	 *		- note two consecutive null characters are considered end of the string
	 *		  (they are counted as read, but are not output to r_s_string)
	 *		- returns number of bytes read from the buffer or -1 on error
	 */
	static int n_Decode_UTF16(const void *p_data, size_t n_size, std::string &r_s_string,
		bool b_allow_bom, bool b_expect_little_endian = true);

	/*
	 *	static int CUniConv::n_Decode_UTF16(const void *p_data, size_t n_size,
	 *		std::string &r_s_string, const CUnicodeMapping &r_map, bool b_allow_bom,
	 *		bool b_expect_little_endian = true)
	 *		- decodes n_size bytes of UTF-16 encoded string in buffer p_data
	 *		  to 8-bit charset given by mapping r_map, the result is stored in r_s_string
	 *		- in case there is no mapping of a unicode character to the 8-bit charset,
	 *		  the behavior depends on the substitute character set in r_map: if it's
	 *		  negative (default), the function fails, otherwise the substitute
	 *		  character is used instead
	 *		- if b_allow_bom is set, BOM (byte-order-mark) is expected
	 *		- in case BOM is disabled or not present, the value of
	 *		  b_expect_little_endian decides whether little or big endian is used
	 *		- note two consecutive null characters are considered end of the string
	 *		  (they are counted as read, but are not output to r_s_string)
	 *		- returns number of bytes read from the buffer or -1 on error
	 */
	static int n_Decode_UTF16(const void *p_data, size_t n_size, std::string &r_s_string,
		const CUnicodeMapping &r_map, bool b_allow_bom, bool b_expect_little_endian = true);

	/*
	 *	static int CUniConv::n_Decode_UTF8(const void *p_data, size_t n_size,
	 *		std::string &r_s_string, bool b_allow_bom)
	 *		- decodes n_size bytes of UTF-8 encoded string in buffer p_data
	 *		  to us-ascii (codes above 255 are replaced by '?'), the result is
	 *		  stored in r_s_string
	 *		- if b_allow_bom is set, BOM (byte-order-mark) is expected
	 *		- note null character is considered explicit end of the string
	 *		  (it is counted as read, but is not part of r_s_string)
	 *		- returns number of bytes read from the buffer or -1 on error
	 */
	static int n_Decode_UTF8(const void *p_data, size_t n_size,
		std::string &r_s_string, bool b_allow_bom);

	/*
	 *	static int CUniConv::n_Decode_UTF8(const void *p_data, size_t n_size,
	 *		std::string &r_s_string, const CUnicodeMapping &r_map, bool b_allow_bom)
	 *		- decodes n_size bytes of UTF-8 encoded string in buffer p_data
	 *		  to 8-bit charset given by mapping r_map, the result is stored in r_s_string
	 *		- in case there is no mapping of a unicode character to the 8-bit charset,
	 *		  the behavior depends on the substitute character set in r_map: if it's
	 *		  negative (default), the function fails, otherwise the substitute
	 *		  character is used instead
	 *		- if b_allow_bom is set, BOM (byte-order-mark) is expected
	 *		- note null character is considered explicit end of the string
	 *		  (it is counted as read, but is not part of r_s_string)
	 *		- returns number of bytes read from the buffer or -1 on error
	 */
	static int n_Decode_UTF8(const void *p_data, size_t n_size,
		std::string &r_s_string, const CUnicodeMapping &r_map, bool b_allow_bom);

	/*
	 *	static int CUniConv::n_UTF16_LE_Char_Size(uint8_t n_first_byte,
	 *		uint8_t n_second_byte)
	 *		- returns size of UTF-16 (little endian) character based on its first
	 *		  two bytes n_first_byte, n_second_byte. the result is in bytes and
	 *		  is either 2 or 4 (surrogate pair)
	 *		- returns -1 on failure (low surrogate)
	 *		- note actually only the second byte is required, but that should be
	 *		  optimized-away in inline expansion of the function
	 */
	static int n_UTF16_LE_Char_Size(uint8_t n_first_byte, uint8_t n_second_byte);

	/*
	 *	static int CUniConv::n_UTF16_BE_Char_Size(uint8_t n_first_byte,
	 *		uint8_t n_second_byte)
	 *		- returns size of UTF-16 (big endian) character based on its first
	 *		  two bytes n_first_byte, n_second_byte. the result is in bytes and
	 *		  is either 2 or 4 (surrogate pair)
	 *		- returns -1 on failure (low surrogate)
	 *		- note actually only the first byte is required, but that should be
	 *		  optimized-away in inline expansion of the function
	 */
	static int n_UTF16_BE_Char_Size(uint8_t n_first_byte, uint8_t n_second_byte);

	/*
	 *	static int CUniConv::n_UTF16_LE_Code(const void *p_data,
	 *		size_t n_size, int &r_n_read)
	 *		- decodes a single UTF-16 (little endian) character
	 *		- p_data is buffer with n_size bytes of UTF-16 data
	 *		- r_n_read will contain number of bytes read from input
	 *		  buffer upon function return
	 *		- returns character code (UTF-32) on success, -1 on failure
	 *		- note this would interpret BOM as ordinary character code
	 */
	static int n_UTF16_LE_Code(const void *p_data, size_t n_size, int &r_n_read);

	/*
	 *	static int CUniConv::n_UTF16_BE_Code(const void *p_data,
	 *		size_t n_size, int &r_n_read)
	 *		- decodes a single UTF-16 (big endian) character
	 *		- p_data is buffer with n_size bytes of UTF-16 data
	 *		- r_n_read will contain number of bytes read from input
	 *		  buffer upon function return
	 *		- returns character code (UTF-32) on success, -1 on failure
	 *		- note this would interpret BOM as ordinary character code
	 */
	static int n_UTF16_BE_Code(const void *p_data, size_t n_size, int &r_n_read);

	/*
	 *	static int CUniConv::n_UTF16_to_UTF32(const void *p_data,
	 *		size_t n_size, std::basic_string<int> &r_s_string, bool b_allow_bom,
	 *		bool b_expect_little_endian = true)
	 *		- decodes n_size bytes of UTF-16 encoded string in buffer p_data
	 *		  to plain character codes (UTF-32), the result is stored in r_s_string
	 *		- if b_allow_bom is set, BOM (byte-order-mark) is expected
	 *		- in case BOM is disabled or not present, the value of
	 *		  b_expect_little_endian decides whether little or big endian is used
	 *		- note two consecutive null characters are considered end of the string
	 *		  (they are counted as read, but are not output to r_s_string)
	 *		- returns number of bytes read from the buffer or -1 on error
	 */
	static int n_UTF16_to_UTF32(const void *p_data, size_t n_size,
		std::basic_string<int> &r_s_string, bool b_allow_bom,
		bool b_expect_little_endian = true);

	/*
	 *	static int CUniConv::n_UTF8_Char_Size(uint8_t n_first_byte)
	 *		- returns size of UTF-8 character based on its first byte n_first_byte,
	 *		  the size is in bytes and includes the first byte (values range 1 to 4)
	 *		- returns -1 on failure (invalid UTF-8 character)
	 */
	static int n_UTF8_Char_Size(uint8_t n_first_byte);

	/*
	 *	static int CUniConv::n_UTF8_Code(const void *p_data,
	 *		size_t n_size, int &r_n_read)
	 *		- decodes a single UTF-8 character
	 *		- p_data is buffer with n_size bytes of UTF-8 data
	 *		- r_n_read will contain number of bytes read from input
	 *		  buffer upon function return
	 *		- returns character code (UTF-32) on success, -1 on failure
	 *		- note this doesn't allow UTF-16 surrogates (character range 0xd800 to 0xdfff)
	 *		  or characters above 0x10ffff (returns -1 instead)
	 */
	static int n_UTF8_Code(const void *p_data, size_t n_size, int &r_n_read);

	/*
	 *	static int CUniConv::n_UTF8_to_UTF32(const void *p_data,
	 *		size_t n_size, std::basic_string<int> &r_s_string, bool b_allow_bom)
	 *		- decodes n_size bytes of UTF-8 encoded string in buffer p_data
	 *		  to plain character codes (UTF-32), the result is stored in r_s_string
	 *		- if b_allow_bom is set, BOM (byte-order-mark) is expected
	 *		- note null character is considered explicit end of the string
	 *		  (it is counted as read, but is not part of r_s_string)
	 *		- returns number of bytes read from the buffer or -1 on error
	 */
	static int n_UTF8_to_UTF32(const void *p_data,
		size_t n_size, std::basic_string<int> &r_s_string, bool b_allow_bom);

	/*
	 *	static bool CUniConv::UTF32_to_UTF8(const int *p_data, size_t n_size,
	 *		std::string &r_s_string, bool b_use_bom)
	 *		- encodes plain unicode characters (UTF-32) in p_data as UTF-8
	 *		- n_size is length of the string in p_data, in characters
	 *		  (the string may be null-terminated)
	 *		- output is returned in r_s_string
	 *		- setting b_use_bom causes UTF-8 BOM being present in output
	 *		- returns true on success, false on failure (not enough memory, or invalid chars)
	 *		- note it's possible to pass size_t(-1) (the greatest size_t value) as n_size
	 *		  in case p_data contains null-terminated string; the loop breaks either
	 *		  on the zero character or after decrementing the size down to zero
	 */
	static bool UTF32_to_UTF8(const int *p_data, size_t n_size,
		std::string &r_s_string, bool b_use_bom);

	/*
	 *	static bool CUniConv::Encode_UTF8(const char *p_data, size_t n_size,
	 *		const int *p_mapping_table, std::string &r_s_string, bool b_use_bom)
	 *		- encodes generic 8-bit encoded characters in p_data as UTF-8
	 *		- n_size is length of the string in p_data (the string may be null-terminated)
	 *		- p_mapping_table is table with 256 entries, one for each 8-bit code, containing
	 *		  corresponding UTF-32 character, or negative number for undefined characters
	 *		  (note entry with index 0 is always ignored, 8-bit char 0 is terminating zero)
	 *		- setting b_use_bom causes UTF-8 BOM being present in output
	 *		- output is returned in r_s_string
	 *		- returns true on success, false on failure (not enough memory, or invalid chars)
	 *		- note it's possible to pass size_t(-1) (the greatest size_t value) as n_size
	 *		  in case p_data contains null-terminated string; the loop breaks either
	 *		  on the zero character or after decrementing the size down to zero
	 */
	static bool Encode_UTF8(const char *p_data, size_t n_size,
		const int *p_mapping_table, std::string &r_s_string, bool b_use_bom);

	/*
	 *	static inline unsigned short n_HiLoSwap(unsigned short n_code)
	 *		- swaps high and low byte of the 16-bit word n_code and returns the result
	 *		  (conversion between big and little endian for UTF-16)
	 *		- only the low 16 bits of n_code are used; the bytes are masked and the
	 *		  result is cast explicitly, so there is no implicit narrowing
	 *		  of the (int-promoted) intermediate value
	 */
	static inline unsigned short n_HiLoSwap(unsigned short n_code)
	{
		const unsigned int n_high_byte = (n_code >> 8) & 0xffu;
		const unsigned int n_low_byte = n_code & 0xffu;
		return static_cast<unsigned short>((n_low_byte << 8) | n_high_byte); // xchg al, ah
	}

	/*
	 *	static bool CUniConv::UTF32_to_UTF16(const int *p_data, size_t n_size,
	 *		std::basic_string<unsigned short> &r_s_string,
	 *		bool b_use_bom = false, bool b_little_endian = true)
	 *		- encodes plain unicode characters (UTF-32) in p_data as UTF-16
	 *		- n_size is length of the string in p_data, in characters
	 *		  (the string may be null-terminated)
	 *		- output is returned in r_s_string
	 *		- b_use_bom decides whether to include byte-order mark in the output
	 *		- b_little_endian decides whether to encode as UTF-16 LE (true), or BE (false)
	 *		- returns true on success, false on failure (not enough memory, or invalid chars)
	 *		- note it's possible to pass size_t(-1) (the greatest size_t value) as n_size
	 *		  in case p_data contains null-terminated string; the loop breaks either
	 *		  on the zero character or after decrementing the size down to zero
	 */
	static bool UTF32_to_UTF16(const int *p_data, size_t n_size,
		std::basic_string<unsigned short> &r_s_string,
		bool b_use_bom = false, bool b_little_endian = true);

	/*
	 *	static bool CUniConv::Encode_UTF16(const char *p_data, size_t n_size,
	 *		const int *p_mapping_table, std::basic_string<unsigned short> &r_s_string,
	 *		bool b_use_bom = false, bool b_little_endian = true)
	 *		- encodes generic 8-bit encoded characters in p_data as UTF-16
	 *		- n_size is length of the string in p_data (the string may be null-terminated)
	 *		- p_mapping_table is table with 256 entries, one for each 8-bit code, containing
	 *		  corresponding UTF-32 character, or negative number for undefined characters
	 *		  (note entry with index 0 is always ignored, 8-bit char 0 is terminating zero)
	 *		- output is returned in r_s_string
	 *		- b_use_bom decides whether to include byte-order mark in the output
	 *		- b_little_endian decides whether to encode as UTF-16 LE (true), or BE (false)
	 *		- returns true on success, false on failure (not enough memory, or invalid chars)
	 *		- note it's possible to pass size_t(-1) (the greatest size_t value) as n_size
	 *		  in case p_data contains null-terminated string; the loop breaks either
	 *		  on the zero character or after decrementing the size down to zero
	 */
	static bool Encode_UTF16(const char *p_data, size_t n_size,
		const int *p_mapping_table, std::basic_string<unsigned short> &r_s_string,
		bool b_use_bom = false, bool b_little_endian = true);

	/*
	 *	static int CUniConv::n_UTF16_to_UTF8(const void *p_data, size_t n_size,
	 *		std::string &r_s_string, bool b_use_utf8_bom, bool b_allow_utf16_bom,
	 *		bool b_expect_utf16_little_endian = true)
	 *		- converts n_size bytes of UTF-16 encoded string in buffer p_data
	 *		  to UTF-8, the result is stored in r_s_string
	 *		- setting b_use_utf8_bom causes UTF-8 BOM being present in output
	 *		- if b_allow_utf16_bom is set, BOM (byte-order-mark) is expected in the input
	 *		- in case BOM is disabled or not present, the value of
	 *		  b_expect_utf16_little_endian decides whether little or big endian is used
	 *		- note two consecutive null characters are considered end of the string
	 *		  (they are counted as read, but are not output to r_s_string)
	 *		- returns number of bytes read from the buffer or -1 on error
	 */
	static int n_UTF16_to_UTF8(const void *p_data, size_t n_size, std::string &r_s_string,
		bool b_use_utf8_bom, bool b_allow_utf16_bom,
		bool b_expect_utf16_little_endian = true);

	/*
	 *	static int CUniConv::n_UTF8_to_UTF16(const void *p_data,
	 *		size_t n_size, std::basic_string<unsigned short> &r_s_string,
	 *		bool b_allow_utf8_bom, bool b_include_utf16_bom = false,
	 *		bool b_utf16_little_endian = true)
	 *		- converts n_size bytes of UTF-8 encoded string in buffer p_data
	 *		  to UTF-16, the result is stored in r_s_string
	 *		- note null character is considered explicit end of the string
	 *		  (it is counted as read, but is not part of r_s_string)
	 *		- b_allow_utf8_bom decides whether to accept UTF-8 BOM
	 *		- b_include_utf16_bom decides whether to include byte-order mark in the output
	 *		- b_utf16_little_endian decides whether to encode as UTF-16 LE (true), or BE (false)
	 *		- returns number of bytes read from the buffer or -1 on error
	 */
	static int n_UTF8_to_UTF16(const void *p_data, size_t n_size,
		std::basic_string<unsigned short> &r_s_string, bool b_allow_utf8_bom,
		bool b_include_utf16_bom = false, bool b_utf16_little_endian = true);
};

typedef CUniConv CUnicodeConversion;
// CUnicodeConversion is the older name of the class (CUniConv was introduced
// as its alias, see the 2009-07-09 changelog entry); this typedef is kept to retain
// backward compatibility, but shouldn't be used in new software

/*
 *	class CUnicodeMapping
 *		- unicode to 8-bit charset mapping table
 */
class CUnicodeMapping {
protected:
	int m_p_mapping[256]; // for translation to unicode (UTF-32) (plain array reference)

	/*
	 *	struct TCharacterMapping
	 *		- a single entry of the inverse map; pairs 8-bit character n_character
	 *		  with unicode (UTF-32) character n_unicode
	 *		- converts implicitly to int, yielding n_unicode
	 */
	struct TCharacterMapping {
		unsigned char n_character;
		int n_unicode; // UTF-32
		inline operator int() const { return n_unicode; }
	} m_p_inverse_map[256]; // for translation from unicode (using binary search)
	int m_n_inverse_map_size; // presumably number of valid entries in m_p_inverse_map (set in the implementation file)

	int m_n_subst_char; // substitute character, see n_Set_SubsituteChar()

public:
	/*
	 *	CUnicodeMapping::CUnicodeMapping(const char *p_s_filename, bool b_avoid_accents = false)
	 *		- constructor; loads 8-bit charset mapping table from file p_s_filename
	 *		  (files from http://www.unicode.org/Public/MAPPINGS/, Table format: Format A)
	 *		- if b_avoid_accents is set, latin accent characters are replaced by
	 *		  ordinary ones. this relies on comments in the file, such as:
	 *			0xC1	0x00C1	# LATIN CAPITAL LETTER A WITH ACUTE
	 *		  (then the unicode character 0x00C1 will be replaced with 'A'),
	 *		  note this only affects conversion of 8-bit strings from unicode, not to unicode
	 *		- it's recommended to call b_Status() to see if constructor succeeded
	 *		- note the constructor is not explicit, so a const char* converts
	 *		  implicitly to CUnicodeMapping
	 */
	CUnicodeMapping(const char *p_s_filename, bool b_avoid_accents = false);

	/*
	 *	bool CUnicodeMapping::b_Status() const
	 *		- returns true if constructor succeeded, otherwise returns false
	 *		- note functions below are designed to work, even if constructor
	 *		  failed (will not cause access violation / etc.)
	 */
	bool b_Status() const;

	/*
	 *	int CUnicodeMapping::n_FromUnicode(int n_unicode) const
	 *		- translates unicode character n_unicode (UTF-32) to 8-bit charset
	 *		- in case given character cannot be represented, substitute character
	 *		  is used instead (default -1, can be set using n_Set_SubsituteChar())
	 *		- returns 8-bit representation of (UTF-32) unicode character n_unicode
	 */
	int n_FromUnicode(int n_unicode) const;

	/*
	 *	int CUnicodeMapping::n_FromUnicode(int n_unicode, int n_substitute) const
	 *		- translates unicode character n_unicode (UTF-32) to 8-bit charset
	 *		- in case given character cannot be represented, n_substitute is used instead
	 *		- returns 8-bit representation of (UTF-32) unicode character n_unicode
	 */
	int n_FromUnicode(int n_unicode, int n_substitute) const;

	/*
	 *	int CUnicodeMapping::n_Set_SubsituteChar(int n_substitute)
	 *		- sets substitute character for conversion from unicode
	 *		  to 8-bit charset to n_substitute
	 *		- returns the former substitute character
	 *		- note setting -1 as substitute character causes conversion routines to fail
	 *		  when there's no conversion for a particular character (default)
	 *		- note setting '?' as substitute character makes conversion routines
	 *		  never fail, they just return strings with question marks, instead of
	 *		  characters which can't be represented in a given 8-bit charset
	 */
	int n_Set_SubsituteChar(int n_substitute);

	/*
	 *	int CUnicodeMapping::n_SubsituteChar() const
	 *		- returns substitute character
	 */
	int n_SubsituteChar() const;

	/*
	 *	inline int CUnicodeMapping::n_ToUnicode(char n_character) const
	 *		- translates character in 8-bit charset to unicode (UTF-32)
	 *		- returns unicode (UTF-32) for character n_character, or -1 on error
	 *		  (character undefined either in 8-bit charset, or in unicode)
	 *		- n_character is converted to unsigned char before indexing the table,
	 *		  so that codes above 127 do not yield a negative index where char is signed
	 */
	inline int n_ToUnicode(char n_character) const
	{
		// the mask makes sure the array won't overflow, even if unsigned char
		// was wider than 8 bits (it is a no-op with 8-bit chars)
		return m_p_mapping[static_cast<unsigned char>(n_character) & 0xff];
	}

	/*
	 *	bool CUnicodeMapping::FromUnicode(std::string &r_s_dest,
	 *		const std::basic_string<int> &r_s_string, char n_substitute = '?') const
	 *		- translates unicode (UTF-32) string r_s_string to 8-bit charset string r_s_dest
	 *		- in case given character cannot be represented, n_substitute is used instead
	 *		- r_s_dest is filled with translated string in 8-bit charset
	 *		- returns true on success, false on failure (not enough memory)
	 */
	bool FromUnicode(std::string &r_s_dest,
		const std::basic_string<int> &r_s_string, char n_substitute = '?') const;

	/*
	 *	bool CUnicodeMapping::ToUnicode(std::basic_string<int> &r_s_dest, std::string &r_s_string)
	 *		- translates 8-bit charset string r_s_string to unicode (UTF-32) string r_s_dest
	 *		- in case given character cannot be represented, function fails
	 *		- r_s_dest is filled with translated string in UTF-32 character set
	 *		- returns true on success, false on failure
	 *		- NOTE(review): unlike FromUnicode(), this is a non-const member taking
	 *		  the source string by non-const reference; presumably neither is modified,
	 *		  confirm against the implementation before tightening the signature
	 */
	bool ToUnicode(std::basic_string<int> &r_s_dest, std::string &r_s_string);

	/*
	 *	inline const int *p_Get_Mapping() const
	 *		- returns mapping table (256 entries, each containing unicode for corresponding
	 *		  character, or -1 in case character can't be represented with unicode / isn't
	 *		  defined in the set)
	 *		- the returned pointer refers to a member array of this object
	 *		- note this can be used in CUniConv::Encode_UTF8 or CUniConv::Encode_UTF16
	 */
	inline const int *p_Get_Mapping() const
	{
		return m_p_mapping;
	}

protected:
	// comparison helpers for the inverse map, ordering entries by unicode value
	// (presumably used for sorting / binary search in the implementation file)
	static inline bool b_HigherUnicode(const TCharacterMapping &r_t_a,
		int n_unicode);
	static inline bool b_SmallerUnicode(const TCharacterMapping &r_t_a,
		const TCharacterMapping &r_t_b);

	// mapping file parsing helpers (defined in the implementation file)
	// NOTE(review): FILE requires <stdio.h>, which is not included by this header
	// directly; presumably it comes in through "Integer.h" - confirm
	static bool GetLine(std::string &r_s_line, FILE *p_fr);
	static bool Parse_LatinCharacterName(const std::string &r_s_line,
		char &r_n_char_name, bool &r_b_capital);
};

#endif //__UNICODE_CONVERSION_INCLUDED
