// note that this is not a standalone header, it is supposed
// to be included only in Main.cpp, from a specific place

/**
 *	@brief A simple color ID's implementation
 *
 *	Maps zero-based indices to RGB colors and back. The bits of an index are
 *	divided between the red, green and blue components and each group is placed
 *	in the most significant bits of its component, so that n_Index(n_Color(i)) == i
 *	for every i below the number of colors passed to the constructor.
 *	Only the low bits_Red + bits_Green + bits_Blue bits of a color are ever set.
 */
class CColorIDs {
public:
	enum {
		bits_Red = 8, /**< @brief number of bits per red component */
		bits_Green = 8, /**< @brief number of bits per green component */
		bits_Blue = 8 /**< @brief number of bits per blue component */
	};

protected:
	size_t m_n_color_num; /**< @brief number of colors requested in the constructor */
	int m_n_bit_num; /**< @brief number of index bits needed to tell m_n_color_num colors apart */
	int m_n_red_bits, m_n_green_bits, m_n_blue_bits; /**< @brief numbers of index bits stored in each component */
	uint32_t m_n_red_mask, m_n_green_mask, m_n_blue_mask; /**< @brief masks of the component bit groups (in the index) */
	int m_n_red_shift, m_n_green_shift, m_n_blue_shift; /**< @brief left shifts moving the index bit groups to their place in the color */

public:
	/**
	 *	@brief constructor; sets up the index / color mapping
	 *
	 *	@param[in] n_color_num is number of required colors (must fit in bits_Red + bits_Green + bits_Blue
	 *		bits; zero is handled the same as one)
	 */
	CColorIDs(size_t n_color_num)
	{
		m_n_color_num = n_color_num;
		m_n_bit_num = 0;
		if(n_color_num > 1) {
			for(size_t n_max_index = n_color_num - 1; n_max_index; n_max_index >>= 1)
				++ m_n_bit_num;
		}
		// store number of required colors and number of bits needed for this number
		// (number of bits of the largest index, which equals ceil(log2(n_color_num));
		// integer arithmetic is used so that there is no rounding error and no log(0))

		_ASSERTE(m_n_bit_num <= bits_Red + bits_Green + bits_Blue); // must fit in bits_Red + bits_Green + bits_Blue

		m_n_red_bits = m_n_blue_bits = m_n_bit_num / 3;
		m_n_green_bits = m_n_bit_num - 2 * m_n_red_bits;
		// divide bits between color components (green receives the remainder)

		if(m_n_green_bits > bits_Green) {
			int n_excess = m_n_green_bits - bits_Green;
			m_n_green_bits = bits_Green;
			int n_blue_room = bits_Blue - m_n_blue_bits;
			int n_to_blue = (n_excess < n_blue_room)? n_excess : n_blue_room;
			m_n_blue_bits += n_to_blue;
			m_n_red_bits += n_excess - n_to_blue;
		}
		// the remainder may not fit in the green component (with 8 bits per component
		// this happens for 23 index bits, where green would get 9 bits and overlap the
		// blue component in the color); move the excess to blue, then to red

		m_n_blue_mask = (uint32_t(1) << m_n_blue_bits) - 1;
		m_n_green_mask = ((uint32_t(1) << m_n_green_bits) - 1) << m_n_blue_bits;
		m_n_red_mask = ((uint32_t(1) << m_n_red_bits) - 1) << (m_n_blue_bits + m_n_green_bits);
		// calculate masks (the index is laid out as red | green | blue, blue in the least significant bits)

		m_n_red_shift = bits_Green + bits_Blue + (bits_Red - m_n_red_bits) - (m_n_blue_bits + m_n_green_bits);
		m_n_green_shift = bits_Blue + (bits_Green - m_n_green_bits) - m_n_blue_bits;
		m_n_blue_shift = bits_Blue - m_n_blue_bits;
		// calculate shifts (each group ends up in the most significant bits of its component)
	}

	/**
	 *	@brief gets color by index
	 *
	 *	@param[in] n_index is zero-based color index (color 0 is always black);
	 *		index bits above the ones covered by the masks are ignored
	 *
	 *	@return Returns color with the specified index (it is different from any color with other index).
	 */
	uint32_t n_Color(size_t n_index) const
	{
		return uint32_t(((n_index & m_n_red_mask) << m_n_red_shift) |
			   ((n_index & m_n_green_mask) << m_n_green_shift) |
			   ((n_index & m_n_blue_mask) << m_n_blue_shift));
	}

	/**
	 *	@brief gets index of a color
	 *
	 *	@param[in] n_color is color (eg. returned by n_Color()); bits that
	 *		n_Color() never sets are ignored
	 *
	 *	@return Returns zero-based index of the given color.
	 */
	size_t n_Index(uint32_t n_color) const
	{
		return ((n_color >> m_n_red_shift) & m_n_red_mask) |
			   ((n_color >> m_n_green_shift) & m_n_green_mask) |
			   ((n_color >> m_n_blue_shift) & m_n_blue_mask);
	}
};

/**
 *	@brief estimates radial distortion parameters from a hard-coded list of curves
 *
 *	The input is a table of text records in the format printed by TraceLines(), each
 *	describing a cubic y = a + b * x + c * x^2 + d * x^3 on an interval of x ("clamp"),
 *	optionally with x and y exchanged ("turn").
 *
 *	The function:
 *		- samples each curve to a polyline in [-1, 1] coordinates and draws it, together
 *		  with the points and their projections to a least squares line, to "06_square_lines.png"
 *		- runs 75 passes of a damped least squares iteration on K[0..2] of the model
 *		  xu = (xd - xc) * (1 + K[0] * r^2 + K[1] * r^4 + K[2] * r^8), minimizing squared
 *		  distances of the corrected points from straight lines refitted in every pass
 *		- saves a picture of the corrected polylines every 10th pass ("07_NNNN_nls.png")
 *		  and prints the norm of the update in each pass and the final parameters
 *
 *	@note Records that fail to parse are reported to stderr and skipped.
 *	@note NOTE(review): the third term of the model uses r^8 (r4 * r4), not r^6; the
 *		generated jacobian below matches the r^8 form, so it is consistent - confirm it is intended.
 *	@note NOTE(review): the bitmap is not cleared between allocation and the first drawing
 *		(TraceLines() does clear its bitmap) - confirm TBmp::p_Alloc() initializes the pixels.
 */
void Calibrate()
{
	const char *p_line_list[] = {
		"line(turn: 1, coef: 0.22381602897778 -0.0751618278353714 0.909752536111808 -0.332559295868015, clamp: 0.176136 - 0.616372)",
		"line(turn: 1, coef: 0.681943430699958 0.448865402718541 -0.951351263160157 0.320662201618743, clamp: 0.179503 - 0.47601)",
		"line(turn: 1, coef: 0.136759110614051 -0.33909158424367 0.939602082457273 0.219507385401981, clamp: 0.191288 - 0.694655)",
		"line(turn: 1, coef: 0.786333430340697 -2.92117926993684 13.0714777637824 -18.2734921925706, clamp: 0.225168 - 0.257997)",
		"line(turn: 1, coef: 0.639165058525268 0.0862897893651375 0.413842590563961 -1.13172508756361, clamp: 0.262205 - 0.311448)",
		"line(turn: 1, coef: 0.680704592780966 0.557337351324011 -1.23780542322314 0.532924604177187, clamp: 0.265572 - 0.487374)",
		"line(turn: 1, coef: 0.0828097088227502 6.49825100191717 -25.1814078176343 29.0992341719243, clamp: 0.275253 - 0.301136)",
		"line(turn: 1, coef: 0.77031342242536 -1.3277056696157 3.56017303626284 -3.10318625259242, clamp: 0.372264 - 0.433291)",
		"line(turn: 1, coef: 1.02457281638244 -3.47003908793213 3.89227478290361 -1.42011982717224, clamp: 0.416667 - 0.629419)",
		"line(turn: 1, coef: -0.558107710347946 7.43067181557741 -15.1306082247068 10.711875220387, clamp: 0.43287 - 0.48548)",
		"line(turn: 1, coef: -8.90443635907277 62.8396543985439 -136.720771376186 97.7241721511908, clamp: 0.450968 - 0.473695)",
		"line(turn: 1, coef: 0.888948541554147 -2.26471671557175 1.69015905787937 -0.189555575642557, clamp: 0.462121 - 0.715488)",
		"line(turn: 1, coef: 0.764733707894127 -0.0734481917401125 0.141769471007522 -0.441966919174704, clamp: 0.501894 - 0.664562)",
		"line(turn: 1, coef: -4.37074859607448 27.3426954538431 -48.9169396670188 28.3886361002974, clamp: 0.553451 - 0.579335)",
		"line(turn: 1, coef: 0.698746962101669 -0.875122415359124 -0.244139752809215 0.607071844964819, clamp: 0.558712 - 0.760311)",
		"line(turn: 1, coef: -0.913625419980619 6.86038133196308 -11.0754787312153 5.7198036848741, clamp: 0.619949 - 0.6875)",
		"line(turn: 1, coef: 1.27885986239169 -2.78230254054391 4.32529651712635 -2.23393241470423, clamp: 0.686237 - 0.753157)",
		"line(turn: 1, coef: 1.78908909978227 -4.84119432543283 7.13599164437234 -3.50697018212234, clamp: 0.693392 - 0.753788)",
		"line(turn: 1, coef: 0.503773236369529 0.175890948848667 -0.616292414159713 0.285395234454448, clamp: 0.699916 - 0.823653)",
		"line(turn: 0, coef: 0.679032798767094 1.08795500177947 -2.678984728039 2.23315885954748, clamp: 0.0538721 - 0.344066)",
		"line(turn: 0, coef: 0.512460525066606 0.793891112828976 -0.634662696789727 -0.00434876941392545, clamp: 0.44213 - 0.678872)",
		"line(turn: 1, coef: 0.923194832167928 -1.37605163468594 1.41154484698736 -0.592107758141234, clamp: 0.739689 - 0.806397)",
		"line(turn: 0, coef: 0.420054257345201 1.31942328922584 -1.52838760613951 0.501522331365729, clamp: 0.541667 - 0.606692)",
		"line(turn: 0, coef: -1.96555153768309 13.3563210506979 -21.6532347458271 11.7034793894666, clamp: 0.553451 - 0.6008)",
		// image 0

		"line(turn: 0, coef: 0.496809791925012 -1.04016171858715 0.845358439995883 -0.0233955530946973, clamp: 0.444655 - 0.655513)",
		"line(turn: 0, coef: 0.469260993434495 -0.7688712691083 0.442752003149782 0.169541826592055, clamp: 0.496843 - 0.643729)",
		"line(turn: 0, coef: 0.300938986169449 -0.393328160581976 0.174745074958682 0.557264628149245, clamp: 0.29819 - 0.464015)",
		"line(turn: 0, coef: 0.37566421406564 -1.19537882819784 3.16390155862456 -3.3079399704874, clamp: 0.0246212 - 0.272517)",
		"line(turn: 1, coef: 0.530937603088007 -0.936431976276628 5.03266528468226 -7.12835047783591, clamp: 0.219907 - 0.277567)",
		"line(turn: 0, coef: 0.497684085269261 -0.666016059699774 0.421264956429098 0.0941318302635282, clamp: 0.513047 - 0.654251)",
		"line(turn: 0, coef: 0.450039229978363 -0.372526319511241 -0.0482807995487338 0.343457653069585, clamp: 0.525042 - 0.655513)",
		"line(turn: 0, coef: 51.0365026603183 -218.468362172737 312.513011936695 -148.508011059427, clamp: 0.683291 - 0.710859)",
		"line(turn: 0, coef: 0.905368686994955 -1.97422041432083 1.93134416542123 -0.907915762020871, clamp: 0.386574 - 0.486111)",
		"line(turn: 0, coef: 0.395520023794931 0.0368173535650408 -0.720084990430032 0.712434298118616, clamp: 0.479798 - 0.666456)",
		"line(turn: 1, coef: 0.0143235373916265 3.38905768038021 -4.42234672351123 2.17233280854995, clamp: 0.321128 - 0.463384)",
		"line(turn: 1, coef: -0.0164083518137482 3.6496010587848 -5.32018728831954 3.04109220193501, clamp: 0.323443 - 0.464015)",
		"line(turn: 1, coef: 0.993587054858069 -2.1764960401157 1.64552773362041 -0.354022741294634, clamp: 0.410354 - 0.710227)",
		"line(turn: 0, coef: 0.516495404174977 -0.0909782669318362 0.0359044991438017 0.0169792594658876, clamp: 0.337121 - 0.837121)",
		"line(turn: 1, coef: 0.840664949688957 -1.35798227723012 0.433202615426378 0.23438065007001, clamp: 0.565657 - 0.678241)",
		"line(turn: 0, coef: 0.550711709461862 0.775395452964366 3.19009132612413 -8.22389597549306, clamp: 0.173611 - 0.271886)",
		// image 1
	};
	const size_t n_line_num = sizeof(p_line_list) / sizeof(p_line_list[0]);

	TBmp *p_square = TBmp::p_Alloc(1024, 1024);

	std::vector<std::vector<std::pair<float, float> > > line_list;
	// polylines in [-1, 1] coordinates, one per successfully parsed record
	// (can have fewer entries than n_line_num)

	for(size_t i = 0; i < n_line_num; ++ i) {
		line_list.resize(line_list.size() + 1);
		std::vector<std::pair<float, float> > &r_line = line_list.back();

		{
			double a, b, c, d, f_min_x, f_max_x;
			int n_flip;
			if(sscanf(p_line_list[i], "line(turn: %d, coef: %lf %lf %lf %lf, clamp: %lf - %lf)",
			   &n_flip, &a, &b, &c, &d, &f_min_x, &f_max_x) != 7) {
				fprintf(stderr, "error: line record %d malformed\n", int(i));
				line_list.pop_back(); // drop the entry added above (r_line is not used after this)
				continue;
			}
			bool b_flip = n_flip != 0;
			// get line params

			const int n_subdivide = max(3, int((f_max_x - f_min_x) * 100));
			// about 100 segments per unit of x, but no less than three
			for(int j = 0; j < n_subdivide; ++ j) {
				float f_x0 = float(f_min_x + j * (f_max_x - f_min_x) / n_subdivide);
				float f_x1 = float(f_min_x + (j + 1) * (f_max_x - f_min_x) / n_subdivide);
				float f_y0 = float(a + b * f_x0 + c * f_x0 * f_x0 + d * f_x0 * f_x0 * f_x0);
				float f_y1 = float(a + b * f_x1 + c * f_x1 * f_x1 + d * f_x1 * f_x1 * f_x1);

				if(b_flip) {
					std::swap(f_x0, f_y0);
					std::swap(f_x1, f_y1);
				}
				// "turn" records store x as a function of y

				r_line.push_back(std::make_pair(f_x0 * 2 - 1, f_y0 * 2 - 1));
				if(j + 1 == n_subdivide)
					r_line.push_back(std::make_pair(f_x1 * 2 - 1, f_y1 * 2 - 1));
				// save the line points in normalized OpenGL-like coordinates
				// (the end point of a segment is only added for the last segment)

				f_x0 *= p_square->n_width;
				f_x1 *= p_square->n_width;
				f_y0 *= p_square->n_height;
				f_y1 *= p_square->n_height;

				p_square->DrawLine_AA(f_x0, f_y0, f_x1, f_y1, 0xff00ff00U);
			}
			// draw the lines
		}
		// get lines

		float f_sum_x = 0;
		float f_sum_x2 = 0;
		float f_sum_x_y = 0;
		float f_sum_y = 0;
		for(size_t j = 0, m = r_line.size(); j < m; ++ j) {
			std::pair<float, float> p = r_line[j];
			f_sum_x += p.first;
			f_sum_y += p.second;
			f_sum_x2 += p.first * p.first;
			f_sum_x_y += p.first * p.second;
		}
		const float f_denom = r_line.size() * f_sum_x2 - f_sum_x * f_sum_x;
		// NOTE(review): this is zero if all the points share the same x (no guard, as in the original)
		float a = (f_sum_y * f_sum_x2 - f_sum_x * f_sum_x_y) / f_denom;
		float b = (r_line.size() * f_sum_x_y - f_sum_x * f_sum_y) / f_denom;
		// calculate linear least squares line (a = DC, b = slope)

		float lx0 = -1, ly0 = a - b;
		float lx1 = 1, ly1 = a + b;
		float nx = lx1 - lx0;
		float ny = ly1 - ly0;
		float rm = 1 / sqrt(nx * nx + ny * ny);
		nx *= rm;
		ny *= rm;
		// calculate point and vector on the line

		for(size_t j = 0, m = r_line.size(); j < m; ++ j) {
			std::pair<float, float> p = r_line[j];
			float x = p.first, y = p.second;
			// for a point on a distorted line

			float diff_x = x - lx0, diff_y = y - ly0;
			float f_dot = nx * diff_x + ny * diff_y;
			float x1 = lx0 + nx * f_dot, y1 = ly0 + ny * f_dot;
			// calculate the (approximate) closest point on a perfect line

			x = (x * .5f + .5f) * p_square->n_width;
			y = (y * .5f + .5f) * p_square->n_height;
			// denormalize the coordinate

			p_square->DrawRect(int(x - .5f), int(y - .5f), int(x + 1.5f), int(y + 1.5f), 0xff0000ffU);

			x1 = (x1 * .5f + .5f) * p_square->n_width;
			y1 = (y1 * .5f + .5f) * p_square->n_height;

			p_square->DrawLine_AA(x, y, x1, y1, 0xffff0000U);
			// connect the point with its projection on the line
		}
	}

	CPngCodec::Save_PNG("06_square_lines.png", *p_square);

	double K[3] = {0}; // radial distortion parameters
	const double xc = 0, yc = 0; // distortion center
	for(int n_pass = 0; n_pass < 75; ++ n_pass) {
		double p_rhs[3] = {0};
		MatrixMNd H(3, 3);
		//H.SetZero();
		H.Identity();
		H *= 1e-1f;
		// start from a multiple of identity (damping), the outer products of jacobians are added below

		p_square->Clear(-1);

		for(size_t l = 0, n_parsed_line_num = line_list.size(); l < n_parsed_line_num; ++ l) {
			// iterate over the lines that were actually parsed; line_list is shorter
			// than p_line_list if any record was malformed
			std::vector<std::pair<float, float> > &r_line = line_list[l];
			// get a line

			float f_sum_x = 0;
			float f_sum_x2 = 0;
			float f_sum_x_y = 0;
			float f_sum_y = 0;

			float last_x = 0, last_y = 0; // only read for j > 0, after being set from the previous point
			for(int j = 0, m = r_line.size(); j < m; ++ j) {
				std::pair<float, float> p = r_line[j];
				float xd = p.first, yd = p.second;
				// for a point on a distorted line

				float r = float(sqrt((xd - xc) * (xd - xc) + (yd - yc) * (yd - yc)));
				float r2 = r * r, r4 = r2 * r2, r8 = r4 * r4;
				float xu = float((xd - xc) * (1 + K[0] * r2 + K[1] * r4 + K[2] * r8));
				float yu = float((yd - yc) * (1 + K[0] * r2 + K[1] * r4 + K[2] * r8));
				// calculate the corrected point

				f_sum_x += xu;
				f_sum_y += yu;
				f_sum_x2 += xu * xu;
				f_sum_x_y += xu * yu;

				xu = (xu * .5f + .5f) * p_square->n_width;
				yu = (yu * .5f + .5f) * p_square->n_height;
				if(j)
					p_square->DrawLine_AA(xu, yu, last_x, last_y, 0xff000000U);
				last_x = xu;
				last_y = yu;
				// draw the corrected polyline
			}

			const float f_denom = r_line.size() * f_sum_x2 - f_sum_x * f_sum_x;
			float a = (f_sum_y * f_sum_x2 - f_sum_x * f_sum_x_y) / f_denom;
			float b = (r_line.size() * f_sum_x_y - f_sum_x * f_sum_y) / f_denom;
			// calculate linear least squares line (a = DC, b = slope)

			float lx0 = -1, ly0 = a - b;
			float lx1 = 1, ly1 = a + b;
			float nx = lx1 - lx0;
			float ny = ly1 - ly0;
			float rm = 1 / sqrt(nx * nx + ny * ny);
			nx *= rm;
			ny *= rm;
			// calculate point and vector on the line

			for(int j = 0, m = r_line.size(); j < m; ++ j) {
				std::pair<float, float> p = r_line[j];
				float xd = p.first, yd = p.second;
				// for a point on a distorted line

				float r = float(sqrt((xd - xc) * (xd - xc) + (yd - yc) * (yd - yc)));
				float r2 = r * r, r4 = r2 * r2, r8 = r4 * r4;
				float xu = float((xd - xc) * (1 + K[0] * r2 + K[1] * r4 + K[2] * r8));
				float yu = float((yd - yc) * (1 + K[0] * r2 + K[1] * r4 + K[2] * r8));
				// calculate the corrected point

				float diff_x = xu - lx0, diff_y = yu - ly0;
				float f_dot = nx * diff_x + ny * diff_y;
				float x1 = lx0 + nx * f_dot, y1 = ly0 + ny * f_dot;
				// calculate the (approximate) closest point on a perfect line

				float f_error = (x1 - xu) * (x1 - xu) + (y1 - yu) * (y1 - yu);
				// error is the squared distance of the two points

				/*

				syms xd yd xc yc nx ny lx0 ly0 real % const inputs
				syms r r2 r4 r8 xu yu diff_x diff_y f_dot x1 y1 f_error real % intermediates
				syms K0 K1 K2 real

				r = sqrt((xd - xc) * (xd - xc) + (yd - yc) * (yd - yc));
				r2 = r * r;
				r4 = r2 * r2;
				r8 = r4 * r4;
				xu = (xd - xc) * (1 + K0 * r2 + K1 * r4 + K2 * r8);
				yu = (yd - yc) * (1 + K0 * r2 + K1 * r4 + K2 * r8);
				diff_x = xu - lx0;
				diff_y = yu - ly0;
				f_dot = nx * diff_x + ny * diff_y;
				x1 = lx0 + nx * f_dot;
				y1 = ly0 + ny * f_dot;
				f_error = (x1 - xu) * (x1 - xu) + (y1 - yu) * (y1 - yu);

				ccode(jacobian(f_error, [K0 K1 K2])')

				 */

				// the code below is the output of the script above, kept verbatim: T[i][0] is the
				// derivative of f_error by K[i], each a sum of 2 * (x1 - xu) * d(x1 - xu) / dK[i]
				// and 2 * (y1 - yu) * d(y1 - yu) / dK[i] (the line itself is treated as constant)
				double T[3][1], K0 = K[0], K1 = K[1], K2 = K[2];
				double MapleGenVar2 = 2.0;
				double MapleGenVar4 = lx0+nx*(nx*((xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc)+
					K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc,4.0))-lx0)+ny*((yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*
					yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*
					xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0))-ly0))-(xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*
					pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0));
				double MapleGenVar5 = nx*(nx*(xd-xc)*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc)+ny*(yd-yc)*(xd*
					xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc))-(xd-xc)*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc);
				double MapleGenVar3 = MapleGenVar4*MapleGenVar5;
				double MapleGenVar1 = MapleGenVar2*MapleGenVar3;
				MapleGenVar3 = 2.0;
				MapleGenVar5 = ly0+ny*(nx*((xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc)+
					K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc,4.0))-lx0)+ny*((yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*
					yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*
					xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0))-ly0))-(yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*
					yd-2.0*yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*
					xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0));
				double MapleGenVar6 = ny*(nx*(xd-xc)*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc)+ny*(yd-yc)*
					(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc))-(yd-yc)*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc);
				MapleGenVar4 = MapleGenVar5*MapleGenVar6;
				MapleGenVar2 = MapleGenVar3*MapleGenVar4;
				T[0][0] = MapleGenVar1+MapleGenVar2;
				MapleGenVar2 = 2.0;
				MapleGenVar4 = lx0+nx*(nx*((xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc)+K1*
					pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-
					2.0*yd*yc+yc*yc,4.0))-lx0)+ny*((yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*
					yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc,4.0))-ly0))-(xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+
					yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*xd*xc+xc*
					xc+yd*yd-2.0*yd*yc+yc*yc,4.0));
				MapleGenVar5 = nx*(nx*(xd-xc)*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+ny*(yd-yc)*
					pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0))-(xd-xc)*pow(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc,2.0);
				MapleGenVar3 = MapleGenVar4*MapleGenVar5;
				MapleGenVar1 = MapleGenVar2*MapleGenVar3;
				MapleGenVar3 = 2.0;
				MapleGenVar5 = ly0+ny*(nx*((xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc)+
					K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc,4.0))-lx0)+ny*((yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*
					yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*
					xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0))-ly0))-(yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*
					pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0));
				MapleGenVar6 = ny*(nx*(xd-xc)*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+ny*(yd-
					yc)*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0))-(yd-yc)*pow(xd*xd-2.0*xd*xc+
					xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0);
				MapleGenVar4 = MapleGenVar5*MapleGenVar6;
				MapleGenVar2 = MapleGenVar3*MapleGenVar4;
				T[1][0] = MapleGenVar1+MapleGenVar2;
				MapleGenVar2 = 2.0;
				MapleGenVar4 = lx0+nx*(nx*((xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc)+
					K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc,4.0))-lx0)+ny*((yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*
					yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*
					xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0))-ly0))-(xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*
					pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0));
				MapleGenVar5 = nx*(nx*(xd-xc)*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0)+ny*
					(yd-yc)*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0))-(xd-xc)*pow(xd*xd-2.0*
					xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0);
				MapleGenVar3 = MapleGenVar4*MapleGenVar5;
				MapleGenVar1 = MapleGenVar2*MapleGenVar3;
				MapleGenVar3 = 2.0;
				MapleGenVar5 = ly0+ny*(nx*((xd-xc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc)+
					K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc,4.0))-lx0)+ny*((yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*
					yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*pow(xd*xd-2.0*
					xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0))-ly0))-(yd-yc)*(1.0+K0*(xd*xd-2.0*xd*xc+xc*xc+
					yd*yd-2.0*yd*yc+yc*yc)+K1*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,2.0)+K2*
					pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0));
				MapleGenVar6 = ny*(nx*(xd-xc)*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0)+ny*(yd-
					yc)*pow(xd*xd-2.0*xd*xc+xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0))-(yd-yc)*pow(xd*xd-2.0*xd*xc+
					xc*xc+yd*yd-2.0*yd*yc+yc*yc,4.0);
				MapleGenVar4 = MapleGenVar5*MapleGenVar6;
				MapleGenVar2 = MapleGenVar3*MapleGenVar4;
				T[2][0] = MapleGenVar1+MapleGenVar2;
				// calculate the jacobian

				for(int i = 0; i < 3; ++ i) {
					p_rhs[i] += f_error * T[i][0];
					for(int o = 0; o < 3; ++ o)
						H[i][o] += T[i][0] * T[o][0];
				}
				// update rhs and hessian
			}
			// error for each line points
		}
		// error for each line

		double p_dx[3];
		int p_row_perm[3];
		if(!H.LU(p_row_perm, 3)) {
			fprintf(stderr, "error: pass %d: failed to factorize the system matrix\n", n_pass);
			break;
		}
		// the result of LU() is checked the same way as in TraceLines(); keep K from the previous pass on failure
		for(int i = 0; i < 3; ++ i)
			p_dx[i] = p_rhs[p_row_perm[i]];
		H.LSolve_UnitDiag(p_dx, 3);
		H.USolve(p_dx, 3);
		// solve the system to obtain delta-x

		double f_norm_dx = 0;
		for(int i = 0; i < 3; ++ i) {
			K[i] -= p_dx[i];
			f_norm_dx += p_dx[i] * p_dx[i];
		}
		f_norm_dx = sqrt(f_norm_dx);
		// update the solution

		printf("step %d: norm(dx) = %g\n", n_pass, f_norm_dx);

		if(!(n_pass % 10)) {
			char p_s_image[256];
			sprintf(p_s_image, "07_%04d_nls.png", n_pass);
			CPngCodec::Save_PNG(p_s_image, *p_square);
		}
		// save a picture of the corrected lines every 10th pass
	}

	printf("radial distortion parameters: %.15g %.15g %.15g\n", K[0], K[1], K[2]);
	// 0.250403979510877 0.277885983845201 0.3071973231629

	p_square->Delete();
}

/**
 *	@brief loads a JPEG image using CTinyJpegDecoder
 *
 *	@param[in] p_s_filename is name of the file to load
 *	@param[in] b_thumb is passed to CTinyJpegDecoder::p_Load_Jpeg() unchanged
 *		(presumably selects a reduced-size decode - confirm against the decoder);
 *		the original remark here was "work with thumbs now", although the default is false
 *
 *	@return Returns whatever CTinyJpegDecoder::p_Load_Jpeg() returns for the given arguments.
 */
static inline TBmp *p_LoadJpegImage(const char *p_s_filename, bool b_thumb = false)
{
	TBmp *p_image = CTinyJpegDecoder::p_Load_Jpeg(p_s_filename, b_thumb);
	return p_image;
}

/**
 *	@brief traces the calibration lines in a camera image and prints a cubic fit of each
 *
 *	The function:
 *		- loads a mask image ("calib_lines01.png") and a camera image ("calib_lines01.jpg")
 *		- filters the camera image with the Sobel kernel to get edge strengths
 *		- flood-fills the mask from its top-left pixel; the remaining regions of the
 *		  original top-left color are numbered as separate lines
 *		- collects camera pixels with edge strength above 0.2 under each numbered region
 *		- fits y = a + b * x + c * x^2 + d * x^3 to the pixels of each line (predominantly
 *		  vertical lines are fitted with x and y exchanged), in coordinates where the longer
 *		  side of the camera image maps to [0, 1] and the shorter side is centered
 *		- prints one "line(turn: ..., coef: ..., clamp: ...)" record per line (the format
 *		  parsed by Calibrate()) and saves intermediate pictures "00_..." to "05_..."
 *
 *	@note Lines with fewer than 10 collected pixels are skipped.
 */
void TraceLines()
{
	printf("loading images ...\n");
	TBmp *p_mask_image = CPngCodec::p_Load_PNG("calib_lines01.png");
	TBmp *p_cam_image = /*CPngCodec::p_Load_PNG("calib_lines01_cam.png");*/p_LoadJpegImage("calib_lines01.jpg", false);
	//TBmp *p_mask_image = CPngCodec::p_Load_PNG("calib_lines00.png");
	//TBmp *p_cam_image = /*CPngCodec::p_Load_PNG("calib_lines01_cam.png");*/p_LoadJpegImage("calib_lines00.jpg", false);

	if(!p_mask_image || !p_cam_image) {
		// NOTE(review): assumes the loaders signal failure by returning a null pointer - confirm
		fprintf(stderr, "error: failed to load the input images\n");
		if(p_mask_image)
			p_mask_image->Delete();
		if(p_cam_image)
			p_cam_image->Delete();
		return;
	}
	// both images are dereferenced right below

	printf("filtering ...\n");
	p_cam_image->Make_Grayscale();
	p_cam_image->b_grayscale = false; // broken png support
	{
		TBmp *p_rc_horz = p_cam_image->p_Clone(true);
#if 0
		TBmp::CConstFilterLoop<2, 2>::FilterLoop(*p_rc_horz, *p_cam_image,
			TBmp::CBasicFilterKernels::n_RobertCross, TBmp::CBasicFilterKernels::n_DummyFilterBorder);
#else
		TBmp::CConstFilterLoop<3, 3>::FilterLoop(*p_rc_horz, *p_cam_image,
			TBmp::CBasicFilterKernels::n_Sobel_Gray, TBmp::CBasicFilterKernels::n_DummyFilterBorder);
#endif
		std::swap(p_rc_horz->p_buffer, p_cam_image->p_buffer); // just swap the buffers
		p_rc_horz->Delete();
	}
	// filter the camera image with the Sobel operator (the Roberts cross variant is disabled)

	uint32_t n_mask_color = *p_mask_image->p_buffer; // the first pixel must be the mask color
	uint32_t n_replace_color = 0xff000000u;
	if(n_mask_color == n_replace_color)
		n_replace_color = 0xff123456u;
	// the replacement color must differ from the mask color, otherwise the fill below
	// would not change anything and the filled area could not be told apart from the
	// regions to be numbered (the original code modified n_mask_color here instead,
	// which left the scan below looking for a color that is not in the image)
	p_mask_image->FloodFill(0, 0, n_replace_color); // erase the mask

	CPngCodec::Save_PNG("00_mask_ff.png", *p_mask_image, true);
	CPngCodec::Save_PNG("01_cam_filtered.png", *p_cam_image);

	printf("masking ...\n");

	size_t n_line_num = 0;
	const int mw = p_mask_image->n_width, mh = p_mask_image->n_height;
	for(int y = 0; y < mh; ++ y) {
		for(int x = 0; x < mw; ++ x) {
			if(p_mask_image->p_buffer[x + mw * y] == n_mask_color) {
				p_mask_image->FloodFill(x, y, ++ n_line_num);
				// a region that still has the mask color; fill it with its one-based number
			} else if(p_mask_image->p_buffer[x + mw * y] > n_line_num)
				p_mask_image->p_buffer[x + mw * y] = 0; // background
			// values up to n_line_num are numbers of the regions that were already filled
		}
	}
	CColorIDs color_ids(n_line_num + 1);
	#pragma omp parallel for
	for(int y = 0; y < mh; ++ y) {
		for(int x = 0; x < mw; ++ x) {
			p_mask_image->p_buffer[x + mw * y] =
				color_ids.n_Color(p_mask_image->p_buffer[x + mw * y]);
		}
	}
	// find connected components, convert their numbers to colors (for the saved picture)

	CPngCodec::Save_PNG("02_mask_components.png", *p_mask_image, true);

	std::vector<std::vector<std::pair<std::pair<int, int>, float> > > line_fragment_list;
	// for each line a list of (pixel coordinates, edge strength)

	const int cw = p_cam_image->n_width, ch = p_cam_image->n_height;
	const float f_cam_max_x = float((cw > 1)? cw - 1 : 1);
	const float f_cam_max_y = float((ch > 1)? ch - 1 : 1);
	// largest camera coordinates; guarded so that an image one pixel wide / high does not divide by zero
	for(int y = 0; y < ch; ++ y) {
		int n_mask_y = int(floor(y / f_cam_max_y * (mh - 1) + .5f));
		for(int x = 0; x < cw; ++ x) {
			int n_mask_x = int(floor(x / f_cam_max_x * (mw - 1) + .5f));
			// nearest mask pixel (the mask can have a different resolution)
			int n_mask = p_mask_image->p_buffer[n_mask_x + mw * n_mask_y];
			if(n_mask > 0) {
				n_mask = color_ids.n_Index(n_mask); // !!
				// back from color to the one-based line number
				line_fragment_list.resize(max(size_t(n_mask), line_fragment_list.size()));
				// make sure the line list for the given line is allocated

				float f_weight = (p_cam_image->p_buffer[x + cw * y] & 0xff) / 255.0f;
				if(f_weight > .2f) // hard thresh
					line_fragment_list[-- n_mask].push_back(std::make_pair(std::make_pair(x, y), f_weight));
				// add a floating-point value with the current coordinates to the list
			} else
				p_cam_image->p_buffer[x + cw * y] = 0;
		}
	}
	// mask out the unimportant edges, extract lists of points

	CPngCodec::Save_PNG("03_cam_filtered_masked.png", *p_cam_image);

	printf("tracing the edges ...\n");

	TBmp *p_square = TBmp::p_Alloc(512, 512);
	p_square->Clear(0xff000000U);

	for(size_t i = 0, n = line_fragment_list.size(); i < n; ++ i) {
		std::vector<std::pair<std::pair<int, int>, float> > &r_fragment_list = line_fragment_list[i];
		// list of fragments

		if(r_fragment_list.size() < 10)
			continue;
		// not enough points to fit

		std::pair<int, int> t_min = r_fragment_list.front().first,
			t_max = r_fragment_list.front().first;
		for(size_t j = 0, m = r_fragment_list.size(); j < m; ++ j) {
			std::pair<int, int> t_frag = r_fragment_list[j].first;
			if(t_min.first > t_frag.first)
				t_min.first = t_frag.first;
			if(t_min.second > t_frag.second)
				t_min.second = t_frag.second;
			if(t_max.first < t_frag.first)
				t_max.first = t_frag.first;
			if(t_max.second < t_frag.second)
				t_max.second = t_frag.second;
		}
		// get min / max coordinates

		int n_width = t_max.first - t_min.first;
		int n_height = t_max.second - t_min.second;
		// determine bounding box of the thing (the straight lines should
		// mostly be 1D convex even after the deformation by the lenses)

		bool b_flip;
		if(((b_flip = n_width < n_height))) {
			for(size_t j = 0, m = r_fragment_list.size(); j < m; ++ j) {
				std::pair<int, int> &t_frag = r_fragment_list[j].first;
				std::swap(t_frag.first, t_frag.second);
			}
			std::swap(t_max.first, t_max.second);
			std::swap(t_min.first, t_min.second);
			std::swap(n_width, n_height);
		}
		// turn vertical lines for processing

		// jacobian is delta residual / delta parameter
		// parameter is (a b c d) of line equation
		// residual is distance of the point from the line (times weight)
		// the distance is (py - f(px))^2 = (py - a - b * px - c * px^2 - d * px^3)^2
		// the jacobian is 4x1 matrix
		//		delta error / delta a
		//		delta error / delta b
		//		delta error / delta c
		//		delta error / delta d
		//
		// using:
		//		syms px py a b c d r real
		//		r = (py - (a + b * px + c * px^2 + d * px^3))^2
		//		jacobian(r, [a b c d])'
		//
		// gives:
		//
		//		ans =
		//			-2*py+2*a+2*b*px+2*c*px^2+2*d*px^3
		//			-2*(py-a-b*px-c*px^2-d*px^3)*px
		//			-2*(py-a-b*px-c*px^2-d*px^3)*px^2
		//			-2*(py-a-b*px-c*px^2-d*px^3)*px^3
		//

		int ss = max(cw, ch);
		// assume the frame will be square and there is no skew involved (doesn't seem to be)

		double f_scale_x = 1.0 / ss;
		double f_scale_y = 1.0 / ss;
		// scale

		double f_offset_x = f_scale_x * (ss - cw) / 2;
		double f_offset_y = f_scale_y * (ss - ch) / 2;
		// offset to x and y in the frame

		if(b_flip) {
			std::swap(f_scale_x, f_scale_y);
			std::swap(f_offset_x, f_offset_y);
		}
		// the point coordinates were exchanged above, the transform must follow

		double a = f_scale_y * (t_max.second + t_min.second) * .5f, b = 0, c = 0, d = 0;
		// initialize the solution (with the DC component - easy guess)
		// NOTE(review): f_offset_y is not added here although py below includes it;
		// this only affects the starting point of the iteration, left unchanged

		MatrixMNd H(4, 4); // allocate once, reuse storage
		for(int n_pass_num = 0; n_pass_num < 1000; ++ n_pass_num) {
			//H.Identity();
			//H *= 1e-12; // some damping
			H.SetZero();
			// initialize the hessian

			double p_rhs[4] = {0};
			// right-hand side

			for(size_t j = 0, m = r_fragment_list.size(); j < m; ++ j) {
				std::pair<int, int> t_frag = r_fragment_list[j].first;
				float f_weight = r_fragment_list[j].second;
				f_weight *= f_weight;
				double px = f_offset_x + f_scale_x * t_frag.first, px_2 = px * px, px_3 = px_2 * px;
				double py = f_offset_y + f_scale_y * t_frag.second;
				double r = py - (a + b * px + c * px_2 + d * px_3);
				r *= r;
				// calculate error

				double p_J[4];
				p_J[0] = -2*py+2*a+2*b*px+2*c*px_2+2*d*px_3;
				p_J[1] = -2*(py-a-b*px-c*px_2-d*px_3)*px;
				p_J[2] = -2*(py-a-b*px-c*px_2-d*px_3)*px_2;
				p_J[3] = -2*(py-a-b*px-c*px_2-d*px_3)*px_3;
				// calculate jacobian // todo - make a class for polynomial fitting

				for(int k = 0; k < 4; ++ k) {
					for(int l = 0; l < 4; ++ l)
						H[k][l] += p_J[k] * p_J[l];
				}
				// sum up hessian

				for(int k = 0; k < 4; ++ k)
					p_rhs[k] += r * p_J[k] * f_weight;
				// sum up right-hand side (only this side is weighted)
			}

			double p_dx[4];
			// (a Cholesky-based solve was tried here before and found unstable, LU is used instead)
			int p_row_perm[4];
			if(!H.LU(p_row_perm, 4)) {
				fprintf(stderr, "error: fitting line %d failed\n", int(i)); // does not really happen
				// i is size_t, it must be converted to match %d
				break;
			}
			for(int j = 0; j < 4; ++ j)
				p_dx[j] = p_rhs[p_row_perm[j]];
			H.LSolve_UnitDiag(p_dx, 4);
			H.USolve(p_dx, 4);
			// solve the system to obtain delta-x

			a -= p_dx[0];
			b -= p_dx[1];
			c -= p_dx[2];
			d -= p_dx[3];
			// update the solution
		}

		const double f_min_x = f_offset_x + f_scale_x * t_min.first,
			f_max_x = f_offset_x + f_scale_x * t_max.first;
		// range of x covered by the points (in the possibly exchanged coordinates)
		for(int j = 0; j < 10; ++ j) {
			float f_x0 = float(f_min_x + j * (f_max_x - f_min_x) / 10);
			float f_x1 = float(f_min_x + (j + 1) * (f_max_x - f_min_x) / 10);
			float f_y0 = float(a + b * f_x0 + c * f_x0 * f_x0 + d * f_x0 * f_x0 * f_x0);
			float f_y1 = float(a + b * f_x1 + c * f_x1 * f_x1 + d * f_x1 * f_x1 * f_x1);

			f_x0 -= float(f_offset_x);
			f_x0 /= float(f_scale_x);
			f_x1 -= float(f_offset_x);
			f_x1 /= float(f_scale_x);
			f_y0 -= float(f_offset_y);
			f_y0 /= float(f_scale_y);
			f_y1 -= float(f_offset_y);
			f_y1 /= float(f_scale_y);
			// back to pixels of the camera image

			if(b_flip) {
				std::swap(f_x0, f_y0);
				std::swap(f_x1, f_y1);
			}

			p_cam_image->DrawLine_AA(f_x0, f_y0, f_x1, f_y1, 0xff00ff00U);
		}
		// plot the final curve in the image, save the result

		printf("line(turn: %d, coef: %.15g %.15g %.15g %.15g, clamp: %g - %g)\n",
			(b_flip)? 1 : 0, a, b, c, d, f_min_x, f_max_x);

		for(int j = 0; j < 10; ++ j) {
			float f_x0 = float(f_min_x + j * (f_max_x - f_min_x) / 10);
			float f_x1 = float(f_min_x + (j + 1) * (f_max_x - f_min_x) / 10);
			float f_y0 = float(a + b * f_x0 + c * f_x0 * f_x0 + d * f_x0 * f_x0 * f_x0);
			float f_y1 = float(a + b * f_x1 + c * f_x1 * f_x1 + d * f_x1 * f_x1 * f_x1);

			if(b_flip) {
				std::swap(f_x0, f_y0);
				std::swap(f_x1, f_y1);
			}

			f_x0 *= p_square->n_width;
			f_x1 *= p_square->n_width;
			f_y0 *= p_square->n_height;
			f_y1 *= p_square->n_height;

			p_square->DrawLine_AA(f_x0, f_y0, f_x1, f_y1, 0xff00ff00U);
		}
		// plot the same curve in the square picture (in the normalized frame)
	}

	CPngCodec::Save_PNG("04_cam_filtered_masked_lines.png", *p_cam_image);
	CPngCodec::Save_PNG("05_square_lines.png", *p_square);

	p_square->Delete();
	p_cam_image->Delete();
	p_mask_image->Delete();
	// cleanup
}
