1 /* $Id: $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's offical duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25 
26 /*****************************************************************************
27 
28 File name: sls_alp_regression.cpp
29 
30 Author: Sergey Sheetlin
31 
32 Contents: Regression methods
33 
34 ******************************************************************************/
35 
36 #include "sls_alp_regression.hpp"
37 
38 using namespace Sls;
39 using namespace std;
40 
41 
42 //-------------------------Solution of arbitrary equations----------------------------
find_tetta_general(function_type * func_,void * func_pointer_,double a_,double b_,long int n_partition_,double eps_,std::vector<double> & res_)43 void alp_reg::find_tetta_general(
44 function_type *func_,
45 void* func_pointer_,
46 double a_,//[a,b] is the interval for search of equation solution
47 double b_,
48 long int n_partition_,
49 double eps_,
50 std::vector<double> &res_
51 )
52 {
53 	res_.resize(0);
54 	vector<long int> intervals(0);
55 
56 	if(n_partition_<=0)
57 	{
58 		throw error("Error in alp_reg::find_tetta_general\n",4);
59 
60 	};
61 
62 	long int i;
63 	double h=(b_-a_)/n_partition_;
64 	double x1,x2 = 0.0;
65 	for(i=0;i<n_partition_;i++)
66 	{
67 		if(i==0)
68 		{
69 		x1=(*func_)(//calculation of E(exp(tetta*X))
70 			a_+i*h,func_pointer_);
71 		if(fabs(x1)<eps_)
72 		{
73 			res_.push_back(a_+i*h);
74 		};
75 
76 		}
77 		else
78 		{
79 			x1=x2;
80 		};
81 
82 
83 		x2=(*func_)(//calculation of E(exp(tetta*X))
84 			a_+(i+1)*h,
85 			func_pointer_);
86 
87 		if(fabs(x2)<eps_)
88 		{
89 			res_.push_back(a_+(i+1)*h);
90 		};
91 
92 		if((x1*x2<0)&&(fabs(x1)>=eps_&&fabs(x2)>=eps_))
93 		{
94 			intervals.push_back(i);
95 		};
96 	};
97 
98 	for(i=0;i<(long int)intervals.size();i++)
99 	{
100 	double sol=find_single_tetta_general(
101 		func_,
102 		func_pointer_,
103 		a_+intervals[i]*h,//[a,b] is the interval for search of equation solution
104 		a_+(1+intervals[i])*h,
105 		eps_);
106 	res_.push_back(sol);
107 	};
108 
109 	sort(res_.begin(),res_.end());
110 
111 	return;
112 }
113 
find_single_tetta_general(function_type * func_,void * func_pointer_,double a_,double b_,double eps_)114 double alp_reg::find_single_tetta_general(
115 function_type *func_,
116 void* func_pointer_,
117 double a_,//[a,b] is the interval for search of equation solution
118 double b_,
119 double eps_
120 )
121 {
122 	if(b_<a_)
123 	{
124 		throw error("Error in alp_reg::find_single_tetta_general\n",4);
125 	};
126 
127 	double x1=a_;
128 	double x2=b_;
129 	double precision=(x2-x1)/2;
130 
131 	double y1=(*func_)(//calculation of E(exp(tetta*X))
132 			x1,
133 			func_pointer_);
134 	if(fabs(y1)<eps_)
135 	{
136 		return x1;
137 	};
138 
139 
140 	double y2=(*func_)(//calculation of E(exp(tetta*X))
141 			x2,
142 			func_pointer_);
143 	if(fabs(y2)<eps_)
144 	{
145 		return x2;
146 	};
147 
148 	while(precision>eps_)
149 	{
150 		double x12=(x1+x2)/2;
151 
152 		double y12=(*func_)(//calculation of E(exp(tetta*X))
153 		x12,
154 		func_pointer_);
155 
156 		if(fabs(y12)<eps_)
157 		{
158 			return x12;
159 		};
160 
161 		if(y12*y1<0)
162 		{
163 			x2=x12;
164 			y2=y12;
165 		}
166 		else
167 		{
168 			x1=x12;
169 			y1=y12;
170 		};
171 		precision=(x2-x1)/2;
172 	};
173 
174 	return (x1+x2)/2;
175 }
176 
177 //------------regression-----------------------------------
178 
correction_of_errors(double * errors_,long int number_of_elements_)179 void alp_reg::correction_of_errors(
180 double *errors_,
181 long int number_of_elements_)
182 {
183 
184 	if(number_of_elements_<=0)
185 	{
186 		throw error("Unexpected error\n",4);
187 	};
188 
189 
190 	double average_error=0;
191 	long int i;
192 	for(i=0;i<number_of_elements_;i++)
193 	{
194 		if(errors_[i]<0)
195 		{
196 			throw error("Error in alp_reg::correction_of_errors: input error in the regression model is less than 0\n",4);
197 		};
198 
199 		average_error+=errors_[i];
200 	};
201 
202 	average_error/=(double)number_of_elements_;
203 
204 	double error_eps;
205 
206 	if(average_error<=0)
207 	{
208 		error_eps=1e-50;
209 	}
210 	else
211 	{
212 		error_eps=average_error;
213 	};
214 
215 
216 	for(i=0;i<number_of_elements_;i++)
217 	{
218 		if(errors_[i]==0)
219 		{
220 			errors_[i]=error_eps;
221 		};
222 	};
223 }
224 
robust_regression_sum_with_cut_LSM(long int min_length_,long int number_of_elements_,double * values_,double * errors_,bool cut_left_tail_,bool cut_right_tail_,double y_,double & beta0_,double & beta1_,double & beta0_error_,double & beta1_error_,long int & k1_opt_,long int & k2_opt_,bool & res_was_calculated_)225 void alp_reg::robust_regression_sum_with_cut_LSM(
226 long int min_length_,
227 long int number_of_elements_,
228 double *values_,
229 double *errors_,
230 bool cut_left_tail_,
231 bool cut_right_tail_,
232 double y_,
233 double &beta0_,
234 double &beta1_,
235 double &beta0_error_,
236 double &beta1_error_,
237 long int &k1_opt_,
238 long int &k2_opt_,
239 bool &res_was_calculated_)
240 {
241 
242 	if(number_of_elements_<2)
243 	{
244 		throw error("Unexpected error\n",4);
245 	};
246 
247 	correction_of_errors(errors_,number_of_elements_);
248 
249 	//minization of the function
250 	double c=y_*y_;
251 
252 
253 	long int k1_start,k1_end;
254 	long int k2_start,k2_end;
255 
256 	if(cut_left_tail_&&cut_right_tail_)
257 	{
258 		k1_start=0;
259 		k1_end=number_of_elements_-1;
260 
261 		k2_start=0;
262 		k2_end=number_of_elements_-1;
263 
264 	}
265 	else
266 	{
267 		if(cut_left_tail_&&!cut_right_tail_)
268 		{
269 			k1_start=0;
270 			k1_end=number_of_elements_-1;
271 
272 			k2_start=number_of_elements_-1;
273 			k2_end=number_of_elements_-1;
274 
275 		}
276 		else
277 		{
278 			if(!cut_left_tail_&&cut_right_tail_)
279 			{
280 				k1_start=0;
281 				k1_end=0;
282 
283 				k2_start=0;
284 				k2_end=number_of_elements_-1;
285 			}
286 			else
287 			{
288 				k1_start=0;
289 				k1_end=0;
290 
291 				k2_start=number_of_elements_-1;
292 				k2_end=number_of_elements_-1;
293 
294 			};
295 		};
296 	};
297 
298 	long int k1_opt=0,k2_opt=0;
299 
300 	double func_opt=DBL_MAX;
301 	double beta0_opt=0;
302 	double beta1_opt=0;
303 	double beta0_opt_error=0;
304 	double beta1_opt_error=0;
305 
306 	long int k1,k2;
307 
308 
309 	res_was_calculated_=false;
310 
311 
312 	for(k1=k1_start;k1<=k1_end;k1++)
313 	{
314 
315 		for(k2=sls_basic::Tmax(k1+(long int)1,sls_basic::Tmax(k1,k2_start)+min_length_);k2<=k2_end;k2++)
316 		{
317 			double beta0_opt_tmp,beta1_opt_tmp,beta0_opt_error_tmp,beta1_opt_error_tmp;
318 			bool res_was_calculated;
319 
320 			double tmp=function_for_robust_regression_sum_with_cut_LSM(
321 				values_+k1,
322 				errors_+k1,
323 				k2-k1+1,
324 				k1,
325 				c,
326 				beta0_opt_tmp,
327 				beta1_opt_tmp,
328 				beta0_opt_error_tmp,
329 				beta1_opt_error_tmp,
330 				res_was_calculated);
331 
332 
333 
334 			if(tmp<func_opt&&res_was_calculated)
335 			{
336 				func_opt=tmp;
337 				beta0_opt=beta0_opt_tmp;
338 				beta1_opt=beta1_opt_tmp;
339 				beta0_opt_error=beta0_opt_error_tmp;
340 				beta1_opt_error=beta1_opt_error_tmp;
341 				k1_opt=k1;
342 				k2_opt=k2;
343 				res_was_calculated_=true;
344 			};
345 
346 		};
347 	};
348 
349 
350 
351 
352 	if(res_was_calculated_)
353 	{
354 		beta0_=beta0_opt;
355 		beta1_=beta1_opt;
356 		beta0_error_=beta0_opt_error;
357 		beta1_error_=beta1_opt_error;
358 		k1_opt_=k1_opt;
359 		k2_opt_=k2_opt;
360 	};
361 
362 
363 }
364 
function_for_robust_regression_sum_with_cut_LSM(double * values_,double * errors_,long int number_of_elements_,long int k_start_,double c_,double & beta0_,double & beta1_,double & beta0_error_,double & beta1_error_,bool & res_was_calculated_)365 double alp_reg::function_for_robust_regression_sum_with_cut_LSM(
366 double *values_,
367 double *errors_,
368 long int number_of_elements_,
369 long int k_start_,
370 double c_,
371 double &beta0_,
372 double &beta1_,
373 double &beta0_error_,
374 double &beta1_error_,
375 bool &res_was_calculated_)
376 {
377 	long int i;
378 	double a11=0;
379 	double a12=0;
380 	double a21=0;
381 	double a22=0;
382 	double y1=0;
383 	double y2=0;
384 
385 	double y1_error=0;
386 	double y2_error=0;
387 
388 	for(i=0;i<number_of_elements_;i++)
389 	{
390 		if(errors_[i]!=0)
391 		{
392 			double tmp=1.0/(errors_[i]*errors_[i]);
393 
394 			a11+=tmp;
395 			a12+=(double)(k_start_+i)*tmp;
396 			a22+=(double)((k_start_+i)*(k_start_+i))*tmp;
397 			y1+=values_[i]*tmp;
398 			y1_error+=tmp*tmp*errors_[i]*errors_[i];
399 			y2+=(double)(k_start_+i)*values_[i]*tmp;
400 			y2_error+=(double)(k_start_+i)*(double)(k_start_+i)*tmp*tmp*errors_[i]*errors_[i];
401 		};
402 	};
403 
404 	a21=a12;
405 	y1_error=alp_reg::sqrt_for_errors(y1_error);
406 	y2_error=alp_reg::sqrt_for_errors(y2_error);
407 
408 	double eps=1e-10*sls_basic::Tmax(fabs(a11*a22),fabs(a21*a12));
409 
410 	double den=a11*a22-a21*a12;
411 	if(fabs(den)<=eps)
412 	{
413 		res_was_calculated_=false;
414 		return 0;
415 	}
416 	else
417 	{
418 		res_was_calculated_=true;
419 	};
420 
421 	beta0_=(y1*a22-a12*y2)/den;
422 	beta1_=(a11*y2-a21*y1)/den;
423 
424 	beta0_error_=sqrt(y1_error*y1_error*a22*a22+a12*a12*y2_error*y2_error)/den;
425 	beta1_error_=sqrt(a11*a11*y2_error*y2_error+a21*a21*y1_error*y1_error)/den;
426 
427 
428 	double res=0;
429 	for(i=0;i<number_of_elements_;i++)
430 	{
431 		if(errors_[i]!=0)
432 		{
433 			double tmp=(beta0_+beta1_*(i+k_start_)-values_[i])/errors_[i];
434 			res+=tmp*tmp-c_;
435 		};
436 	};
437 
438 	return res;
439 }
440 
robust_regression_sum_with_cut_LSM_beta1_is_defined(long int min_length_,long int number_of_elements_,double * values_,double * errors_,bool cut_left_tail_,bool cut_right_tail_,double y_,double & beta0_,double beta1_,double & beta0_error_,double beta1_error_,long int & k1_opt_,long int & k2_opt_,bool & res_was_calculated_)441 void alp_reg::robust_regression_sum_with_cut_LSM_beta1_is_defined(
442 long int min_length_,
443 long int number_of_elements_,
444 double *values_,
445 double *errors_,
446 bool cut_left_tail_,
447 bool cut_right_tail_,
448 double y_,
449 double &beta0_,
450 double beta1_,
451 double &beta0_error_,
452 double beta1_error_,
453 long int &k1_opt_,
454 long int &k2_opt_,
455 bool &res_was_calculated_)
456 {
457 
458 	correction_of_errors(errors_,number_of_elements_);
459 
460 	//minization of the function
461 	double c=y_*y_;
462 
463 
464 	long int k1_start,k1_end;
465 	long int k2_start,k2_end;
466 
467 	if(cut_left_tail_&&cut_right_tail_)
468 	{
469 		k1_start=0;
470 		k1_end=number_of_elements_-1;
471 
472 		k2_start=0;
473 		k2_end=number_of_elements_-1;
474 
475 	}
476 	else
477 	{
478 		if(cut_left_tail_&&!cut_right_tail_)
479 		{
480 			k1_start=0;
481 			k1_end=number_of_elements_-1;
482 
483 			k2_start=number_of_elements_-1;
484 			k2_end=number_of_elements_-1;
485 
486 		}
487 		else
488 		{
489 			if(!cut_left_tail_&&cut_right_tail_)
490 			{
491 				k1_start=0;
492 				k1_end=0;
493 
494 				k2_start=0;
495 				k2_end=number_of_elements_-1;
496 			}
497 			else
498 			{
499 				k1_start=0;
500 				k1_end=0;
501 
502 				k2_start=number_of_elements_-1;
503 				k2_end=number_of_elements_-1;
504 
505 			};
506 		};
507 	};
508 
509 	long int k1_opt=0,k2_opt=0;
510 
511 	double func_opt=DBL_MAX;
512 	double beta0_opt=0;
513 	double beta0_opt_error=0;
514 
515 	long int k1,k2;
516 
517 	res_was_calculated_=false;
518 
519 
520 	for(k1=k1_start;k1<=k1_end;k1++)
521 	{
522 
523 		for(k2=sls_basic::Tmax(k1,k2_start)+min_length_;k2<=k2_end;k2++)
524 		{
525 			double beta0_opt_tmp,beta1_opt_tmp,beta0_opt_error_tmp,beta1_opt_error_tmp;
526 			bool res_was_calculated;
527 
528 			beta1_opt_tmp=beta1_;
529 			beta1_opt_error_tmp=beta1_error_;
530 
531 			double tmp=function_for_robust_regression_sum_with_cut_LSM_beta1_is_defined(
532 				values_+k1,
533 				errors_+k1,
534 				k2-k1+1,
535 				k1,
536 				c,
537 				beta0_opt_tmp,
538 				beta1_opt_tmp,
539 				beta0_opt_error_tmp,
540 				beta1_opt_error_tmp,
541 				res_was_calculated);
542 
543 			if(tmp<func_opt&&res_was_calculated)
544 			{
545 				func_opt=tmp;
546 				beta0_opt=beta0_opt_tmp;
547 				beta0_opt_error=beta0_opt_error_tmp;
548 				k1_opt=k1;
549 				k2_opt=k2;
550 				res_was_calculated_=true;
551 			};
552 
553 		};
554 	};
555 
556 	if(res_was_calculated_)
557 	{
558 		beta0_=beta0_opt;
559 		beta0_error_=beta0_opt_error;
560 		k1_opt_=k1_opt;
561 		k2_opt_=k2_opt;
562 	};
563 
564 
565 }
566 
function_for_robust_regression_sum_with_cut_LSM_beta1_is_defined(double * values_,double * errors_,long int number_of_elements_,long int k_start_,double c_,double & beta0_,double beta1_,double & beta0_error_,double beta1_error_,bool & res_was_calculated_)567 double alp_reg::function_for_robust_regression_sum_with_cut_LSM_beta1_is_defined(
568 double *values_,
569 double *errors_,
570 long int number_of_elements_,
571 long int k_start_,
572 double c_,
573 double &beta0_,
574 double beta1_,
575 double &beta0_error_,
576 double beta1_error_,
577 bool &res_was_calculated_)
578 {
579 	long int i;
580 	double a11=0;
581 	double y1=0;
582 
583 	double y1_error=0;
584 
585 
586 	for(i=0;i<number_of_elements_;i++)
587 	{
588 		if(errors_[i]!=0)
589 		{
590 			double tmp=1.0/(errors_[i]*errors_[i]);
591 
592 			a11+=tmp;
593 			y1+=(values_[i]-(double)(k_start_+i)*beta1_)*tmp;
594 			double error_tmp=errors_[i]*errors_[i]+(double)(k_start_+i)*(double)(k_start_+i)*beta1_error_*beta1_error_;
595 			y1_error+=tmp*tmp*error_tmp;
596 		};
597 	};
598 
599 	y1_error=sqrt(y1_error);
600 
601 	double eps=1e-10*fabs(a11);
602 
603 	double den=a11;
604 	if(fabs(den)<=eps)
605 	{
606 		res_was_calculated_=false;
607 		return 0;
608 	}
609 	else
610 	{
611 		res_was_calculated_=true;
612 	};
613 
614 	beta0_=y1/den;
615 
616 	beta0_error_=y1_error/den;
617 
618 
619 	double res=0;
620 	for(i=0;i<number_of_elements_;i++)
621 	{
622 		if(errors_[i]!=0)
623 		{
624 			double tmp=(beta0_+beta1_*(i+k_start_)-values_[i])/errors_[i];
625 			res+=tmp*tmp-c_;
626 		};
627 	};
628 
629 	return res;
630 
631 }
632 
median(long int dim_,double * array_)633 double alp_reg::median(
634 long int dim_,
635 double *array_)
636 {
637 	vector<double> array_vect(dim_);
638 	long int i;
639 	for(i=0;i<dim_;i++)
640 	{
641 		array_vect[i]=array_[i];
642 	};
643 	sort(array_vect.begin(),array_vect.end());
644 	if(dim_%2==0)
645 	{
646 		long int k=(long int)sls_basic::round((double)dim_/2.0);
647 		return 0.5*(array_vect[k-1]+array_vect[k]);
648 	}
649 	else
650 	{
651 		long int k=(long int)sls_basic::round((double)(dim_-1.0)/2.0);
652 		return array_vect[k];
653 
654 	};
655 }
656 
robust_sum(double * values,long int dim,long int N_points,bool * & remove_flag)657 double alp_reg::robust_sum(
658 double *values,
659 long int dim,
660 long int N_points,
661 bool *&remove_flag)
662 {
663 	remove_flag=NULL;
664 
665 	try
666 	{
667 		if(dim<=N_points)
668 		{
669 			throw error("Unexpected error\n",4);
670 		};
671 
672 		long int i;
673 		remove_flag=new bool[dim];
674 		sls_basic::assert_mem(remove_flag);
675 		for(i=0;i<dim;i++)
676 		{
677 			remove_flag[i]=true;
678 		};
679 
680 
681 		double med_val=alp_reg::median(
682 		dim,
683 		values);
684 
685 		vector<pair<double,long int> > array_vect(dim);
686 
687 		for(i=0;i<dim;i++)
688 		{
689 			pair<double,long int> P;
690 			P.first=-fabs(values[i]-med_val);
691 			P.second=i;
692 			array_vect[i]=P;
693 		};
694 
695 		sort(array_vect.begin(),array_vect.end());
696 
697 		for(i=0;i<N_points;i++)
698 		{
699 			remove_flag[array_vect[i].second]=false;
700 		};
701 
702 		double res=0.0;
703 
704 		for(i=0;i<dim;i++)
705 		{
706 			if(remove_flag[i])
707 			{
708 				res+=values[i];
709 			};
710 		};
711 
712 		res/=(double)(dim-N_points);
713 		return res;
714 
715 	}
716 	catch (...)
717 	{
718 		delete[]remove_flag;remove_flag=NULL;
719 		throw;
720 	};
721 
722 }
723 
724