1 // =============================================================================
2 // CD-HI/CD-HIT
3 //
4 // Cluster Database at High Identity
5 //
6 // CD-HIT clusters protein sequences at high identity threshold.
7 // This program can remove the high sequence redundance efficiently.
8 //
9 // program written by
10 //                    Weizhong Li
11 //                    UCSD, San Diego Supercomputer Center
12 //                    La Jolla, CA, 92093
13 //                    Email liwz@sdsc.edu
14 //
15 //                 at
16 //                    Adam Godzik's lab
17 //                    The Burnham Institute
18 //                    La Jolla, CA, 92037
19 //                    Email adam@burnham-inst.org
20 //
21 // modified by:
22 //                    Limin Fu
23 //                    Center for Research in Biological Systems (CRBS), UCSD
24 //                    La Jolla, CA, 92093
25 //                    Email: l2fu@ucsd.edu, fu@daovm.net
26 // =============================================================================
27 
28 #include "cdhit-common.h"
29 #include<valarray>
30 #include<stdint.h>
31 #include<assert.h>
32 #include<limits.h>
33 
34 #ifndef NO_OPENMP
35 
36 #include<omp.h>
37 
38 #define WITH_OPENMP " (+OpenMP)"
39 
40 #else
41 
42 #define WITH_OPENMP ""
43 
44 #define omp_set_num_threads(T) (T = T)
45 #define omp_get_thread_num() 0
46 
47 #endif
48 
49 //class function definition
50 const char aa[] = {"ARNDCQEGHILKMFPSTWYVBZX"};
51 //{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,2,6,20};
52 int aa2idx[] = {0, 2, 4, 3, 6, 13,7, 8, 9,20,11,10,12, 2,20,14,
53                 5, 1,15,16,20,19,17,20,18, 6};
54     // idx for  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P
55     //          Q  R  S  T  U  V  W  X  Y  Z
56     // so  aa2idx[ X - 'A'] => idx_of_X, eg aa2idx['A' - 'A'] => 0,
57     // and aa2idx['M'-'A'] => 12
58 
59 int BLOSUM62[] = {
60   4,                                                                  // A
61  -1, 5,                                                               // R
62  -2, 0, 6,                                                            // N
63  -2,-2, 1, 6,                                                         // D
64   0,-3,-3,-3, 9,                                                      // C
65  -1, 1, 0, 0,-3, 5,                                                   // Q
66  -1, 0, 0, 2,-4, 2, 5,                                                // E
67   0,-2, 0,-1,-3,-2,-2, 6,                                             // G
68  -2, 0, 1,-1,-3, 0, 0,-2, 8,                                          // H
69  -1,-3,-3,-3,-1,-3,-3,-4,-3, 4,                                       // I
70  -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,                                    // L
71  -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,                                 // K
72  -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5,                              // M
73  -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,                           // F
74  -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,                        // P
75   1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4,                     // S
76   0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,                  // T
77  -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11,               // W
78  -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,            // Y
79   0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,         // V
80  -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4,      // B
81  -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,   // Z
82   0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1 // X
83 //A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X
84 //0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19  2  6 20
85 };
86 
87 
88 int na2idx[] = {0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4,
89                 4, 4, 4, 3, 3, 4, 4, 4, 4, 4};
90     // idx for  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P
91     //          Q  R  S  T  U  V  W  X  Y  Z
92     // so aa2idx[ X - 'A'] => idx_of_X, eg aa2idx['A' - 'A'] => 0,
93     // and aa2idx['M'-'A'] => 4
94 int BLOSUM62_na[] = {
95   2,                  // A
96  -2, 2,               // C
97  -2,-2, 2,            // G
98  -2,-2,-2, 2,         // T
99  -2,-2,-2, 1, 2,      // U
100  -2,-2,-2,-2,-2, 1,   // N
101   0, 0, 0, 0, 0, 0, 1 // X
102 //A  C  G  T  U  N  X
103 //0  1  2  3  3  4  5
104 };
105 
106 void setaa_to_na();
107 
108 struct TempFile
109 {
110 	FILE *file;
111 	char buf[512];
112 
TempFileTempFile113 	TempFile( const char *dir = NULL ){
114 		int len = dir ? strlen( dir ) : 0;
115 		assert( len < 400 );
116 		buf[0] = 0;
117 		if( len ){
118 			strcat( buf, dir );
119 			if( buf[len-1] != '/' && buf[len-1] != '\\' ){
120 				buf[len] = '/';
121 				len += 1;
122 			}
123 		}
124 		strcat( buf, "cdhit.temp." );
125 		len += 11;
126 		sprintf( buf + len, "%p", this );
127 		file = fopen( buf, "w+" );
128 	}
~TempFileTempFile129 	~TempFile(){
130 		if( file ){
131 			fclose( file );
132 			remove( buf );
133 		}
134 	}
135 };
136 
137 struct TempFiles
138 {
139 	NVector<TempFile*> files;
140 
~TempFilesTempFiles141 	~TempFiles(){ Clear(); }
142 
ClearTempFiles143 	void Clear(){
144 		int i;
145 		#pragma omp critical
146 		{
147 			for(i=0; i<files.size; i++) if( files[i] ) delete files[i];
148 			files.Clear();
149 		}
150 	}
151 };
152 
153 const char *temp_dir = "";
154 TempFiles temp_files;
155 
OpenTempFile(const char * dir=NULL)156 FILE* OpenTempFile( const char *dir = NULL )
157 {
158 	TempFile *file = new TempFile( dir );
159 	#pragma omp critical
160 	{
161 		temp_files.files.Append( file );
162 	}
163 	return file->file;
164 }
CleanUpTempFiles()165 static void CleanUpTempFiles()
166 {
167 	if( temp_files.files.Size() ) printf( "Clean up temporary files ...\n" );
168 	temp_files.Clear();
169 }
170 
171 int NAA1 ;
172 int NAA2 ;
173 int NAA3 ;
174 int NAA4 ;
175 int NAA5 ;
176 int NAA6 ;
177 int NAA7 ;
178 int NAA8 ;
179 int NAA9 ;
180 int NAA10;
181 int NAA11;
182 int NAA12;
183 int NAAN_array[13] = { 1 };
184 
InitNAA(int max)185 void InitNAA( int max )
186 {
187 	NAA1  = NAAN_array[1]  = max;
188 	NAA2  = NAAN_array[2]  = NAA1 * NAA1;
189 	NAA3  = NAAN_array[3]  = NAA1 * NAA2;
190 	NAA4  = NAAN_array[4]  = NAA2 * NAA2;
191 	NAA5  = NAAN_array[5]  = NAA2 * NAA3;
192 	NAA6  = NAAN_array[6]  = NAA3 * NAA3;
193 	NAA7  = NAAN_array[7]  = NAA3 * NAA4;
194 	NAA8  = NAAN_array[8]  = NAA4 * NAA4;
195 	NAA9  = NAAN_array[9]  = NAA4 * NAA5;
196 	NAA10 = NAAN_array[10] = NAA5 * NAA5;
197 	NAA11 = NAAN_array[11] = NAA5 * NAA6;
198 	NAA12 = NAAN_array[12] = NAA6 * NAA6;
199 }
200 
201 extern Options options;
202 ScoreMatrix mat;
203 Vector<int> Comp_AAN_idx;
204 
make_comp_iseq(int len,char * iseq_comp,char * iseq)205 void make_comp_iseq(int len, char *iseq_comp, char *iseq)
206 {
207 	int i, c[6] = {3,2,1,0,4,5};
208 	for (i=0; i<len; i++) iseq_comp[i] = c[ (int)iseq[len-i-1] ];
209 } // make_comp_iseq
210 
SetOptionCommon(const char * flag,const char * value)211 bool Options::SetOptionCommon( const char *flag, const char *value )
212 {
213 	int intval = atoi( value );
214 	if      (strcmp(flag, "-i" ) == 0) input = value;
215 	else if (strcmp(flag, "-j" ) == 0) input_pe = value;
216 	else if (strcmp(flag, "-o" ) == 0) output = value;
217 	else if (strcmp(flag, "-op") == 0) output_pe = value;
218 	else if (strcmp(flag, "-M" ) == 0) max_memory  = atoll(value) * 1000000;
219 	else if (strcmp(flag, "-l" ) == 0) min_length  = intval;
220 	else if (strcmp(flag, "-c" ) == 0) cluster_thd  = atof(value), useIdentity = true;
221 	else if (strcmp(flag, "-D" ) == 0) distance_thd  = atof(value), useDistance = true;
222 	else if (strcmp(flag, "-b" ) == 0) band_width  = intval;
223 	else if (strcmp(flag, "-n" ) == 0) NAA       = intval;
224 	else if (strcmp(flag, "-d" ) == 0) des_len   = intval;
225 	else if (strcmp(flag, "-s" ) == 0) diff_cutoff  = atof(value);
226 	else if (strcmp(flag, "-S" ) == 0) diff_cutoff_aa  = intval;
227 	else if (strcmp(flag, "-B" ) == 0) store_disk  = intval;
228 	else if (strcmp(flag, "-P" ) == 0) PE_mode  = intval;
229 	else if (strcmp(flag, "-cx") == 0) trim_len = intval;
230 	else if (strcmp(flag, "-cy") == 0) trim_len_R2 = intval;
231 	else if (strcmp(flag, "-ap") == 0) align_pos = intval;
232 	else if (strcmp(flag, "-sc") == 0) sort_output = intval;
233 	else if (strcmp(flag, "-sf") == 0) sort_outputf = intval;
234 	else if (strcmp(flag, "-p" ) == 0) print  = intval;
235 	else if (strcmp(flag, "-g" ) == 0) cluster_best  = intval;
236 	else if (strcmp(flag, "-G" ) == 0) global_identity  = intval;
237 	else if (strcmp(flag, "-aL") == 0) long_coverage = atof(value);
238 	else if (strcmp(flag, "-AL") == 0) long_control = intval;
239 	else if (strcmp(flag, "-aS") == 0) short_coverage = atof(value);
240 	else if (strcmp(flag, "-AS") == 0) short_control = intval;
241 	else if (strcmp(flag, "-A" ) == 0) min_control  = intval;
242 	else if (strcmp(flag, "-uL") == 0) long_unmatch_per = atof(value);
243 	else if (strcmp(flag, "-uS") == 0) short_unmatch_per = atof(value);
244 	else if (strcmp(flag, "-U") == 0) unmatch_len = intval;
245 	else if (strcmp(flag, "-tmp" ) == 0) temp_dir  = value;
246 	else if (strcmp(flag, "-bak" ) == 0) backupFile  = intval;
247 	else if (strcmp(flag, "-T" ) == 0){
248 #ifndef NO_OPENMP
249 		int cpu = omp_get_num_procs();
250 		threads  = intval;
251 		if( threads > cpu ){
252 			threads = cpu;
253 			printf( "Warning: total number of CPUs in the system is %i\n", cpu );
254 		}else if( threads < 0 ){
255 			threads += cpu;
256 			if( threads < 0 ) threads = 0;
257 		}
258 		if( threads == 0 ){
259 			threads = cpu;
260 			printf( "total number of CPUs in the system is %i\n", cpu );
261 		}
262 		if( threads != intval ) printf( "Actual number of CPUs to be used: %i\n\n", threads );
263 #else
264 		printf( "Option -T is ignored: multi-threading with OpenMP is NOT enabled!\n" );
265 #endif
266 	}else return false;
267 	return true;
268 }
SetOption(const char * flag,const char * value)269 bool Options::SetOption( const char *flag, const char *value )
270 {
271 	if( is454 ){
272 		if( strcmp(flag, "-s") == 0 ) return false;
273 		else if( strcmp(flag, "-S") == 0 ) return false;
274 		else if( strcmp(flag, "-G") == 0 ) return false;
275 		else if( strcmp(flag, "-A") == 0 ) return false;
276 		else if( strcmp(flag, "-r") == 0 ) return false;
277 		else if( strcmp(flag, "-D") == 0 ){ max_indel = atoi(value); return true; }
278 	}
279 	if( SetOptionCommon( flag, value ) ) return true;
280 	if (strcmp(flag, "-t" ) == 0) tolerance = atoi(value);
281 	else if (strcmp(flag, "-F" ) == 0) frag_size = atoi(value);
282 	else if (has2D && SetOption2D( flag, value ) ) return true;
283 	else if (isEST && SetOptionEST( flag, value ) ) return true;
284 	else return false;
285 	return true;
286 }
SetOption2D(const char * flag,const char * value)287 bool Options::SetOption2D( const char *flag, const char *value )
288 {
289 	if( SetOptionCommon( flag, value ) ) return true;
290 	if (strcmp(flag, "-i2" ) == 0) input2 = value;
291         else if (strcmp(flag, "-j2" ) == 0) input2_pe = value;
292 	else if (strcmp(flag, "-s2") == 0) diff_cutoff2 = atof(value);
293 	else if (strcmp(flag, "-S2") == 0) diff_cutoff_aa2 = atoi(value);
294 	else return false;
295 	return true;
296 }
SetOptionEST(const char * flag,const char * value)297 bool Options::SetOptionEST( const char *flag, const char *value )
298 {
299 	NAA_top_limit = 12;
300 	if( SetOptionCommon( flag, value ) ) return true;
301 	if (strcmp(flag, "-r" ) == 0) option_r  = atoi(value);
302 	else if (strcmp(flag, "-gap") == 0) mat.gap = MAX_SEQ * atoi(value);
303 	else if (strcmp(flag, "-gap-ext") == 0) mat.ext_gap = MAX_SEQ * atoi(value);
304 	else if (strcmp(flag, "-match") == 0) mat.set_match( atoi(value) );
305 	else if (strcmp(flag, "-mismatch") == 0) mat.set_mismatch( atoi(value) );
306 	else if (strcmp(flag, "-mask") == 0){
307 		string letters = value;
308 		int i, n = letters.size();
309 		for(i=0; i<n; i++){
310 			char ch = toupper( letters[i] );
311 			if( ch < 'A' || ch > 'Z' ) continue;
312 			na2idx[ ch - 'A' ] = 5;
313 		}
314 		setaa_to_na();
315 	}else return false;
316 	return true;
317 }
SetOptions(int argc,char * argv[],bool twod,bool est)318 bool Options::SetOptions( int argc, char *argv[], bool twod, bool est )
319 {
320 	int i, n;
321 	char date[100];
322 	strcpy( date, __DATE__ );
323 	n = strlen( date );
324 	for(i=1; i<n; i++) if( date[i-1] == ' ' and date[i] == ' ' ) date[i] = '0';
325 	printf( "================================================================\n" );
326 	printf( "Program: CD-HIT, V" CDHIT_VERSION WITH_OPENMP ", %s, " __TIME__ "\n", date );
327 	printf( "Command:" );
328 	n = 9;
329 	for(i=0; i<argc; i++){
330 		n += strlen( argv[i] ) + 1;
331 		if( n >= 64 ){
332 			printf( "\n         %s", argv[i] );
333 			n = strlen( argv[i] ) + 9;
334 		}else{
335 			printf( " %s", argv[i] );
336 		}
337 	}
338 	printf( "\n\n" );
339 	time_t tm = time(NULL);
340 	printf( "Started: %s", ctime(&tm) );
341 	printf( "================================================================\n" );
342 	printf( "                            Output                              \n" );
343 	printf( "----------------------------------------------------------------\n" );
344 	has2D = twod;
345 	isEST = est;
346 	for (i=1; i+1<argc; i+=2) if ( SetOption( argv[i], argv[i+1] ) == 0) return false;
347 	if( i < argc ) return false;
348 
349 	atexit( CleanUpTempFiles );
350 	return true;
351 }
Validate()352 void Options::Validate()
353 {
354 	if( useIdentity and useDistance ) bomb_error( "can not use both identity cutoff and distance cutoff" );
355 	if( useDistance ){
356 		if ((distance_thd > 1.0) || (distance_thd < 0.0)) bomb_error("invalid distance threshold");
357 	}else if( isEST ){
358 		if ((cluster_thd > 1.0) || (cluster_thd < 0.8)) bomb_error("invalid clstr threshold, should >=0.8");
359 	}else{
360 		if ((cluster_thd > 1.0) || (cluster_thd < 0.4)) bomb_error("invalid clstr");
361 	}
362 
363         if (input.size()  == 0) bomb_error("no input file");
364         if (output.size() == 0) bomb_error("no output file");
365         if (PE_mode) {
366           if (input_pe.size()  == 0) bomb_error("no input file for R2 sequences in PE mode");
367           if (output_pe.size() == 0) bomb_error("no output file for R2 sequences in PE mode");
368         }
369         if (isEST && (align_pos==1)) option_r = 0;
370 
371 	if (band_width < 1 ) bomb_error("invalid band width");
372 	if (NAA < 2 || NAA > NAA_top_limit) bomb_error("invalid word length");
373 	if (des_len < 0 ) bomb_error("too short description, not enough to identify sequences");
374 	if (not isEST && (tolerance < 0 || tolerance > 5) ) bomb_error("invalid tolerance");
375 	if ((diff_cutoff<0) || (diff_cutoff>1)) bomb_error("invalid value for -s");
376 	if (diff_cutoff_aa<0) bomb_error("invalid value for -S");
377 	if( has2D ){
378 		if ((diff_cutoff2<0) || (diff_cutoff2>1)) bomb_error("invalid value for -s2");
379 		if (diff_cutoff_aa2<0) bomb_error("invalid value for -S2");
380           if (PE_mode) {
381             if (input2_pe.size()  == 0) bomb_error("no input file for R2 sequences for 2nd db in PE mode");
382           }
383 	}
384 	if (global_identity == 0) print = 1;
385 	if (short_coverage < long_coverage) short_coverage = long_coverage;
386 	if (short_control > long_control) short_control = long_control;
387 	if ((global_identity == 0) && (short_coverage == 0.0) && (min_control == 0))
388 		bomb_error("You are using local identity, but no -aS -aL -A option");
389 	if (frag_size < 0) bomb_error("invalid fragment size");
390 
391 #if 0
392 	if( useDistance ){
393 		/* when required_aan becomes zero */
394 		if( distance_thd * NAA >= 1 )
395 			bomb_warning( "word length is too long for the distance cutoff" );
396 	}else{
397 		/* when required_aan becomes zero */
398 		if( cluster_thd <= 1.0 - 1.0 / NAA )
399 			bomb_warning( "word length is too long for the identity cutoff" );
400 	}
401 #endif
402 
403 	const char *message = "Your word length is %i, using %i may be faster!\n";
404 	if ( not isEST && tolerance ) {
405 		int i, clstr_idx = (int) (cluster_thd * 100) - naa_stat_start_percent;
406 		int tcutoff = naa_stat[tolerance-1][clstr_idx][5-NAA];
407 
408 		if (tcutoff < 5 )
409 			bomb_error("Too low cluster threshold for the word length.\n"
410 					"Increase the threshold or the tolerance, or decrease the word length.");
411 		for ( i=5; i>NAA; i--) {
412 			if ( naa_stat[tolerance-1][clstr_idx][5-i] > 10 ) {
413 				printf( message, NAA, i );
414 				break;
415 			}
416 		}
417 	} else if( isEST ) {
418 		if      ( cluster_thd > 0.9  && NAA < 8 ) printf( message, NAA, 8 );
419 		else if ( cluster_thd > 0.87 && NAA < 5 ) printf( message, NAA, 5 );
420 		else if ( cluster_thd > 0.80 && NAA < 4 ) printf( message, NAA, 4 );
421 		else if ( cluster_thd > 0.75 && NAA < 3 ) printf( message, NAA, 3 );
422 	} else {
423 		if      ( cluster_thd > 0.85 && NAA < 5 ) printf( message, NAA, 5 );
424 		else if ( cluster_thd > 0.80 && NAA < 4 ) printf( message, NAA, 4 );
425 		else if ( cluster_thd > 0.75 && NAA < 3 ) printf( message, NAA, 3 );
426 	}
427 
428 	if ( (min_length + 1) < NAA ) bomb_error("Too short -l, redefine it");
429 }
Print()430 void Options::Print()
431 {
432 	printf( "isEST = %i\n", isEST );
433 	printf( "has2D = %i\n", has2D );
434 	printf( "NAA = %i\n", NAA );
435 	printf( "NAA_top_limit = %i\n", NAA_top_limit );
436 	printf( "min_length = %i\n", min_length );
437 	printf( "cluster_best = %i\n", cluster_best );
438 	printf( "global_identity = %i\n", global_identity );
439 	printf( "cluster_thd = %g\n", cluster_thd );
440 	printf( "diff_cutoff = %g\n", diff_cutoff );
441 	printf( "diff_cutoff_aa = %i\n", diff_cutoff_aa );
442 	printf( "tolerance = %i\n", tolerance );
443 	printf( "long_coverage = %g\n", long_coverage );
444 	printf( "long_control = %i\n", long_control );
445 	printf( "short_coverage = %g\n", short_coverage );
446 	printf( "short_control = %i\n", short_control );
447 	printf( "frag_size = %i\n", frag_size );
448 	printf( "option_r = %i\n", option_r );
449 	printf( "print = %i\n", print );
450 }
451 
bomb_error(const char * message)452 void bomb_error(const char *message)
453 {
454 	fprintf( stderr, "\nFatal Error:\n%s\nProgram halted !!\n\n", message );
455 	temp_files.Clear();
456 	exit (1);
457 } // END void bomb_error
458 
bomb_error(const char * message,const char * message2)459 void bomb_error(const char *message, const char *message2)
460 {
461 	fprintf( stderr, "\nFatal Error:\n%s %s\nProgram halted !!\n\n", message, message2 );
462 	temp_files.Clear();
463 	exit (1);
464 } // END void bomb_error
465 
466 
bomb_warning(const char * message)467 void bomb_warning(const char *message)
468 {
469 	fprintf( stderr, "\nWarning:\n%s\nNot fatal, but may affect results !!\n\n", message );
470 } // END void bomb_warning
471 
472 
bomb_warning(const char * message,const char * message2)473 void bomb_warning(const char *message, const char *message2)
474 {
475 	fprintf( stderr, "\nWarning:\n%s %s\nNot fatal, but may affect results !!\n\n", message, message2 );
476 } // END void bomb_warning
477 
format_seq(char * seq)478 void format_seq(char *seq)
479 {
480 	int i, j;
481 	char c1;
482 	int len = strlen(seq);
483 
484 	for (i=0,j=0; i<len; i++) {
485 		c1 = toupper(seq[i]);
486 		if ( isalpha(c1) ) seq[j++] = c1;
487 	}
488 	seq[j] = 0;
489 } // END void format_seq
490 
strrev(char * p)491 void strrev(char *p)
492 {
493   char *q = p;
494   while(q && *q) ++q;
495   for(--q; p < q; ++p, --q)
496     *p = *p ^ *q,
497     *q = *p ^ *q,
498     *p = *p ^ *q;
499 }
500 
501 ////For smiple len1 <= len2, len2 is for existing representative
502 ////walk along all diag path of two sequences,
503 ////find the diags with most aap
504 ////return top n diags
505 ////added on 2006 11 13
506 ////band 0                      XXXXXXXXXXXXXXXXXX               seq2, rep seq
507 ////                            XXXXXXXXXXXXXXX                  seq1
508 ////band 1                      XXXXXXXXXXXXXXXXXX               seq2, rep seq
509 ////                             XXXXXXXXXXXXXXX                 seq1
510 ////extreme right (+)           XXXXXXXXXXXXXXXXXX               seq2, rep seq
511 ////    band = len2-1                            XXXXXXXXXXXXXXX seq1
512 ////band-1                      XXXXXXXXXXXXXXXXXX               seq2, rep seq
513 ////                           XXXXXXXXXXXXXXX                   seq1
514 ////extreme left (-)            XXXXXXXXXXXXXXXXXX               seq2, rep seq
515 ////              XXXXXXXXXXXXXXX   band = -(len1-1)             seq1
516 ////index of diag_score = band+len1-1;
diag_test_aapn(int NAA1,char iseq2[],int len1,int len2,WorkingBuffer & buffer,int & best_sum,int band_width,int & band_left,int & band_center,int & band_right,int required_aa1)517 int diag_test_aapn(int NAA1, char iseq2[], int len1, int len2, WorkingBuffer & buffer,
518 		int &best_sum, int band_width, int &band_left, int &band_center, int &band_right, int required_aa1)
519 {
520 	int i, i1, j, k;
521 	int *pp;
522 	int nall = len1+len2-1;
523 	Vector<int> & taap = buffer.taap;
524 	Vector<INTs> & aap_begin = buffer.aap_begin;
525 	Vector<INTs> & aap_list = buffer.aap_list;
526 	Vector<int> & diag_score = buffer.diag_score;
527 	Vector<int> & diag_score2 = buffer.diag_score2;
528 
529 	if (nall > MAX_DIAG) bomb_error("in diag_test_aapn, MAX_DIAG reached");
530 	for (pp=&diag_score[0], i=nall; i; i--, pp++) *pp=0;
531 	for (pp=&diag_score2[0], i=nall; i; i--, pp++) *pp=0;
532 
533 	int c22, cpx;
534 	INTs *bip;
535 	int len11 = len1-1;
536 	int len22 = len2-1;
537 	i1 = len11;
538 	for (i=0; i<len22; i++,i1++) {
539 		c22 = iseq2[i]*NAA1+ iseq2[i+1];
540 		cpx = 1 + (iseq2[i] != iseq2[i+1]);
541 		if ( (j=taap[c22]) == 0) continue;
542 		int m = aap_begin[c22];
543 		for(int k=0; k<j; k++){
544 			diag_score[ i1 - aap_list[m+k] ] ++;
545 			diag_score2[ i1 - aap_list[m+k] ] += cpx;
546 		}
547 	}
548 
549 	//find the best band range
550 	//  int band_b = required_aa1;
551 	int band_b = required_aa1-1 >= 0 ? required_aa1-1:0;  // on dec 21 2001
552 	int band_e = nall - band_b;
553 
554 	int band_m = ( band_b+band_width-1 < band_e ) ? band_b+band_width-1 : band_e;
555 	int best_score=0;
556 	int best_score2=0;
557 	int max_diag = 0;
558 	int max_diag2 = 0;
559 	int imax_diag = 0;
560 	for (i=band_b; i<=band_m; i++){
561 		best_score += diag_score[i];
562 		best_score2 += diag_score2[i];
563 		if( diag_score2[i] > max_diag2 ){
564 			max_diag2 = diag_score2[i];
565 			max_diag = diag_score[i];
566 			imax_diag = i;
567 		}
568 	}
569 	int from=band_b;
570 	int end =band_m;
571 	int score = best_score;
572 	int score2 = best_score2;
573 	for (k=from, j=band_m+1; j<band_e; j++, k++) {
574 		score -= diag_score[k];
575 		score += diag_score[j];
576 		score2 -= diag_score2[k];
577 		score2 += diag_score2[j];
578 		if ( score2 > best_score2 ) {
579 			from = k + 1;
580 			end  = j;
581 			best_score = score;
582 			best_score2 = score2;
583 			if( diag_score2[j] > max_diag2 ){
584 				max_diag2 = diag_score2[j];
585 				max_diag = diag_score[j];
586 				imax_diag = j;
587 			}
588 		}
589 	}
590 	int mlen = imax_diag;
591 	if( imax_diag > len1 ) mlen = nall - imax_diag;
592 	int emax = int((1.0 - options.cluster_thd) * mlen) + 1;
593 	for (j=from; j<imax_diag; j++) { // if aap pairs fail to open gap
594 		if ( (imax_diag - j) > emax || diag_score[j] < 1 ) {
595 			best_score -= diag_score[j]; from++;
596 		} else break;
597 	}
598 	for (j=end; j>imax_diag; j--) { // if aap pairs fail to open gap
599 		if ( (j - imax_diag) > emax || diag_score[j] < 1 ) {
600 			best_score -= diag_score[j]; end--;
601 		} else break;
602 	}
603 
604 	//  delete [] diag_score;
605 	band_left = from - len1 + 1;
606 	band_right= end - len1 + 1;
607 	band_center = imax_diag - len1 + 1;
608 	best_sum = best_score;
609 	return OK_FUNC;
610 }
611 // END diag_test_aapn
612 
613 
diag_test_aapn_est(int NAA1,char iseq2[],int len1,int len2,WorkingBuffer & buffer,int & best_sum,int band_width,int & band_left,int & band_center,int & band_right,int required_aa1)614 int diag_test_aapn_est(int NAA1, char iseq2[], int len1, int len2, WorkingBuffer & buffer,
615         int &best_sum, int band_width, int &band_left, int &band_center, int &band_right, int required_aa1)
616 {
617 	int i, i1, j, k;
618 	int *pp, *pp2;
619 	int nall = len1+len2-1;
620 	int NAA2 = NAA1 * NAA1;
621 	int NAA3 = NAA2 * NAA1;
622 	Vector<int> & taap = buffer.taap;
623 	Vector<INTs> & aap_begin = buffer.aap_begin;
624 	Vector<INTs> & aap_list = buffer.aap_list;
625 	Vector<int> & diag_score = buffer.diag_score;
626 	Vector<int> & diag_score2 = buffer.diag_score2;
627 
628 	if (nall > MAX_DIAG) bomb_error("in diag_test_aapn_est, MAX_DIAG reached");
629 	pp = & diag_score[0];
630 	pp2 = & diag_score2[0];
631 	for (i=nall; i; i--, pp++, pp2++) *pp = *pp2 =0;
632 
633 	INTs *bip;
634 	int c22, cpx;
635 	int len22 = len2-3;
636 	i1 = len1-1;
637 	for (i=0; i<len22; i++,i1++,iseq2++) {
638 		unsigned char c0 = iseq2[0];
639 		unsigned char c1 = iseq2[1];
640 		unsigned char c2 = iseq2[2];
641 		unsigned char c3 = iseq2[3];
642 		if ((c0>=4) || (c1>=4) || (c2>=4) || (c3>=4)) continue; //skip N
643 
644 		c22 = c0*NAA3+ c1*NAA2 + c2*NAA1 + c3;
645 		if ( (j=taap[c22]) == 0) continue;
646 		cpx = 1 + (c0 != c1) + (c1 != c2) + (c2 != c3);
647 		bip = & aap_list[ aap_begin[c22] ];     //    bi = aap_begin[c22];
648 		for (; j; j--, bip++) {
649 			diag_score[i1 - *bip]++;
650 			diag_score2[i1 - *bip] += cpx;
651 		}
652 	}
653 #if 0
654 	int mmax = 0;
655 	int immax = 0;
656 	for(i=0; i<=nall; i++){
657 		if( i%len2 ==0 or i == nall ) printf( "\n" );
658 		printf( "%3i ", diag_score[i] );
659 		if( diag_score[i] > mmax ){
660 			mmax = diag_score[i];
661 			immax = i;
662 		}
663 	}
664 #endif
665 
666 	//find the best band range
667 	//  int band_b = required_aa1;
668 	int band_b = required_aa1-1 >= 0 ? required_aa1-1:0;  // on dec 21 2001
669 	int band_e = nall - band_b;
670 
671 	if( options.is454 ){
672 		band_b = len1 - band_width;
673 		band_e = len1 + band_width;
674 		if( band_b < 0 ) band_b = 0;
675 		if( band_e > nall ) band_e = nall;
676 	}
677 
678 	int band_m = ( band_b+band_width-1 < band_e ) ? band_b+band_width-1 : band_e;
679 	int best_score=0;
680 	int best_score2=0;
681 	int max_diag = 0;
682 	int max_diag2 = 0;
683 	int imax_diag = 0;
684 	for (i=band_b; i<=band_m; i++){
685 		best_score += diag_score[i];
686 		best_score2 += diag_score2[i];
687 		if( diag_score2[i] > max_diag2 ){
688 			max_diag2 = diag_score2[i];
689 			max_diag = diag_score[i];
690 			imax_diag = i;
691 		}
692 	}
693 	int from=band_b;
694 	int end =band_m;
695 	int score = best_score;
696 	int score2 = best_score2;
697 
698 	for (k=from, j=band_m+1; j<band_e; j++, k++) {
699 		score -= diag_score[k];
700 		score += diag_score[j];
701 		score2 -= diag_score2[k];
702 		score2 += diag_score2[j];
703 		if ( score2 > best_score2 ) {
704 			from = k + 1;
705 			end  = j;
706 			best_score = score;
707 			best_score2 = score2;
708 			if( diag_score2[j] > max_diag2 ){
709 				max_diag2 = diag_score2[j];
710 				max_diag = diag_score[j];
711 				imax_diag = j;
712 			}
713 		}
714 	}
715 #if 0
716 	printf( "%i %i\n", required_aa1, from );
717 	printf( "max=%3i  imax=%3i; band:  %3i  %3i  %i\n", max_diag, imax_diag, band_b, band_e, band_m );
718 	printf( "best: %i\n", best_score );
719 	printf( "from: %i, end: %i,  best: %i\n", from, end, best_score );
720 #endif
721 	int mlen = imax_diag;
722 	if( imax_diag > len1 ) mlen = nall - imax_diag;
723 	int emax = int((1.0 - options.cluster_thd) * mlen) + 1;
724 	for (j=from; j<imax_diag; j++) { // if aap pairs fail to open gap
725 		if ( (imax_diag - j) > emax || diag_score[j] < 1 ) {
726 			best_score -= diag_score[j]; from++;
727 		} else break;
728 	}
729 	for (j=end; j>imax_diag; j--) { // if aap pairs fail to open gap
730 		if ( (j - imax_diag) > emax || diag_score[j] < 1 ) {
731 			best_score -= diag_score[j]; end--;
732 		} else break;
733 	}
734 
735 	band_left = from-len1+1;
736 	band_right= end-len1+1;
737 	band_center = imax_diag - len1 + 1;
738 	best_sum = best_score;
739 	if( options.is454 ){
740 		if( band_left > 0 ) best_sum = 0;
741 		if( band_right < 0 ) best_sum = 0;
742 	}
743 #if 0
744 	printf( "%3i:  best: %i,  %i  %i  %i\n", required_aa1, best_score, band_left, band_right, band_width );
745 	printf( "max=%3i  imax=%3i; band:  %3i  %3i  %i\n", mmax, immax, band_b, band_e, band_m );
746 #endif
747 	return OK_FUNC;
748 }
749 // END diag_test_aapn_est
750 
751 
752 /*
753 local alignment of two sequence within a diag band
754 for band 0 means direction (0,0) -> (1,1)
755          1 means direction (0,1) -> (1,2)
756         -1 means direction (1,0) -> (2,1)
757 added on 2006 11 13
758 band 0                      XXXXXXXXXXXXXXXXXX               seq2, rep seq
759                             XXXXXXXXXXXXXXX                  seq1
760 band 1                      XXXXXXXXXXXXXXXXXX               seq2, rep seq
761                              XXXXXXXXXXXXXXX                 seq1
762 extreme right (+)           XXXXXXXXXXXXXXXXXX               seq2, rep seq
763     band = len2-1                            XXXXXXXXXXXXXXX seq1
764 band-1                      XXXXXXXXXXXXXXXXXX               seq2, rep seq
765                            XXXXXXXXXXXXXXX                   seq1
766 extreme left (-)            XXXXXXXXXXXXXXXXXX               seq2, rep seq
767               XXXXXXXXXXXXXXX   band = -(len1-1)             seq1
768 iseq len are integer sequence and its length,
769 mat is matrix, return ALN_PAIR class
770 
771        band:  -101   seq2 len2 = 17
772                 \\\1234567890123456
773               0  \xxxxxxxxxxxxxxxxx
774               1   xxxxxxxxxxxxxxxxx\ most right band = len2-1
775               2   xxxxxxxxxxxxxxxxx
776     seq1      3   xxxxxxxxxxxxxxxxx
777     len1 = 11 4   xxxxxxxxxxxxxxxxx
778               5   xxxxxxxxxxxxxxxxx
779               6   xxxxxxxxxxxxxxxxx
780               7   xxxxxxxxxxxxxxxxx
781               8   xxxxxxxxxxxxxxxxx
782               9   xxxxxxxxxxxxxxxxx
783               0   xxxxxxxxxxxxxxxxx
784                   \
785                    most left band = -(len1-1)
786 
787 */
788 
local_band_align(char iseq1[],char iseq2[],int len1,int len2,ScoreMatrix & mat,int & best_score,int & iden_no,int & alnln,float & dist,int * alninfo,int band_left,int band_center,int band_right,WorkingBuffer & buffer)789 int local_band_align( char iseq1[], char iseq2[], int len1, int len2, ScoreMatrix &mat,
790 		int &best_score, int &iden_no, int &alnln, float &dist, int *alninfo,
791 		int band_left, int band_center, int band_right, WorkingBuffer & buffer)
792 {
793 	int i, j, k, j1;
794 	int jj, kk;
795 	int iden_no1;
796 	int64_t best_score1;
797 	iden_no = 0;
798 
799 	if ( (band_right >= len2 ) ||
800 			(band_left  <= -len1) ||
801 			(band_left  > band_right) ) return FAILED_FUNC;
802 
803 	// allocate mem for score_mat[len1][len2] etc
804 	int band_width = band_right - band_left + 1;
805 	int band_width1 = band_width + 1;
806 
807     // score_mat, back_mat [i][j]: i index of seqi (0 to len(seqi)-1), j index of band (0 to band_width-1)
808 	MatrixInt64 & score_mat = buffer.score_mat;
809 	MatrixInt   & back_mat = buffer.back_mat;
810 
811 	//printf( "%i  %i\n", band_right, band_left );
812 
813 	if( score_mat.size() <= len1 ){
814 		VectorInt   row( band_width1, 0 );
815 		VectorInt64 row2( band_width1, 0 );
816 		while( score_mat.size() <= len1 ){
817 			score_mat.Append( row2 );
818 			back_mat.Append( row );
819 		}
820 	}
821 	for(i=0; i<=len1; i++){
822 		if( score_mat[i].Size() < band_width1 ) score_mat[i].Resize( band_width1 );
823 		if( back_mat[i].Size() < band_width1 ) back_mat[i].Resize( band_width1 );
824 	}
825 
826 	best_score = 0;
827 	/* seq1 is query, seq2 is rep
828                   seq2    len2 = 17       seq2    len2 = 17    seq2    len2 = 17
829                   01234567890123456       01234567890123456    01234567890123456
830        0          xxxxxxxxxxxxxxxxx \\\\\\XXXxxxxxxxxxxxxxx    xXXXXXXXxxxxxxxxx
831        1     \\\\\Xxxxxxxxxxxxxxxxx  \\\\\Xxx\xxxxxxxxxxxxx    xx\xxxxx\xxxxxxxx
832        2      \\\\X\xxxxxxxxxxxxxxx   \\\\Xxxx\xxxxxxxxxxxx    xxx\xxxxx\xxxxxxx
833   seq1 3       \\\Xx\xxxxxxxxxxxxxx    \\\Xxxxx\xxxxxxxxxxx    xxxx\xxxxx\xxxxxx
834   len1 4        \\Xxx\xxxxxxxxxxxxx     \\Xxxxxx\xxxxxxxxxx    xxxxx\xxxxx\xxxxx
835   = 11 5         \Xxxx\xxxxxxxxxxxx      \Xxxxxxx\xxxxxxxxx    xxxxxx\xxxxx\xxxx
836        6          Xxxxx\xxxxxxxxxxx       Xxxxxxxx\xxxxxxxx    xxxxxxx\xxxxx\xxx
837        7          x\xxxx\xxxxxxxxxx       x\xxxxxxx\xxxxxxx    xxxxxxxx\xxxxx\xx
838        8          xx\xxxx\xxxxxxxxx       xx\xxxxxxx\xxxxxx    xxxxxxxxx\xxxxx\x
839        9          xxx\xxxx\xxxxxxxx       xxx\xxxxxxx\xxxxx    xxxxxxxxxx\xxxxx\
840        0          xxxx\xxxx\xxxxxxx       xxxx\xxxxxxx\xxxx    xxxxxxxxxxx\xxxxx
841                   band_left < 0 (-6)      band_left < 0 (-6)   band_left >=0 (1)
842                   band_right < 0 (-1)     band_right >=0 (2)   band_right >=0(7)
843                   band_width 6            band_width 9         band_width 7
844        init score_mat, and iden_mat (place with upper 'X')
845      */
846 
847 	if (band_left < 0) {  //set score to left border of the matrix within band
848 		int tband = (band_right < 0) ? band_right : 0;
849 		//for (k=band_left; k<tband; k++)
850 		for (k=band_left; k<=tband; k++) { // fixed on 2006 11 14
851 			i = -k;
852 			j1 = k-band_left;
853 			// penalty for leading gap opening = penalty for gap extension
854             // each of the left side query hunging residues give ext_gap (-1)
855 			score_mat[i][j1] =  mat.ext_gap * i;
856 			back_mat[i][j1] = DP_BACK_TOP;
857 		}
858 		back_mat[-tband][tband-band_left] = DP_BACK_NONE;
859 	}
860 
861 	if (band_right >=0) { //set score to top border of the matrix within band
862 		int tband = (band_left > 0) ? band_left : 0;
863 		for (j=tband; j<=band_right; j++) {
864 			j1 = j-band_left;
865 			score_mat[0][j1] = mat.ext_gap * j;
866 			back_mat[0][j1] = DP_BACK_LEFT;
867 		}
868 		back_mat[0][tband-band_left] = DP_BACK_NONE;
869 	}
870 
871 	int gap_open[2] = { mat.gap, mat.ext_gap };
872 	int max_diag = band_center - band_left;
873 	int extra_score[4] = { 4, 3, 2, 1 };
874 	for (i=1; i<=len1; i++) {
875 		int J0 = 1 - band_left - i;
876 		int J1 = len2 - band_left - i;
877 		if( J0 < 0 ) J0 = 0;
878 		if( J1 >= band_width ) J1 = band_width;
879 		for (j1=J0; j1<=J1; j1++){
880 			j = j1+i+band_left;
881 
882 			int ci = iseq1[i-1];
883 			int cj = iseq2[j-1];
884 			int sij = mat.matrix[ci][cj];
885 			//int iden_ij = (ci == cj);
886 			int s1, k0, back;
887 
888 			/* extra score according to the distance to the best diagonal */
889 			int extra = extra_score[ abs(j1 - max_diag) & 3 ]; // max distance 3
890 			sij += extra * (sij>0);
891 
892 			back = DP_BACK_LEFT_TOP;
893 			best_score1 = score_mat[i-1][j1] + sij;
894 			int gap0 = gap_open[ (i == len1) | (j == len2) ];
895 			int gap = 0;
896 			int64_t score;
897 
898 			if( j1 > 0 ){
899 				gap = gap0;
900 				if( back_mat[i][j1-1] == DP_BACK_LEFT ) gap = mat.ext_gap;
901 				if( (score = score_mat[i][j1-1] + gap) > best_score1 ){
902 					back = DP_BACK_LEFT;
903 					best_score1 = score;
904 				}
905 			}
906 			if(j1+1<band_width){
907 				gap = gap0;
908 				if( back_mat[i-1][j1+1] == DP_BACK_TOP ) gap = mat.ext_gap;
909 				if( (score = score_mat[i-1][j1+1] + gap) > best_score1 ){
910 					back = DP_BACK_TOP;
911 					best_score1 = score;
912 				}
913 			}
914 			score_mat[i][j1] = best_score1;
915 			back_mat[i][j1]  = back;
916 			//printf( "%2i(%2i) ", best_score1, iden_no1 );
917 
918 		}
919 		//printf( "\n" );
920 	}
921 	i = j = 0;
922 	if( len2 - band_left < len1 ){
923 		i = len2 - band_left;
924 		j = len2;
925 	}else if( len1 + band_right < len2 ){
926 		i = len1;
927 		j = len1 + band_right;
928 	}else{
929 		i = len1;
930 		j = len2;
931 	}
932 	j1 = j - i - band_left;
933 	best_score = score_mat[i][j1];
934 	best_score1 = score_mat[i][j1];
935 
936 #if 1
937 	const char *letters = "acgtnx";
938 	const char *letters2 = "ACGTNX";
939 #else
940 	const char *letters = "arndcqeghilkmfpstwyvbzx";
941 	const char *letters2 = "ARNDCQEGHILKMFPSTWYVBZX";
942 #endif
943 	int back = back_mat[i][j1];
944 	int last = back;
945 	int count = 0, count2 = 0, count3 = 0;
946 	int match, begin1, begin2, end1, end2;
947 	int gbegin1=0, gbegin2=0, gend1=0, gend2=0;
948 	int64_t score, smin = best_score1, smax = best_score1 - 1;
949 	int posmin, posmax, pos = 0;
950 	int bl, dlen = 0, dcount = 0;
951 	posmin = posmax = 0;
952 	begin1 = begin2 = end1 = end2 = 0;
953 
954 #ifdef PRINT
955 #define PRINT
956 	printf( "%i %i\n", best_score, score_mat[i][j1] );
957 	printf( "%i %i %i\n", band_left, band_center, band_right );
958 	printf( "%i %i %i %i\n", i, j, j1, len2 );
959 #endif
960 #ifdef MAKEALIGN
961 #define MAKEALIGN
962 	char AA[ MAX_SEQ ], BB[ MAX_SEQ ];
963 	int NN = 0;
964 	int IA, IB;
965 	for(IA=len1;IA>i;IA--){
966 		AA[NN] = letters[ iseq1[IA-1] ];
967 		BB[NN++] = '-';
968 	}
969 	for(IB=len2;IB>j;IB--){
970 		AA[NN] = '-';
971 		BB[NN++] = letters[ iseq2[IB-1] ];
972 	}
973 #endif
974 
975 	int masked = 0;
976 	int indels = 0;
977 	int max_indels = 0;
978 	while( back != DP_BACK_NONE ){
979 		switch( back ){
980 		case DP_BACK_TOP  :
981 #ifdef PRINT
982 			printf( "%5i: %c %c %9i\n", pos, letters[ iseq1[i-1] ], '|', score_mat[i][j1] );
983 #endif
984 #ifdef MAKEALIGN
985 			AA[NN] = letters[ iseq1[i-1] ];
986 			BB[NN++] = '-';
987 #endif
988 			bl = (last != back) & (j != 1) & (j != len2);
989 			dlen += bl;
990 			dcount += bl;
991 			score = score_mat[i][j1];
992 			if( score < smin ){
993 				count2 = 0;
994 				smin = score;
995 				posmin = pos - 1;
996 				begin1 = i;
997 				begin2 = j;
998 			}
999 			i -= 1;
1000 			j1 += 1;
1001 			break;
1002 		case DP_BACK_LEFT :
1003 #ifdef PRINT
1004 			printf( "%5i: %c %c %9i\n", pos, '|', letters[ iseq2[j-1] ], score_mat[i][j1] );
1005 #endif
1006 #ifdef MAKEALIGN
1007 			AA[NN] = '-';
1008 			BB[NN++] = letters[ iseq2[j-1] ];
1009 #endif
1010 			bl = (last != back) & (i != 1) & (i != len1);
1011 			dlen += bl;
1012 			dcount += bl;
1013 			score = score_mat[i][j1];
1014 			if( score < smin ){
1015 				count2 = 0;
1016 				smin = score;
1017 				posmin = pos - 1;
1018 				begin1 = i;
1019 				begin2 = j;
1020 			}
1021 			j1 -= 1;
1022 			j -= 1;
1023 			break;
1024 		case DP_BACK_LEFT_TOP :
1025 #ifdef PRINT
1026 			if( iseq1[i-1] == iseq2[j-1] ){
1027 				printf( "%5i: %c %c %9i\n", pos, letters2[ iseq1[i-1] ], letters2[ iseq2[j-1] ], score_mat[i][j1] );
1028 			}else{
1029 				printf( "%5i: %c %c %9i\n", pos, letters[ iseq1[i-1] ], letters[ iseq2[j-1] ], score_mat[i][j1] );
1030 			}
1031 #endif
1032 #ifdef MAKEALIGN
1033 			if( iseq1[i-1] == iseq2[j-1] ){
1034 				AA[NN] = letters2[ iseq1[i-1] ];
1035 				BB[NN++] = letters2[ iseq2[j-1] ];
1036 			}else{
1037 				AA[NN] = letters[ iseq1[i-1] ];
1038 				BB[NN++] = letters[ iseq2[j-1] ];
1039 			}
1040 #endif
1041 			if( alninfo && options.global_identity ){
1042 				if( i == 1 || j == 1 ){
1043 					gbegin1 = i-1;
1044 					gbegin2 = j-1;
1045 				}else if( i == len1 || j == len2 ){
1046 					gend1 = i-1;
1047 					gend2 = j-1;
1048 				}
1049 			}
1050 			score = score_mat[i][j1];
1051 			i -= 1;
1052 			j -= 1;
1053 			match = iseq1[i] == iseq2[j];
1054 			if( score > smax ){
1055 				count = 0;
1056 				smax = score;
1057 				posmax = pos;
1058 				end1 = i;
1059 				end2 = j;
1060 			}
1061 			if( options.isEST && (iseq1[i] > 4 || iseq2[j] > 4) ){
1062 				masked += 1;
1063 			}else{
1064 				dlen += 1;
1065 				dcount += ! match;
1066 				count += match;
1067 				count2 += match;
1068 				count3 += match;
1069 			}
1070 			if( score < smin ){
1071 				int mm = match == 0;
1072 				count2 = 0;
1073 				smin = score;
1074 				posmin = pos - mm;
1075 				begin1 = i + mm;
1076 				begin2 = j + mm;
1077 			}
1078 			break;
1079 		default : printf( "%i\n", back ); break;
1080 		}
1081 		if( options.is454 ){
1082 			if( back == DP_BACK_LEFT_TOP ){
1083 				if( indels > max_indels ) max_indels = indels;
1084 				indels = 0;
1085 			}else{
1086 				if( last == DP_BACK_LEFT_TOP ){
1087 					indels = 1;
1088 				}else if( indels ){
1089 					indels += 1;
1090 				}
1091 			}
1092 		}
1093 		pos += 1;
1094 		last = back;
1095 		back = back_mat[i][j1];
1096 	}
1097 	if( options.is454 and max_indels > options.max_indel ) return FAILED_FUNC;
1098 	iden_no = options.global_identity ? count3 : count - count2;
1099 	alnln = posmin - posmax + 1 - masked;
1100 	dist = dcount/(float)dlen;
1101 	//dist = - 0.75 * log( 1.0 - dist * 4.0 / 3.0 );
1102 	int umtail1 = len1 - 1 - end1;
1103 	int umtail2 = len2 - 1 - end2;
1104 	int umhead = begin1 < begin2 ? begin1 : begin2;
1105 	int umtail = umtail1 < umtail2 ? umtail1 : umtail2;
1106 	int umlen = umhead + umtail;
1107 	if( umlen > options.unmatch_len ) return FAILED_FUNC;
1108 	if( umlen > len1 * options.short_unmatch_per ) return FAILED_FUNC;
1109 	if( umlen > len2 * options.long_unmatch_per ) return FAILED_FUNC;
1110 	if( alninfo ){
1111 		alninfo[0] = begin1;
1112 		alninfo[1] = end1;
1113 		alninfo[2] = begin2;
1114 		alninfo[3] = end2;
1115 		alninfo[4] = masked;
1116 		if( options.global_identity ){
1117 			alninfo[0] = gbegin1;
1118 			alninfo[1] = gend1;
1119 			alninfo[2] = gbegin2;
1120 			alninfo[3] = gend2;
1121 		}
1122 	}
1123 #ifdef PRINT
1124 	printf( "%6i %6i:  %4i %4i %4i %4i\n", alnln, iden_no, begin1, end1, begin2, end2 );
1125 	printf( "%6i %6i:  %4i %4i\n", posmin, posmax, posmin - posmax, count - count2 );
1126 	printf( "smin = %9i, smax = %9i\n", smin, smax );
1127 	printf( "dlen = %5i, dcount = %5i, dist = %.3f\n", dlen, dcount, dcount/(float)dlen );
1128 #endif
1129 #ifdef MAKEALIGN
1130 	float identity = iden_no / (float)( options.global_identity ? (len1 - masked) : alnln);
1131 	if( identity < options.cluster_thd ) return OK_FUNC;
1132 	while(i--){
1133 		AA[NN] = letters[ iseq1[i-1] ];
1134 		BB[NN++] = '-';
1135 	}
1136 	while(j--){
1137 		AA[NN] = '-';
1138 		BB[NN++] = letters[ iseq2[j-1] ];
1139 	}
1140 	AA[NN] = '\0';
1141 	BB[NN] = '\0';
1142 	for(i=0; i<NN/2; i++){
1143 		char aa = AA[i], bb = BB[i];
1144 		AA[i] = AA[NN-i-1];
1145 		BB[i] = BB[NN-i-1];
1146 		AA[NN-i-1] = aa;
1147 		BB[NN-i-1] = bb;
1148 	}
1149 	static int fcount = 0;
1150 	fcount += 1;
1151 	FILE *fout = fopen( "alignments.txt", "a" );
1152 	if( fout == NULL ){
1153 		if( fcount <= 1 ) printf( "alignment files open failed\n" );
1154 		return OK_FUNC;
1155 	}
1156 	fprintf( fout, "\n\n######################################################\n" );
1157 	fprintf( fout, "# length X = %i\n", len2 );
1158 	fprintf( fout, "# length Y = %i\n", len1 );
1159 	fprintf( fout, "# best align X: %i-%i\n", begin2+1, end2+1 );
1160 	fprintf( fout, "# best align Y: %i-%i\n", begin1+1, end1+1 );
1161 	if( alninfo ){
1162 		fprintf( fout, "# align X: %i-%i\n", alninfo[2]+1, alninfo[3]+1 );
1163 		fprintf( fout, "# align Y: %i-%i\n", alninfo[0]+1, alninfo[1]+1 );
1164 	}
1165 	fprintf( fout, "# alignment length: %i\n", alnln );
1166 	fprintf( fout, "# identity count: %i\n", iden_no );
1167 	fprintf( fout, "# identity: %g\n", identity );
1168 	fprintf( fout, "# distance: %g\n", dist );
1169 	if( options.is454 ) fprintf( fout, "# max indel: %i\n", max_indels );
1170 #if 0
1171 	fprintf( fout, "%i %s\n", seq1->index, AA );
1172 	fprintf( fout, "%i %s\n", seq2->index, BB );
1173 #else
1174 	bool printaa = true;
1175 	IB = IA = 0;
1176 	fprintf( fout, "\n\nX " );
1177 	while( IA < NN ){
1178 		if( printaa ){
1179 			fprintf( fout, "%c", BB[IB] );
1180 			IB += 1;
1181 			if( IB % 75 ==0 or IB == NN ) printaa = false, fprintf( fout, "\nY " );
1182 		}else{
1183 			fprintf( fout, "%c", AA[IA] );
1184 			IA += 1;
1185 			if( IA % 75 ==0 ) printaa = true, fprintf( fout, "\n\nX " );
1186 		}
1187 	}
1188 #endif
1189 	fclose( fout );
1190 #endif
1191 
1192 	return OK_FUNC;
1193 } // END int local_band_align
1194 
1195 
1196 
setaa_to_na()1197 void setaa_to_na()
1198 {
1199 	int i;
1200 	for (i=0; i<26; i++) aa2idx[i]   = na2idx[i];
1201 } // END void setaa_to_na
1202 
1203 
1204 /////////////////
ScoreMatrix()1205 ScoreMatrix::ScoreMatrix()
1206 {
1207 	init();
1208 }
1209 
init()1210 void ScoreMatrix::init()
1211 {
1212 	set_gap( -11, -1 );
1213 	set_matrix( BLOSUM62 );
1214 }
1215 
set_gap(int gap1,int ext_gap1)1216 void ScoreMatrix::set_gap(int gap1, int ext_gap1)
1217 {
1218 	int i;
1219 	gap = MAX_SEQ * gap1;
1220 	ext_gap = MAX_SEQ * ext_gap1;
1221 }
1222 
set_matrix(int * mat1)1223 void ScoreMatrix::set_matrix(int *mat1)
1224 {
1225 	int i, j, k;
1226 	k = 0;
1227 	for ( i=0; i<MAX_AA; i++)
1228 		for ( j=0; j<=i; j++)
1229 			matrix[j][i] = matrix[i][j] = MAX_SEQ * mat1[ k++ ];
1230 }
1231 
set_to_na()1232 void ScoreMatrix::set_to_na()
1233 {
1234 	set_gap( -6, -1 );
1235 	set_matrix( BLOSUM62_na );
1236 }
1237 // Only for est
set_match(int score)1238 void ScoreMatrix::set_match( int score )
1239 {
1240 	int i;
1241 	for ( i=0; i<5; i++) matrix[i][i] = MAX_SEQ * score;
1242 	//matrix[3][4] = matrix[4][3] = MAX_SEQ * score;
1243 }
1244 // Only for est
set_mismatch(int score)1245 void ScoreMatrix::set_mismatch( int score )
1246 {
1247 	int i, j;
1248 	for ( i=0; i<MAX_AA; i++)
1249 		for ( j=0; j<i; j++)
1250 			matrix[j][i] = matrix[i][j] = MAX_SEQ * score;
1251 	matrix[3][4] = matrix[4][3] = MAX_SEQ;
1252 }
1253 
WordTable(int naa,int naan)1254 WordTable::WordTable( int naa, int naan )
1255 {
1256 	NAA      = 0;
1257 	NAAN     = 0;
1258 	is_aa    = 1;
1259 	size = 0;
1260 	frag_count = 0;
1261 	Init( naa, naan );
1262 }
1263 
SetDNA()1264 void WordTable::SetDNA()
1265 {
1266 	is_aa = 0;
1267 }
1268 
Init(int naa,int naan)1269 void WordTable::Init(int naa, int naan)
1270 {
1271 	NAA  = naa;
1272 	NAAN = naan;
1273 	indexCounts.resize( NAAN );
1274 }
1275 
Clear()1276 void WordTable::Clear()
1277 {
1278 	int i;
1279 #if 0
1280 	int n1 = 0, n2 = 0, n3 = 0, ns = 0;
1281 	for(i=0; i<NAAN; i++){
1282 		NVector<IndexCount> & ics = indexCounts[i];
1283 		for(int j=0; j<ics.size; j++){
1284 			IndexCount ic = ics[j];
1285 			n1 += ic.count == 1;
1286 			n2 += ic.count == 2;
1287 			n3 += ic.count == 3;
1288 			ns += ic.count >= 4;
1289 		}
1290 	}
1291 	printf( "%9i %9i %9i %9i\n", n1, n2, n3, ns );
1292 #endif
1293 	size = 0;
1294 	frag_count = 0;
1295 	sequences.clear();
1296 	for (i=0; i<NAAN; i++) indexCounts[i].size = 0;//Clear();
1297 }
1298 
AddWordCounts(NVector<IndexCount> & counts,Sequence * seq,bool skipN)1299 int WordTable::AddWordCounts( NVector<IndexCount> & counts, Sequence *seq, bool skipN)
1300 {
1301 	int aan_no = counts.Size();
1302 	int i, j, k, idx = sequences.size();
1303 	for (i=0; i<aan_no; i++) {
1304 		IndexCount ic = counts[i];
1305 		if ( (k=ic.count) ) {
1306 			j = ic.index;
1307 			if ( skipN && j<0) continue; // for those has 'N'
1308 			NVector<IndexCount> & row = indexCounts[j];
1309 			ic.index = idx;
1310 			row.Append( ic );
1311 			size += 1;
1312 		}
1313 	}
1314 	sequences.Append( seq );
1315 	return OK_FUNC;
1316 }
AddWordCountsFrag(NVector<IndexCount> & counts,int frag,int frag_size,int repfrag)1317 int WordTable::AddWordCountsFrag( NVector<IndexCount> & counts, int frag, int frag_size, int repfrag )
1318 {
1319 	return 0;
1320 }
AddWordCounts(int aan_no,Vector<int> & word_encodes,Vector<INTs> & word_encodes_no,int idx,bool skipN)1321 int WordTable::AddWordCounts(int aan_no, Vector<int> & word_encodes, Vector<INTs> & word_encodes_no, int idx, bool skipN)
1322 {
1323 	int i, j, k;
1324 	//printf( "seq %6i: ", idx );
1325 	for (i=0; i<aan_no; i++) {
1326 		if ( (k=word_encodes_no[i]) ) {
1327 			j = word_encodes[i];
1328 			if( skipN && j<0) continue; // for those has 'N'
1329 			NVector<IndexCount> & row = indexCounts[j];
1330 			row.Append( IndexCount( idx, k ) );
1331 			size += 1;
1332 			//if( k >1 ) printf( " %3i", k );
1333 		}
1334 	}
1335 	//printf( "\n" );
1336 	return OK_FUNC;
1337 }
1338 
AddWordCountsFrag(int aan_no,Vector<int> & word_encodes,Vector<INTs> & word_encodes_no,int frag,int frag_size)1339 int WordTable::AddWordCountsFrag( int aan_no, Vector<int> & word_encodes,
1340     Vector<INTs> & word_encodes_no, int frag, int frag_size )
1341 {
1342 	int i, j, k, i1, k1, fra;
1343 
1344 	for (i=0; i<frag; i++) {
1345 		k = (i+1)*frag_size < aan_no ? (i+1)*frag_size-1: aan_no-1;
1346 		//quick_sort(&word_encodes[0], i*frag_size, k);
1347 		std::sort( word_encodes.begin() + i*frag_size, word_encodes.begin() + k + 1 );
1348 	}
1349 	for(j=aan_no-1; j; j--) {
1350 		if (word_encodes[j] == word_encodes[j-1]) {
1351 			word_encodes_no[j-1] += word_encodes_no[j];
1352 			word_encodes_no[j]=0;
1353 		}
1354 	}
1355 	// END check_word_encodes
1356 
1357 	for (i=0; i<aan_no; i+=frag_size) {
1358 		k = frag_size < (aan_no-i) ? frag_size : (aan_no -i);
1359 		fra = i / frag_size;
1360 		//AddWordCounts(k, word_encodes+i, word_encodes_no+i, NR90f_no+fra);
1361 		for (i1=i; i1<i+k; i1++) {
1362 			if ( (k1=word_encodes_no[i1]) ) {
1363 				j = word_encodes[i1];
1364 				NVector<IndexCount> & row = indexCounts[j];
1365 				row.Append( IndexCount( frag_count + fra, k1 ) );
1366 				size += 1;
1367 			}
1368 		}
1369 	}
1370 	frag_count += frag;
1371 
1372 	return 0;
1373 }
1374 
1375 
PrintAll()1376 void WordTable::PrintAll()
1377 {
1378 	int  i, j, k;
1379 	int cols = 0;
1380 	long long total_words = 0;
1381 	k = 0;
1382 	for (i=0; i<NAAN; i++) {
1383 		int size = indexCounts[i].Size();
1384 		if ( size == 0 ) continue;
1385 		cols++;
1386 		cout << k << "\t" << i << "\tsize:" << size << "\t";
1387 		for (j=0; j<size; j++) {
1388 			cout << indexCounts[i][j].index << "," << indexCounts[i][j].count << " ";
1389 			total_words += indexCounts[i][j].count;
1390 		}
1391 		cout << endl;
1392 		k++;
1393 	}
1394 
1395 	cout << "total cols: " << cols << " total words: " << total_words << endl;
1396 }
1397 
1398 
1399 /* Quick Sort.
1400  * Adam Drozdek: Data Structures and Algorithms in C++, 2nd Edition.
1401  */
PartialQuickSort(IndexCount * data,int first,int last,int partial)1402 void PartialQuickSort( IndexCount *data, int first, int last, int partial )
1403 {
1404 	int lower=first+1, upper=last;
1405 	IndexCount pivot;
1406 	IndexCount val;
1407 	if( first >= last ) return;
1408 	val = data[first];
1409 	data[first] = data[ (first+last)/2 ];
1410 	data[ (first+last)/2 ] = val;
1411 	pivot = data[ first ];
1412 
1413 	while( lower <= upper ){
1414 		while( lower <= last && data[lower].count < pivot.count ) lower ++;
1415 		while( pivot.count < data[upper].count ) upper --;
1416 		if( lower < upper ){
1417 			val = data[lower];
1418 			data[lower] = data[upper];
1419 			data[upper] = val;
1420 			upper --;
1421 		}
1422 		lower ++;
1423 	}
1424 	val = data[first];
1425 	data[first] = data[upper];
1426 	data[upper] = val;
1427 	if( first < upper-1 ) PartialQuickSort( data, first, upper-1, partial );
1428 	if( upper >= partial ) return;
1429 	if( upper+1 < last ) PartialQuickSort( data, upper+1, last, partial );
1430 }
CountWords(int aan_no,Vector<int> & word_encodes,Vector<INTs> & word_encodes_no,NVector<IndexCount> & lookCounts,NVector<uint32_t> & indexMapping,bool est,int min)1431 int WordTable::CountWords(int aan_no, Vector<int> & word_encodes, Vector<INTs> & word_encodes_no,
1432     NVector<IndexCount> &lookCounts, NVector<uint32_t> & indexMapping,
1433 	bool est, int min)
1434 {
1435 	int S = frag_count ? frag_count : sequences.size();
1436 	int  j, k, j0, j1, k1, m;
1437 	int ix1, ix2, ix3, ix4;
1438 	IndexCount tmp;
1439 
1440 	IndexCount *ic = lookCounts.items;
1441 	for(j=0; j<lookCounts.size; j++, ic++) indexMapping[ ic->index ] = 0;
1442 	lookCounts.size = 0;
1443 
1444 	int *we = & word_encodes[0];
1445 	j0 = 0;
1446 	if( est ) while( *we <0 ) j0++, we++; // if met short word has 'N'
1447 	INTs *wen = & word_encodes_no[j0];
1448 	//printf( "\nquery : " );
1449 	for (; j0<aan_no; j0++, we++, wen++) {
1450 		j  = *we;
1451 		j1 = *wen;
1452 		//if( j1 >1 ) printf( " %3i", j1 );
1453 		if( j1==0 ) continue;
1454 		NVector<IndexCount> & one = indexCounts[j];
1455 		k1 = one.Size();
1456 		IndexCount *ic = one.items;
1457 
1458 		int rest = aan_no - j0 + 1;
1459 		for (k=0; k<k1; k++, ic++){
1460 			int c = ic->count < j1 ? ic->count : j1;
1461 			uint32_t *idm = indexMapping.items + ic->index;
1462 			if( *idm ==0 ){
1463 				if( rest < min ) continue;
1464 				IndexCount *ic2 = lookCounts.items + lookCounts.size;
1465 				lookCounts.size += 1;
1466 				*idm = lookCounts.size;
1467 				ic2->index = ic->index;
1468 				ic2->count = c;
1469 			}else{
1470 				lookCounts[ *idm - 1 ].count += c;
1471 			}
1472 		}
1473 	}
1474 	//printf( "%6i %6i\n", S, lookCounts.size );
1475 	lookCounts[ lookCounts.size ].count = 0;
1476 	//printf( "\n\n" );
1477 	return OK_FUNC;
1478 }
1479 
Sequence()1480 Sequence::Sequence()
1481 {
1482 	memset( this, 0, sizeof( Sequence ) );
1483 	distance = 2.0;
1484 }
Sequence(const Sequence & other)1485 Sequence::Sequence( const Sequence & other )
1486 {
1487 	int i;
1488 	//printf( "new: %p  %p\n", this, & other );
1489 	memcpy( this, & other, sizeof( Sequence ) );
1490 	distance = 2.0;
1491 	if( other.data ){
1492 		size = bufsize = other.size;
1493                 size_R2 = 0;
1494 		data = new char[size+1];
1495 		//printf( "data: %p  %p\n", data, other.data );
1496 		data[size] = 0;
1497 		memcpy( data, other.data, size );
1498 		//for (i=0; i<size; i++) data[i] = other.data[i];
1499 	}
1500 	if( other.identifier ){
1501 		int len = strlen( other.identifier );
1502 		identifier = new char[len+1];
1503 		memcpy( identifier, other.identifier, len );
1504 		identifier[len] = 0;
1505 	}
1506 }
1507 
1508 // back to back merge for PE
1509 // R1 -> XXXXXXABC ------------------- NMLYYYYYY <--R2
1510 // >R1           >R2
1511 // XXXXXXABC     YYYYYYLMN =====> Merge into
1512 // >R12
1513 // NMLYYYYYYXXXXXXABC
Sequence(const Sequence & other,const Sequence & other2,int mode)1514 Sequence::Sequence( const Sequence & other, const Sequence & other2, int mode )
1515 {
1516 	int i;
1517         if (mode != 1) bomb_error("unknown mode");
1518 
1519 	//printf( "new: %p  %p\n", this, & other );
1520 	memcpy( this, & other, sizeof( Sequence ) );
1521 	distance = 2.0;
1522 
1523 	if( other.data && other2.data ){
1524 		size = bufsize = (other.size + other2.size);
1525                 size_R2 = other2.size;
1526 		data = new char[size+1];
1527 		//printf( "data: %p  %p\n", data, other.data );
1528 		data[size] = 0;
1529                 data[size_R2] = 0;
1530                 memcpy( data, other2.data, size_R2); // copy R2 first
1531                 strrev( data );                      // reverse R2 on data
1532 		memcpy( data+size_R2, other.data, size-size_R2 ); // copy R1 to end of R2
1533 		//for (i=0; i<size; i++) data[i] = other.data[i];
1534 		des_begin2 = other2.des_begin;
1535                 tot_length2= other2.tot_length;
1536 	}
1537         else if ( other.data || other2.data ) {
1538                 bomb_error("Not both PE sequences have data");
1539         }
1540 
1541 	if( other.identifier ){ // only use R1
1542 		int len = strlen( other.identifier );
1543 		identifier = new char[len+1];
1544 		memcpy( identifier, other.identifier, len );
1545 		identifier[len] = 0;
1546 	}
1547 }
1548 
1549 
~Sequence()1550 Sequence::~Sequence()
1551 {
1552 	//printf( "delete: %p\n", this );
1553 	if( data ) delete[] data;
1554 	if( identifier ) delete[] identifier;
1555 }
1556 
Clear()1557 void Sequence::Clear()
1558 {
1559 	if( data ) delete[] data;
1560 	/* do not set size to zero here, it is need for writing output */
1561 	bufsize = 0;
1562 	data = NULL;
1563 }
1564 
operator =(const char * s)1565 void Sequence::operator=( const char *s )
1566 {
1567 	size = 0; // avoid copying;
1568 	Resize( strlen( s ) );
1569 	strcpy( data, s );
1570 }
operator +=(const char * s)1571 void Sequence::operator+=( const char *s )
1572 {
1573 	int i, m = size, n = strlen( s );
1574 	Reserve( m + n );
1575 	memcpy( data+m, s, n );
1576 }
Resize(int n)1577 void Sequence::Resize( int n )
1578 {
1579 	int i, m = size < n ? size : n;
1580 	size = n;
1581 	if( size != bufsize ){
1582 		char *old = data;
1583 		bufsize = size;
1584 		data = new char[ bufsize + 1 ];
1585 		if ( data == NULL ) bomb_error( "Memory" );
1586 		if ( old ){
1587 			memcpy( data, old, m );
1588 			delete []old;
1589 		}
1590 		if( size ) data[size] = 0;
1591 	}
1592 }
Reserve(int n)1593 void Sequence::Reserve( int n )
1594 {
1595 	int i, m = size < n ? size : n;
1596 	size = n;
1597 	if( size > bufsize ){
1598 		char *old = data;
1599 		bufsize = size + size/5 + 1;
1600 		data = new char[ bufsize + 1 ];
1601 		if ( data == NULL ) bomb_error( "Memory" );
1602 		if ( old ){
1603 			memcpy( data, old, m );
1604 			delete []old;
1605 		}
1606 	}
1607 	if( size ) data[size] = 0;
1608 }
trim(int trim_len)1609 void Sequence::trim(int trim_len) {
1610     if (trim_len >= size) return;
1611     size = trim_len;
1612     if (size) data[size]=0;
1613 }
ConvertBases()1614 void Sequence::ConvertBases()
1615 {
1616 	int i;
1617 	for(i=0; i<size; i++) data[i] = aa2idx[data[i] - 'A'];
1618 }
1619 
Swap(Sequence & other)1620 void Sequence::Swap( Sequence & other )
1621 {
1622 	Sequence tmp;
1623 	memcpy( & tmp, this, sizeof( Sequence ) );
1624 	memcpy( this, & other, sizeof( Sequence ) );
1625 	memcpy( & other, & tmp, sizeof( Sequence ) );
1626 	memset( & tmp, 0, sizeof( Sequence ) );
1627 }
Format()1628 int Sequence::Format()
1629 {
1630 	int i, j=0, m = 0;
1631 	while( size && isspace( data[size-1] ) ) size --;
1632 	if( size && data[size-1] == '*' ) size --;
1633 	if( size ) data[size] = 0;
1634 	for (i=0; i<size; i++){
1635 		char ch = data[i];
1636 		m += ! (isalpha( ch ) | isspace( ch ));
1637 	}
1638 	if( m ) return m;
1639 	for (i=0; i<size; i++){
1640 		char ch = data[i];
1641 		if ( isalpha( ch ) ) data[j++] = toupper( ch );
1642 	}
1643 	data[j] = 0;
1644 	size = j;
1645 	return 0;
1646 }
1647 
SwapIn()1648 void Sequence::SwapIn()
1649 {
1650 	if( data ) return;
1651 	if( swap == NULL ) bomb_error( "Can not swap in sequence" );
1652 	Resize( size );
1653 	fseek( swap, offset, SEEK_SET );
1654 	if( fread( data, 1, size, swap ) ==0 ) bomb_error( "Can not swap in sequence" );
1655 	data[size] = 0;
1656 }
SwapOut()1657 void Sequence::SwapOut()
1658 {
1659 	if( swap && data ){
1660 		delete[] data;
1661 		bufsize = 0;
1662 		data = NULL;
1663 	}
1664 }
PrintInfo(int id,FILE * fout,const Options & options,char * buf)1665 void Sequence::PrintInfo( int id, FILE *fout, const Options & options, char *buf )
1666 {
1667 	const char *tag = options.isEST ? "nt" : "aa";
1668 	bool print = options.print != 0;
1669 	bool strand = options.isEST;
1670 	fprintf( fout, "%i\t%i%s, >%s...", id, size, tag, identifier+1 );
1671 	if( identity ){
1672 		int *c = coverage;
1673 		fprintf( fout, " at " );
1674 		if (print) fprintf( fout, "%i:%i:%i:%i/", c[0], c[1], c[2], c[3] );
1675 		if (strand) fprintf( fout, "%c/", (state & IS_MINUS_STRAND) ? '-' : '+' );
1676 		fprintf( fout, "%.2f%%", identity*100 );
1677 		if( options.useDistance ) fprintf( fout, "/%.2f%%", distance*100 );
1678 		fprintf( fout, "\n" );
1679 	}else{
1680 		fprintf( fout, " *\n" );
1681 	}
1682 }
1683 
1684 // by liwz gzip version 2019-02
1685 // by liwz
1686 // disable swap option
1687 // change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
1688 // where des_begin is the FILE pointer of sequence record start
1689 //       tot_length is the total bytes of sequence record
Readgz(const char * file,const Options & options)1690 void SequenceDB::Readgz( const char *file, const Options & options )
1691 {
1692 #ifdef WITH_ZLIB
1693     Sequence one;
1694     Sequence des;
1695     gzFile fin = gzopen(file, "r");
1696     char *buffer = NULL;
1697     char *res = NULL;
1698     int option_l = options.min_length;
1699     if( fin == NULL ) bomb_error( "Failed to open the database file" );
1700     Clear();
1701     buffer = new char[ MAX_LINE_SIZE+1 ];
1702 
1703     while (not gzeof( fin ) || one.size) { /* do not break when the last sequence is not handled */
1704         buffer[0] = '>';
1705         if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL && one.size == 0) break;
1706         if( buffer[0] == '+' ){
1707             int len = strlen( buffer );
1708             int len2 = len;
1709             while( len2 && buffer[len2-1] != '\n' ){
1710                 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1711                 len2 = strlen( buffer );
1712                 len += len2;
1713             }
1714             one.tot_length += len;
1715 
1716             // read next line quality score
1717             if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
1718             len = strlen( buffer );
1719             len2 = len;
1720             while( len2 && buffer[len2-1] != '\n' ){
1721                 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1722                 len2 = strlen( buffer );
1723                 len += len2;
1724             }
1725             one.tot_length += len;
1726         }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1727             if ( one.size ) { // write previous record
1728                 if( one.identifier == NULL || one.Format() ){
1729                     printf( "Warning: from file \"%s\",\n", file );
1730                     printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1731                     if( one.identifier ) printf( "%s\n", one.identifier );
1732                     printf( "%s\n", one.data );
1733                     one.size = 0;
1734                 }
1735                 one.index = sequences.size();
1736                 if( one.size > option_l ) {
1737                     if (options.trim_len    > 0) one.trim(options.trim_len);
1738                     sequences.Append( new Sequence( one ) );
1739                 }
1740             }
1741             one.size = 0;
1742             one.tot_length = 0;
1743 
1744             int len = strlen( buffer );
1745             int len2 = len;
1746             des.size = 0;
1747             des += buffer;
1748             while( len2 && buffer[len2-1] != '\n' ){
1749                 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1750                 des += buffer;
1751                 len2 = strlen( buffer );
1752                 len += len2;
1753             }
1754             size_t offset = gztell( fin );
1755             one.des_begin = offset - len;
1756             one.tot_length += len;              // count first line
1757 
1758             int i = 0;
1759             if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1760             if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1761             if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1762             while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
1763             des.data[i] = 0;
1764             one.identifier = des.data;
1765         } else {
1766             one.tot_length += strlen(buffer);  one += buffer;
1767         }
1768     }
1769     one.identifier = NULL;
1770     delete[] buffer;
1771     gzclose( fin );
1772 #else
1773     bomb_error("this program was not compiled with zlib");
1774 #endif
1775 }
1776 
1777 
1778 
1779 // by liwz
1780 // disable swap option
1781 // change des_begin, des_length, des_length2, dat_length => des_begin, tot_length
1782 // where des_begin is the FILE pointer of sequence record start
1783 //       tot_length is the total bytes of sequence record
Read(const char * file,const Options & options)1784 void SequenceDB::Read( const char *file, const Options & options )
1785 {
1786     int f_len = strlen(file);
1787     if (strcmp(file + f_len - 3, ".gz") == 0 ) {
1788         Readgz(file, options);
1789         return;
1790     }
1791 
1792     Sequence one;
1793     Sequence des;
1794     FILE *fin = fopen( file, "rb" );
1795     char *buffer = NULL;
1796     char *res = NULL;
1797     int option_l = options.min_length;
1798     if( fin == NULL ) bomb_error( "Failed to open the database file" );
1799     Clear();
1800     buffer = new char[ MAX_LINE_SIZE+1 ];
1801 
1802     while (not feof( fin ) || one.size) { /* do not break when the last sequence is not handled */
1803         buffer[0] = '>';
1804         if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL && one.size == 0) break;
1805         if( buffer[0] == '+' ){
1806             int len = strlen( buffer );
1807             int len2 = len;
1808             while( len2 && buffer[len2-1] != '\n' ){
1809                 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1810                 len2 = strlen( buffer );
1811                 len += len2;
1812             }
1813             one.tot_length += len;
1814 
1815             // read next line quality score
1816             if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
1817             len = strlen( buffer );
1818             len2 = len;
1819             while( len2 && buffer[len2-1] != '\n' ){
1820                 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1821                 len2 = strlen( buffer );
1822                 len += len2;
1823             }
1824             one.tot_length += len;
1825         }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1826             if ( one.size ) { // write previous record
1827                 if( one.identifier == NULL || one.Format() ){
1828                     printf( "Warning: from file \"%s\",\n", file );
1829                     printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1830                     if( one.identifier ) printf( "%s\n", one.identifier );
1831                     printf( "%s\n", one.data );
1832                     one.size = 0;
1833                 }
1834                 one.index = sequences.size();
1835                 if( one.size > option_l ) {
1836                     if (options.trim_len    > 0) one.trim(options.trim_len);
1837                     sequences.Append( new Sequence( one ) );
1838                 }
1839             }
1840             one.size = 0;
1841             one.tot_length = 0;
1842 
1843             int len = strlen( buffer );
1844             int len2 = len;
1845             des.size = 0;
1846             des += buffer;
1847             while( len2 && buffer[len2-1] != '\n' ){
1848                 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
1849                 des += buffer;
1850                 len2 = strlen( buffer );
1851                 len += len2;
1852             }
1853             size_t offset = ftell( fin );
1854             one.des_begin = offset - len;
1855             one.tot_length += len;              // count first line
1856 
1857             int i = 0;
1858             if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1859             if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1860             if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1861             while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
1862             des.data[i] = 0;
1863             one.identifier = des.data;
1864         } else {
1865             one.tot_length += strlen(buffer);  one += buffer;
1866         }
1867     }
1868 #if 0
1869     int i, n = 0;
1870     for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
1871     cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
1872     int i;
1873     scanf( "%i", & i );
1874 #endif
1875     one.identifier = NULL;
1876     delete[] buffer;
1877     fclose( fin );
1878 }
1879 
1880 
1881 // by liwz gzip version 2019-02
1882 // PE reads liwz, disable swap option
Readgz(const char * file,const char * file2,const Options & options)1883 void SequenceDB::Readgz( const char *file, const char *file2, const Options & options )
1884 {
1885 #ifdef WITH_ZLIB
1886     Sequence one, two;
1887     Sequence des;
1888     gzFile fin = gzopen(file, "r");
1889     gzFile fin2= gzopen(file2,"r");
1890     char *buffer = NULL;
1891     char *buffer2= NULL;
1892     char *res = NULL;
1893     char *res2= NULL;
1894     int option_l = options.min_length;
1895     if( fin == NULL ) bomb_error( "Failed to open the database file" );
1896     if( fin2== NULL ) bomb_error( "Failed to open the database file" );
1897     Clear();
1898     buffer = new char[ MAX_LINE_SIZE+1 ];
1899     buffer2= new char[ MAX_LINE_SIZE+1 ];
1900 
1901     while (((not gzeof( fin )) && (not gzeof( fin2)) ) || (one.size && two.size)) { /* do not break when the last sequence is not handled */
1902         buffer[0] = '>'; res =gzgets(fin,  buffer,  MAX_LINE_SIZE);
1903         buffer2[0]= '>'; res2=gzgets(fin2, buffer2, MAX_LINE_SIZE);
1904 
1905         if ( (res      == NULL) && (res2     != NULL)) bomb_error( "Paired input files have different number sequences" );
1906         if ( (res      != NULL) && (res2     == NULL)) bomb_error( "Paired input files have different number sequences" );
1907         if ( (one.size == 0   ) && (two.size >     0)) bomb_error( "Paired input files have different number sequences" );
1908         if ( (one.size >  0   ) && (two.size ==    0)) bomb_error( "Paired input files have different number sequences" );
1909         if ( (res      == NULL) && (one.size ==    0)) break;
1910 
1911         if( buffer[0] == '+' ){ // fastq 3rd line
1912             // file 1
1913             int len = strlen( buffer );
1914             int len2 = len;
1915             while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
1916                 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1917                 len2 = strlen( buffer );
1918                 len += len2;
1919             }
1920             one.tot_length += len;
1921 
1922             // read next line quality score
1923             if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
1924             len = strlen( buffer );
1925             len2 = len;
1926             while( len2 && buffer[len2-1] != '\n' ){
1927                 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1928                 len2 = strlen( buffer );
1929                 len += len2;
1930             }
1931             one.tot_length += len;
1932 
1933             // file 2
1934             len = strlen( buffer2 );
1935             len2 = len;
1936             while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
1937                 if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
1938                 len2 = strlen( buffer2 );
1939                 len += len2;
1940             }
1941             two.tot_length += len;
1942 
1943             // read next line quality score
1944             if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) bomb_error("can not read quality score after");
1945             len = strlen( buffer2 );
1946             len2 = len;
1947             while( len2 && buffer2[len2-1] != '\n' ){
1948                 if ( (res2=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
1949                 len2 = strlen( buffer2 );
1950                 len += len2;
1951             }
1952             two.tot_length += len;
1953 
1954         }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
1955             if ( one.size && two.size ) { // write previous record
1956                 if( one.identifier == NULL || one.Format() ){
1957                     printf( "Warning: from file \"%s\",\n", file );
1958                     printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1959                     if( one.identifier ) printf( "%s\n", one.identifier );
1960                     printf( "%s\n", one.data );
1961                     one.size=0; two.size=0;
1962                 }
1963                 if( two.identifier == NULL || two.Format() ){
1964                     printf( "Warning: from file \"%s\",\n", file2 );
1965                     printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
1966                     if( two.identifier ) printf( "%s\n", two.identifier );
1967                     printf( "%s\n", two.data );
1968                     one.size=0; two.size = 0;
1969                 }
1970                 one.index = sequences.size();
1971                 if( (one.size + two.size)> option_l ) {
1972                     if (options.trim_len    > 0) one.trim(options.trim_len);
1973                     if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
1974                     sequences.Append( new Sequence( one, two, 1 ) );
1975                 }
1976             }
1977             // R1
1978             one.size = 0;
1979             one.tot_length = 0;
1980 
1981             int len = strlen( buffer );
1982             int len2 = len;
1983             des.size = 0;
1984             des += buffer;
1985             while( len2 && buffer[len2-1] != '\n' ){
1986                 if ( (res=gzgets(fin, buffer, MAX_LINE_SIZE)) == NULL ) break;
1987                 des += buffer;
1988                 len2 = strlen( buffer );
1989                 len += len2;
1990             }
1991             size_t offset = gztell( fin );
1992             one.des_begin = offset - len; // offset of ">" or "@"
1993             one.tot_length += len;              // count first line
1994 
1995             int i = 0;
1996             if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
1997             if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
1998             if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
1999             while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
2000             des.data[i] = 0;                   // find first non-space letter
2001             one.identifier = des.data;
2002 
2003             // R2
2004             two.size = 0;
2005             two.tot_length = 0;
2006 
2007             len = strlen( buffer2 );
2008             len2 = len;
2009             while( len2 && buffer2[len2-1] != '\n' ){
2010                 if ( (res=gzgets(fin2, buffer2, MAX_LINE_SIZE)) == NULL ) break;
2011                 len2 = strlen( buffer2 );
2012                 len += len2;
2013             }
2014             offset = gztell( fin2 );
2015             two.des_begin = offset - len;
2016             two.tot_length += len;              // count first line
2017             two.identifier = des.data;
2018         } else {
2019             one.tot_length += strlen(buffer);  one += buffer;
2020             two.tot_length+= strlen(buffer2); two+= buffer2;
2021         }
2022     }
2023     one.identifier = NULL;
2024     two.identifier = NULL;
2025     delete[] buffer;
2026     gzclose( fin );
2027     delete[] buffer2;
2028     gzclose( fin2 );
2029 #else
2030     bomb_error("this program was not compiled with zlib");
2031 #endif
2032 
2033 }
2034 
2035 // PE reads liwz, disable swap option
Read(const char * file,const char * file2,const Options & options)2036 void SequenceDB::Read( const char *file, const char *file2, const Options & options )
2037 {
2038     int f_len = strlen(file);
2039     int f_len2= strlen(file2);
2040     if (strcmp(file + f_len - 3, ".gz") == 0 ) {
2041         if ( strcmp(file2 + f_len2 - 3, ".gz") ) bomb_error( "Both input files need to be in .gz format" );
2042         Readgz(file, file2, options);
2043         return;
2044     }
2045 
2046     Sequence one, two;
2047     Sequence des;
2048     FILE *fin = fopen( file, "rb" );
2049     FILE *fin2= fopen( file2,"rb" );
2050     char *buffer = NULL;
2051     char *buffer2= NULL;
2052     char *res = NULL;
2053     char *res2= NULL;
2054     int option_l = options.min_length;
2055     if( fin == NULL ) bomb_error( "Failed to open the database file" );
2056     if( fin2== NULL ) bomb_error( "Failed to open the database file" );
2057     Clear();
2058     buffer = new char[ MAX_LINE_SIZE+1 ];
2059     buffer2= new char[ MAX_LINE_SIZE+1 ];
2060 
2061     while (((not feof( fin )) && (not feof( fin2)) ) || (one.size && two.size)) { /* do not break when the last sequence is not handled */
2062         buffer[0] = '>'; res =fgets( buffer,  MAX_LINE_SIZE, fin  );
2063         buffer2[0]= '>'; res2=fgets( buffer2, MAX_LINE_SIZE, fin2 );
2064 
2065         if ( (res      == NULL) && (res2     != NULL)) bomb_error( "Paired input files have different number sequences" );
2066         if ( (res      != NULL) && (res2     == NULL)) bomb_error( "Paired input files have different number sequences" );
2067         if ( (one.size == 0   ) && (two.size >     0)) bomb_error( "Paired input files have different number sequences" );
2068         if ( (one.size >  0   ) && (two.size ==    0)) bomb_error( "Paired input files have different number sequences" );
2069         if ( (res      == NULL) && (one.size ==    0)) break;
2070 
2071         if( buffer[0] == '+' ){ // fastq 3rd line
2072             // file 1
2073             int len = strlen( buffer );
2074             int len2 = len;
2075             while( len2 && buffer[len2-1] != '\n' ){ // read until the end of the line
2076                 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
2077                 len2 = strlen( buffer );
2078                 len += len2;
2079             }
2080             one.tot_length += len;
2081 
2082             // read next line quality score
2083             if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) bomb_error("can not read quality score after");
2084             len = strlen( buffer );
2085             len2 = len;
2086             while( len2 && buffer[len2-1] != '\n' ){
2087                 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
2088                 len2 = strlen( buffer );
2089                 len += len2;
2090             }
2091             one.tot_length += len;
2092 
2093             // file 2
2094             len = strlen( buffer2 );
2095             len2 = len;
2096             while( len2 && buffer2[len2-1] != '\n' ){ // read until the end of the line
2097                 if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
2098                 len2 = strlen( buffer2 );
2099                 len += len2;
2100             }
2101             two.tot_length += len;
2102 
2103             // read next line quality score
2104             if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) bomb_error("can not read quality score after");
2105             len = strlen( buffer2 );
2106             len2 = len;
2107             while( len2 && buffer2[len2-1] != '\n' ){
2108                 if ( (res2=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
2109                 len2 = strlen( buffer2 );
2110                 len += len2;
2111             }
2112             two.tot_length += len;
2113 
2114         }else if (buffer[0] == '>' || buffer[0] == '@' || (res==NULL && one.size)) {
2115             if ( one.size && two.size ) { // write previous record
2116                 if( one.identifier == NULL || one.Format() ){
2117                     printf( "Warning: from file \"%s\",\n", file );
2118                     printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
2119                     if( one.identifier ) printf( "%s\n", one.identifier );
2120                     printf( "%s\n", one.data );
2121                     one.size=0; two.size=0;
2122                 }
2123                 if( two.identifier == NULL || two.Format() ){
2124                     printf( "Warning: from file \"%s\",\n", file2 );
2125                     printf( "Discarding invalid sequence or sequence without identifier and description!\n\n" );
2126                     if( two.identifier ) printf( "%s\n", two.identifier );
2127                     printf( "%s\n", two.data );
2128                     one.size=0; two.size = 0;
2129                 }
2130                 one.index = sequences.size();
2131                 if( (one.size + two.size)> option_l ) {
2132                     if (options.trim_len    > 0) one.trim(options.trim_len);
2133                     if (options.trim_len_R2 > 0) two.trim(options.trim_len_R2);
2134                     sequences.Append( new Sequence( one, two, 1 ) );
2135                 }
2136             }
2137             // R1
2138             one.size = 0;
2139             one.tot_length = 0;
2140 
2141             int len = strlen( buffer );
2142             int len2 = len;
2143             des.size = 0;
2144             des += buffer;
2145             while( len2 && buffer[len2-1] != '\n' ){
2146                 if ( (res=fgets( buffer, MAX_LINE_SIZE, fin )) == NULL ) break;
2147                 des += buffer;
2148                 len2 = strlen( buffer );
2149                 len += len2;
2150             }
2151             size_t offset = ftell( fin );
2152             one.des_begin = offset - len; // offset of ">" or "@"
2153             one.tot_length += len;              // count first line
2154 
2155             int i = 0;
2156             if( des.data[i] == '>' || des.data[i] == '@' || des.data[i] == '+' ) i += 1;
2157             if( des.data[i] == ' ' or des.data[i] == '\t' ) i += 1;
2158             if( options.des_len and options.des_len < des.size ) des.size = options.des_len;
2159             while( i < des.size and ! isspace( des.data[i] ) ) i += 1;
2160             des.data[i] = 0;                   // find first non-space letter
2161             one.identifier = des.data;
2162 
2163             // R2
2164             two.size = 0;
2165             two.tot_length = 0;
2166 
2167             len = strlen( buffer2 );
2168             len2 = len;
2169             while( len2 && buffer2[len2-1] != '\n' ){
2170                 if ( (res=fgets( buffer2, MAX_LINE_SIZE, fin2 )) == NULL ) break;
2171                 len2 = strlen( buffer2 );
2172                 len += len2;
2173             }
2174             offset = ftell( fin2 );
2175             two.des_begin = offset - len;
2176             two.tot_length += len;              // count first line
2177             two.identifier = des.data;
2178         } else {
2179             one.tot_length += strlen(buffer);  one += buffer;
2180             two.tot_length+= strlen(buffer2); two+= buffer2;
2181         }
2182     }
2183 #if 0
2184     int i, n = 0;
2185     for(i=0; i<sequences.size(); i++) n += sequences[i].bufsize + 4;
2186     cout<<n<<"\t"<<sequences.capacity() * sizeof(Sequence)<<endl;
2187     int i;
2188     scanf( "%i", & i );
2189 #endif
2190     one.identifier = NULL;
2191     two.identifier = NULL;
2192     delete[] buffer;
2193     fclose( fin );
2194     delete[] buffer2;
2195     fclose( fin2 );
2196 }
2197 
2198 #if 0
2199 void SequenceDB::Sort( int first, int last )
2200 {
2201 	int lower=first+1, upper=last;
2202 	Sequence *pivot;
2203 	Sequence *val;
2204 	if( first >= last ) return;
2205 	val = sequences[first];
2206 	sequences[first] = sequences[ (first+last)/2 ];
2207 	sequences[ (first+last)/2 ] = val;
2208 	pivot = sequences[ first ];
2209 
2210 	while( lower <= upper ){
2211 		while( lower <= last && sequences[lower]->stats < pivot->stats ) lower ++;
2212 		while( pivot->stats < sequences[upper]->stats ) upper --;
2213 		if( lower < upper ){
2214 			val = sequences[lower];
2215 			sequences[lower] = sequences[upper];
2216 			sequences[upper] = val;
2217 			upper --;
2218 		}
2219 		lower ++;
2220 	}
2221 	val = sequences[first];
2222 	sequences[first] = sequences[upper];
2223 	sequences[upper] = val;
2224 	if( first < upper-1 ) Sort( first, upper-1 );
2225 	if( upper+1 < last ) Sort( upper+1, last );
2226 }
2227 #endif
SortDivide(Options & options,bool sort)2228 void SequenceDB::SortDivide( Options & options, bool sort )
2229 {
2230 	int i, j, k, len;
2231 	int N = sequences.size();
2232 	total_letter=0;
2233 	total_desc=0;
2234 	max_len = 0;
2235 	min_len = (size_t)-1;
2236 	for (i=0; i<N; i++) {
2237 		Sequence *seq = sequences[i];
2238 		len = seq->size;
2239 		total_letter += len;
2240 		if (len > max_len) max_len = len;
2241 		if (len < min_len) min_len = len;
2242 		if (seq->swap == NULL) seq->ConvertBases();
2243 		if( seq->identifier ) total_desc += strlen( seq->identifier );
2244 	}
2245 	options.max_entries = max_len * MAX_TABLE_SEQ;
2246 	if (max_len >= 65536 and sizeof(INTs) <=2)
2247 		bomb_warning("Some seqs longer than 65536, you may define LONG_SEQ");
2248 
2249 	if (max_len > MAX_SEQ )
2250 		bomb_warning("Some seqs are too long, please rebuild the program with make parameter "
2251 				"MAX_SEQ=new-maximum-length (e.g. make MAX_SEQ=10000000)");
2252 
2253 	cout << "longest and shortest : " << max_len << " and " << min_len << endl;
2254 	cout << "Total letters: " << total_letter << endl;
2255 	// END change all the NR_seq to iseq
2256 
2257 	len_n50 = (max_len + min_len) / 2; // will be properly set, if sort is true;
2258 	if( sort ){
2259 		// **************************** Form NR_idx[], Sort them from Long to short
2260 		long long sum = 0;
2261 		int M = max_len - min_len + 1;
2262 		Vector<int> count( M, 0 ); // count for each size = max_len - i
2263 		Vector<int> accum( M, 0 ); // count for all size > max_len - i
2264 		Vector<int> offset( M, 0 ); // offset from accum[i] when filling sorting
2265 		Vector<Sequence*> sorting( N ); // TODO: use a smaller class if this consumes to much memory!
2266 
2267 		for (i=0; i<N; i++) count[ max_len - sequences[i]->size ] ++;
2268 		for (i=1; i<M; i++) accum[i] = accum[i-1] + count[i-1];
2269 		for (i=0; i<M; i++){
2270 			sum += (max_len - i) * count[i];
2271 			if( sum >= (total_letter>>1) ){
2272 				len_n50 = max_len - i;
2273 				break;
2274 			}
2275 		}
2276 		for (i=0; i<N; i++){
2277 			int len = max_len - sequences[i]->size;
2278 			int id = accum[len] + offset[len];
2279 			//sequences[i].index = id;
2280 			sorting[id] = sequences[i];
2281 			offset[len] ++;
2282 		}
2283 		options.max_entries = 0;
2284 		for (i=0; i<N; i++){
2285 			sequences[i] = sorting[i];
2286 			if( i < MAX_TABLE_SEQ ) options.max_entries += sequences[i]->size;
2287 		}
2288 #if 0
2289 		if( options.isEST ){
2290 			int start = 0;
2291 			for (i=0; i<M; i++){
2292 				Sort( start, accum[i] );
2293 				start = accum[i];
2294 			}
2295 		}
2296 #endif
2297 		cout << "Sequences have been sorted" << endl;
2298 		// END sort them from long to short
2299 	}
2300 }// END sort_seqs_divide_segs
2301 
DivideSave(const char * db,const char * newdb,int n,const Options & options)2302 void SequenceDB::DivideSave( const char *db, const char *newdb, int n, const Options & options )
2303 {
2304 	if( n == 0 or sequences.size() ==0 ) return;
2305 
2306 	size_t max_seg = total_letter / n + sequences[0]->size;
2307 	if( max_seg >= MAX_BIN_SWAP ) max_seg = (size_t) MAX_BIN_SWAP;
2308 
2309 	FILE *fin = fopen( db, "rb" );
2310 	char *buf = new char[MAX_LINE_SIZE+1];
2311 	char outfile[512];
2312 	size_t seg_size = 0;
2313 	int i, j, count, rest, seg = 0;
2314 	sprintf( outfile, "%s-%i", newdb, 0 );
2315 	FILE *fout = fopen( outfile, "w+" );
2316 	n = sequences.size();
2317 	for (i=0; i<n; i++){
2318 		Sequence *seq = sequences[i];
2319 		fseek( fin, seq->des_begin, SEEK_SET );
2320 
2321 		seg_size += seq->size;
2322 		if( seg_size >= max_seg ){
2323 			seg += 1;
2324 			sprintf( outfile, "%s-%i", newdb, seg );
2325 			fclose( fout );
2326 			fout = fopen( outfile, "w+" );
2327 			seg_size = seq->size;
2328 		}
2329 
2330 		count = seq->tot_length / MAX_LINE_SIZE;
2331 		rest  = seq->tot_length % MAX_LINE_SIZE;
2332 		//printf( "count = %6i,  rest = %6i\n", count, rest );
2333 		for (j=0; j<count; j++){
2334 			if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2335 			fwrite( buf, 1, MAX_LINE_SIZE, fout );
2336 		}
2337 		if( rest ){
2338 			if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2339 			fwrite( buf, 1, rest, fout );
2340 		}
2341 	}
2342 	fclose( fin );
2343 	fclose( fout );
2344 	delete []buf;
2345 }
2346 
2347 // input db is gzipped
WriteClustersgz(const char * db,const char * newdb,const Options & options)2348 void SequenceDB::WriteClustersgz( const char *db, const char *newdb, const Options & options )
2349 {
2350 #ifdef WITH_ZLIB
2351     gzFile fin = gzopen(db, "r");
2352 	FILE *fout = fopen( newdb, "w+" );
2353 	int i, j, n = rep_seqs.size();
2354 	int count, rest;
2355 	char *buf = new char[MAX_LINE_SIZE+1];
2356 	vector<uint64_t> sorting( n );
2357 	if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2358 	for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2359 	std::sort( sorting.begin(), sorting.end() );
2360 	for (i=0; i<n; i++){
2361 		Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2362 		gzseek( fin, seq->des_begin, SEEK_SET );
2363 
2364 		count = seq->tot_length / MAX_LINE_SIZE;
2365 		rest  = seq->tot_length % MAX_LINE_SIZE;
2366 		//printf( "count = %6i,  rest = %6i\n", count, rest );
2367 		for (j=0; j<count; j++){
2368 			if( gzread(fin, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
2369 			fwrite( buf, 1, MAX_LINE_SIZE, fout );
2370 		}
2371 		if( rest ){
2372 			if( gzread(fin, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
2373 			fwrite( buf, 1, rest, fout );
2374 		}
2375 	}
2376 	gzclose( fin );
2377 	fclose( fout );
2378 	delete []buf;
2379 #else
2380     bomb_error("this program was not compiled with zlib");
2381 #endif
2382 
2383 }
2384 
WriteClusters(const char * db,const char * newdb,const Options & options)2385 void SequenceDB::WriteClusters( const char *db, const char *newdb, const Options & options )
2386 {
2387     int f_len = strlen(db);
2388     if (strcmp(db + f_len - 3, ".gz") == 0 ) {
2389         WriteClustersgz(db, newdb, options);
2390         return;
2391     }
2392 
2393 	FILE *fin = fopen( db, "rb" );
2394 	FILE *fout = fopen( newdb, "w+" );
2395 	int i, j, n = rep_seqs.size();
2396 	int count, rest;
2397 	char *buf = new char[MAX_LINE_SIZE+1];
2398 	vector<uint64_t> sorting( n );
2399 	if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2400 	for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2401 	std::sort( sorting.begin(), sorting.end() );
2402 	for (i=0; i<n; i++){
2403 		Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2404 		fseek( fin, seq->des_begin, SEEK_SET );
2405 
2406 		count = seq->tot_length / MAX_LINE_SIZE;
2407 		rest  = seq->tot_length % MAX_LINE_SIZE;
2408 		//printf( "count = %6i,  rest = %6i\n", count, rest );
2409 		for (j=0; j<count; j++){
2410 			if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2411 			fwrite( buf, 1, MAX_LINE_SIZE, fout );
2412 		}
2413 		if( rest ){
2414 			if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2415 			fwrite( buf, 1, rest, fout );
2416 		}
2417 	}
2418 	fclose( fin );
2419 	fclose( fout );
2420 	delete []buf;
2421 }
2422 
2423 
2424 // input db is gzipped
2425 // liwz PE output
WriteClustersgz(const char * db,const char * db_pe,const char * newdb,const char * newdb_pe,const Options & options)2426 void SequenceDB::WriteClustersgz( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
2427 {
2428 #ifdef WITH_ZLIB
2429     gzFile fin    = gzopen(db,    "r");
2430 	gzFile fin_pe = gzopen(db_pe, "r");
2431 	FILE *fout = fopen( newdb, "w+" );
2432 	FILE *fout_pe = fopen( newdb_pe, "w+" );
2433 	int i, j, n = rep_seqs.size();
2434 	int count, rest;
2435 	char *buf = new char[MAX_LINE_SIZE+1];
2436 	vector<uint64_t> sorting( n );
2437 	if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2438 	if( fin_pe == NULL || fout_pe == NULL ) bomb_error( "file opening failed" );
2439 	for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2440 	std::sort( sorting.begin(), sorting.end() );
2441 
2442         //sort fasta / fastq
2443         int *clstr_size;
2444         int *clstr_idx1;
2445         if (options.sort_outputf) {
2446             clstr_size = new int[n];
2447             clstr_idx1 = new int[n];
2448             for (i=0; i<n; i++) {
2449                 clstr_size[i] = 0;
2450                 clstr_idx1[i]  = i;
2451             }
2452 
2453             int N = sequences.size();
2454             for (i=0; i<N; i++) {
2455                 int id = sequences[i]->cluster_id;
2456                 if (id < 0) continue;
2457                 if (id >=n) continue;
2458                 clstr_size[id]++;
2459             }
2460             quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
2461         }
2462 
2463 	for (i=0; i<n; i++){
2464 		Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2465                 if (options.sort_outputf) seq = sequences[  rep_seqs[ clstr_idx1[i] ] ];
2466                 //R1
2467 		gzseek( fin, seq->des_begin, SEEK_SET );
2468 
2469 		count = seq->tot_length / MAX_LINE_SIZE;
2470 		rest  = seq->tot_length % MAX_LINE_SIZE;
2471 		//printf( "count = %6i,  rest = %6i\n", count, rest );
2472 		for (j=0; j<count; j++){
2473 			if( gzread(fin, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
2474 			fwrite( buf, 1, MAX_LINE_SIZE, fout );
2475 		}
2476 		if( rest ){
2477 			if( gzread(fin, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
2478 			fwrite( buf, 1, rest, fout );
2479 		}
2480 
2481                 //R2
2482 		gzseek( fin_pe, seq->des_begin2, SEEK_SET );
2483 
2484 		count = seq->tot_length2 / MAX_LINE_SIZE;
2485 		rest  = seq->tot_length2 % MAX_LINE_SIZE;
2486 		//printf( "count = %6i,  rest = %6i\n", count, rest );
2487 		for (j=0; j<count; j++){
2488 			if( gzread(fin_pe, buf, MAX_LINE_SIZE) ==0 ) bomb_error( "Can not swap in sequence" );
2489 			fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
2490 		}
2491 		if( rest ){
2492 			if( gzread(fin_pe, buf, rest) ==0 ) bomb_error( "Can not swap in sequence" );
2493 			fwrite( buf, 1, rest, fout_pe );
2494 		}
2495 
2496 	}
2497 	gzclose( fin );
2498 	gzclose( fin_pe );
2499 	fclose( fout );
2500 	fclose( fout_pe );
2501 	delete []buf;
2502 #else
2503     bomb_error("this program was not compiled with zlib");
2504 #endif
2505 }
2506 
2507 
2508 // liwz PE output
WriteClusters(const char * db,const char * db_pe,const char * newdb,const char * newdb_pe,const Options & options)2509 void SequenceDB::WriteClusters( const char *db, const char *db_pe, const char *newdb, const char *newdb_pe, const Options & options )
2510 {
2511     int f_len = strlen(db);
2512     if (strcmp(db + f_len - 3, ".gz") == 0 ) {
2513         WriteClustersgz(db, db_pe, newdb, newdb_pe, options);
2514         return;
2515     }
2516 
2517 	FILE *fin = fopen( db, "rb" );
2518 	FILE *fout = fopen( newdb, "w+" );
2519 	FILE *fin_pe = fopen( db_pe, "rb" );
2520 	FILE *fout_pe = fopen( newdb_pe, "w+" );
2521 	int i, j, n = rep_seqs.size();
2522 	int count, rest;
2523 	char *buf = new char[MAX_LINE_SIZE+1];
2524 	vector<uint64_t> sorting( n );
2525 	if( fin == NULL || fout == NULL ) bomb_error( "file opening failed" );
2526 	if( fin_pe == NULL || fout_pe == NULL ) bomb_error( "file opening failed" );
2527 	for (i=0; i<n; i++) sorting[i] = ((uint64_t)sequences[ rep_seqs[i] ]->index << 32) | rep_seqs[i];
2528 	std::sort( sorting.begin(), sorting.end() );
2529 
2530         //sort fasta / fastq
2531         int *clstr_size;
2532         int *clstr_idx1;
2533         if (options.sort_outputf) {
2534             clstr_size = new int[n];
2535             clstr_idx1 = new int[n];
2536             for (i=0; i<n; i++) {
2537                 clstr_size[i] = 0;
2538                 clstr_idx1[i]  = i;
2539             }
2540 
2541             int N = sequences.size();
2542             for (i=0; i<N; i++) {
2543                 int id = sequences[i]->cluster_id;
2544                 if (id < 0) continue;
2545                 if (id >=n) continue;
2546                 clstr_size[id]++;
2547             }
2548             quick_sort_idxr(clstr_size, clstr_idx1, 0, n-1);
2549         }
2550 
2551 	for (i=0; i<n; i++){
2552 		Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2553                 if (options.sort_outputf) seq = sequences[  rep_seqs[ clstr_idx1[i] ] ];
2554                 //R1
2555 		fseek( fin, seq->des_begin, SEEK_SET );
2556 
2557 		count = seq->tot_length / MAX_LINE_SIZE;
2558 		rest  = seq->tot_length % MAX_LINE_SIZE;
2559 		//printf( "count = %6i,  rest = %6i\n", count, rest );
2560 		for (j=0; j<count; j++){
2561 			if( fread( buf, 1, MAX_LINE_SIZE, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2562 			fwrite( buf, 1, MAX_LINE_SIZE, fout );
2563 		}
2564 		if( rest ){
2565 			if( fread( buf, 1, rest, fin ) ==0 ) bomb_error( "Can not swap in sequence" );
2566 			fwrite( buf, 1, rest, fout );
2567 		}
2568 
2569                 //R2
2570 		fseek( fin_pe, seq->des_begin2, SEEK_SET );
2571 
2572 		count = seq->tot_length2 / MAX_LINE_SIZE;
2573 		rest  = seq->tot_length2 % MAX_LINE_SIZE;
2574 		//printf( "count = %6i,  rest = %6i\n", count, rest );
2575 		for (j=0; j<count; j++){
2576 			if( fread( buf, 1, MAX_LINE_SIZE, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
2577 			fwrite( buf, 1, MAX_LINE_SIZE, fout_pe );
2578 		}
2579 		if( rest ){
2580 			if( fread( buf, 1, rest, fin_pe ) ==0 ) bomb_error( "Can not swap in sequence" );
2581 			fwrite( buf, 1, rest, fout_pe );
2582 		}
2583 
2584 	}
2585 	fclose( fin );
2586 	fclose( fout );
2587 	fclose( fin_pe );
2588 	fclose( fout_pe );
2589 	delete []buf;
2590 }
2591 
WriteExtra1D(const Options & options)2592 void SequenceDB::WriteExtra1D( const Options & options )
2593 {
2594 	string db_clstr = options.output + ".clstr";
2595 	string db_clstr_bak = options.output + ".bak.clstr";
2596 	int i, i0, k, N = sequences.size();
2597 	vector<long long> sorting( N );
2598 	for (i=0; i<N; i++) sorting[i] = ((long long)sequences[i]->index << 32) | i;
2599 	std::sort( sorting.begin(), sorting.end() );
2600 
2601 	FILE *fout;
2602 	char *buf = new char[ MAX_DES + 1 ];
2603 
2604 	if( options.backupFile ){
2605 		fout = fopen( db_clstr_bak.c_str(), "w+" );
2606 		for (i=0; i<N; i++) {
2607 			Sequence *seq = sequences[ sorting[i] & 0xffffffff ];
2608 			seq->PrintInfo( seq->cluster_id, fout, options, buf );
2609 		}
2610 		fclose( fout );
2611 	}
2612 
2613 	cout << "writing clustering information" << endl;
2614 	int M = rep_seqs.size();
2615 	Vector<Vector<int> > clusters( M );
2616 	for (i=0; i<N; i++){
2617 		int k = sorting[i] & 0xffffffff;
2618 		int id = sequences[k]->cluster_id;
2619 		clusters[id].Append( k );
2620 	}
2621 
2622 	fout = fopen( db_clstr.c_str(), "w+" );
2623 
2624         if (options.sort_output) {
2625             int *clstr_size = new int[M];
2626             int *clstr_idx1 = new int[M];
2627 
2628             for (i=0; i<M; i++) {
2629                 clstr_size[i] = (int)clusters[i].size();
2630                 clstr_idx1[i]  = i;
2631             }
2632             quick_sort_idxr(clstr_size, clstr_idx1, 0, M-1);
2633 
2634   	    for (i=0; i<M; i++) {
2635                 i0 = clstr_idx1[i];
2636 		fprintf( fout, ">Cluster %i\n", i );
2637 		for (k=0; k<(int)clusters[i0].size(); k++)
2638 			sequences[ clusters[i0][k] ]->PrintInfo( k, fout, options, buf );
2639 	    }
2640         }
2641         else {
2642   	    for (i=0; i<M; i++) {
2643 		fprintf( fout, ">Cluster %i\n", i );
2644 		for (k=0; k<(int)clusters[i].size(); k++)
2645 			sequences[ clusters[i][k] ]->PrintInfo( k, fout, options, buf );
2646 	    }
2647 
2648         }
2649 
2650 	delete []buf;
2651 }
WriteExtra2D(SequenceDB & other,const Options & options)2652 void SequenceDB::WriteExtra2D( SequenceDB & other, const Options & options )
2653 {
2654 	string db_clstr = options.output + ".clstr";
2655 	string db_clstr_bak = options.output + ".bak.clstr";
2656 	int i, k, N = other.sequences.size();
2657 	int N2 = sequences.size();
2658 	vector<long long> sorting( N );
2659 	for (i=0; i<N; i++) sorting[i] = ((long long)other.sequences[i]->index << 32) | i;
2660 	std::sort( sorting.begin(), sorting.end() );
2661 
2662 	FILE *fout;
2663 	char *buf = new char[ MAX_DES + 1 ];
2664 	if( options.backupFile ){
2665 		fout = fopen( db_clstr_bak.c_str(), "w+" );
2666 		for (i=0; i<N; i++) {
2667 			Sequence *seq = other.sequences[ sorting[i] & 0xffffffff ];
2668 			seq->PrintInfo( seq->cluster_id, fout, options, buf );
2669 		}
2670 		for (i=0; i<N2; i++) {
2671 			Sequence *seq = sequences[i];
2672 			if( seq->state & IS_REDUNDANT ) seq->PrintInfo( seq->cluster_id, fout, options, buf );
2673 		}
2674 		fclose( fout );
2675 	}
2676 
2677 	cout << "writing clustering information" << endl;
2678 	Vector<Vector<int> > clusters( N );
2679 	for (i=0; i<N2; i++){
2680 		int id = sequences[i]->cluster_id;
2681 		if( sequences[i]->state & IS_REDUNDANT ) clusters[id].Append( i );
2682 	}
2683 
2684 	fout = fopen( db_clstr.c_str(), "w+" );
2685 	for (i=0; i<N; i++) {
2686 		Sequence *seq = other.sequences[ i ];
2687 		fprintf( fout, ">Cluster %i\n", i );
2688 		seq->PrintInfo( 0, fout, options, buf );
2689 		for (k=0; k<(int)clusters[i].size(); k++)
2690 			sequences[ clusters[i][k] ]->PrintInfo( k+1, fout, options, buf );
2691 	}
2692 	delete []buf;
2693 }
ControlShortCoverage(int len,const Options & options)2694 void WorkingParam::ControlShortCoverage( int len, const Options & options )
2695 {
2696 	len_eff = len;
2697 	aln_cover_flag = 0;
2698 	if ((options.short_coverage > 0.0) || (options.min_control>0) ) { // has alignment coverage control
2699 		aln_cover_flag = 1;
2700 		min_aln_lenS = (int) (double(len) * options.short_coverage);
2701 		if ( len-options.short_control > min_aln_lenS) min_aln_lenS = len-options.short_control;
2702 		if ( options.min_control > min_aln_lenS) min_aln_lenS = options.min_control;
2703 	}
2704 	if (options.global_identity == 0) len_eff = min_aln_lenS; //global_identity==0
2705 }
ControlLongCoverage(int len2,const Options & options)2706 void WorkingParam::ControlLongCoverage( int len2, const Options & options )
2707 {
2708 	if (aln_cover_flag) {
2709 		min_aln_lenL = (int) (double(len2) * options.long_coverage);
2710 		if ( len2-options.long_control > min_aln_lenL) min_aln_lenL = len2-options.long_control;
2711 		if ( options.min_control > min_aln_lenL) min_aln_lenL = options.min_control;
2712 	}
2713 }
2714 
2715 
2716 // when alignment coverage such as -aL is specified
2717 // if a existing rep is too long, it won't be qulified
upper_bound_length_rep(int len,double opt_s,int opt_S,double opt_aL,int opt_AL)2718 int upper_bound_length_rep(int len, double opt_s, int opt_S, double opt_aL, int opt_AL )
2719 {
2720 	int len_upper_bound = 99999999;
2721 	double r1 = (opt_s > opt_aL) ? opt_s : opt_aL;
2722 	int    a2 = (opt_S < opt_AL) ? opt_S : opt_AL;
2723 	if (r1 > 0.0) len_upper_bound = (int) ( ((float) len)  / r1);
2724 	if ((len+a2) < len_upper_bound)  len_upper_bound = len+a2;
2725 
2726 	return len_upper_bound;
2727 } // END upper_bound_length_rep
upper_bound_length_rep(int len,const Options & options)2728 int upper_bound_length_rep(int len, const Options & options )
2729 {
2730 	double opt_s = options.diff_cutoff;
2731 	int    opt_S = options.diff_cutoff_aa;
2732 	double opt_aL = options.long_coverage;
2733 	int    opt_AL = options.long_control;
2734 	return upper_bound_length_rep( len, opt_s, opt_S, opt_aL, opt_AL );
2735 }
2736 
2737 
cal_aax_cutoff(double & aa1_cutoff,double & aa2_cutoff,double & aan_cutoff,double cluster_thd,int tolerance,int naa_stat_start_percent,int naa_stat[5][61][4],int NAA)2738 void cal_aax_cutoff(double &aa1_cutoff, double &aa2_cutoff, double &aan_cutoff,
2739 		double cluster_thd, int tolerance, int naa_stat_start_percent,
2740 		int naa_stat[5][61][4], int NAA)
2741 {
2742 	aa1_cutoff = cluster_thd;
2743 	aa2_cutoff = 1 - (1-cluster_thd)*2;
2744 	aan_cutoff = 1 - (1-cluster_thd)*NAA;
2745 	if (tolerance==0) return;
2746 
2747 	int clstr_idx = (int) (cluster_thd * 100) - naa_stat_start_percent;
2748 	if (clstr_idx <0) clstr_idx = 0;
2749 	double d2  = ((double) (naa_stat[tolerance-1][clstr_idx][3]     )) / 100;
2750 	double dn  = ((double) (naa_stat[tolerance-1][clstr_idx][5-NAA] )) / 100;
2751 	aa2_cutoff = d2 > aa2_cutoff ? d2 : aa2_cutoff;
2752 	aan_cutoff = dn > aan_cutoff ? dn : aan_cutoff;
2753 	return;
2754 } // END cal_aax_cutoff
2755 
2756 
update_aax_cutoff(double & aa1_cutoff,double & aa2_cutoff,double & aan_cutoff,int tolerance,int naa_stat_start_percent,int naa_stat[5][61][4],int NAA,double cluster_thd)2757 void update_aax_cutoff(double &aa1_cutoff, double &aa2_cutoff, double &aan_cutoff,
2758 		int tolerance, int naa_stat_start_percent,
2759 		int naa_stat[5][61][4], int NAA, double cluster_thd)
2760 {
2761 	if (cluster_thd > 1.0) cluster_thd = 1.00;
2762 
2763 	double aa1_t, aa2_t, aan_t;
2764 	cal_aax_cutoff(aa1_t, aa2_t, aan_t, cluster_thd, tolerance, naa_stat_start_percent,
2765 			naa_stat, NAA);
2766 	if (aa1_t > aa1_cutoff) aa1_cutoff = aa1_t;
2767 	if (aa2_t > aa2_cutoff) aa2_cutoff = aa2_t;
2768 	if (aan_t > aan_cutoff) aan_cutoff = aan_t;
2769 	return;
2770 } // END update_aax_cutoff
2771 
ComputeRequiredBases(int NAA,int ss,const Options & option)2772 void WorkingParam::ComputeRequiredBases( int NAA, int ss, const Options & option )
2773 {
2774 	// d: distance, fraction of errors;
2775 	// e: number of errors;
2776 	// g: length of the maximum gap;
2777 	// m: word length;
2778 	// n: sequence length;
2779 	// alignment length = n - g + 1;
2780 	// d = e / (n - g + 1);
2781 	// e >= 1, so that, g <= n + 1 - 1/d
2782 	// word count = (n - g - m + 1) - (e - 1)*m;
2783 	//            = (n - g - m + 1) - (d*(n - g + 1) - 1)*m
2784 	//            = (n - g + 1) - d*m*(n - g + 1)
2785 	//            = (n - g + 1)*(1 - d*m)
2786 	// minimum word count is reached when g == n + 1 - 1/d
2787 	// so, minimum word count = 1/d - m.
2788 	// if g == band_width: word count = (n - band + 1)*(1 - d*m);
2789 	if( options.useDistance ){
2790 		int band = options.band_width + 1;
2791 		int invd = int( 1.0 / (options.distance_thd + 1E-9) );
2792 		int k = len_eff < invd ? len_eff : invd;
2793 		int ks = len_eff - ss + 1;
2794 		int kn = len_eff - NAA + 1;
2795 		int ks2 = invd - ss;
2796 		int kn2= invd - NAA;
2797 		int ks3 = int((len_eff - band + 1.0)*(1.0 - options.distance_thd * ss));
2798 		int kn3 = int((len_eff - band + 1.0)*(1.0 - options.distance_thd * NAA));
2799 		//if( ks3 > ks2 ) ks2 = ks3;
2800 		//if( kn3 > kn2 ) kn2 = kn3;
2801 		required_aa1 = required_aas = (ks2 < ks ? ks2 : ks);
2802 		required_aan = kn2 < kn ? kn2 : kn;
2803 		if( required_aa1 <=0 ) required_aa1 = required_aas = 1;
2804 		if( required_aan <=0 ) required_aan = 1;
2805 		//required_aa1 = required_aas = required_aan = 0;
2806 		return;
2807 	}
2808 	// (N-K)-K*(1-C)*N = C*K*N-(K-1)*N-K = (C*K-K+1)*N-K
2809 	required_aa1 = (len_eff - ss) - int(ss * ceil( (1.0 - aa1_cutoff) * len_eff ));
2810 	if( required_aa1 < 0 ) required_aa1 = 0;
2811 	required_aas = required_aa1;
2812 	required_aan = (len_eff - NAA) - int(NAA * ceil( (1.0 - aa1_cutoff) * len_eff ));
2813 	//printf( "%i %i\n", required_aa1, required_aan );
2814 	if( required_aan < 0 ) required_aan = 0;
2815 
2816 	int aa1_old = int (aa1_cutoff* (double) len_eff) - ss + 1;
2817 	int aas_old = int (aas_cutoff* (double) len_eff);
2818 	int aan_old = int (aan_cutoff* (double) len_eff);
2819 
2820 	double thd = option.cluster_thd;
2821 	//double rest = (len_eff - ss) / double(len_eff * ss);
2822 	double rest = (len_eff - NAA) / double(len_eff * NAA);
2823 	double thd0 = 1.0 - rest;
2824 	double fnew = 0;
2825 	double fold = 1;
2826 	if( thd > thd0 ){
2827 		fnew = (thd - thd0) / rest;
2828 		fold = 1.0 - fnew;
2829 	}
2830 	//printf( "%g %g %g\n", thd, thd0, fnew );
2831 
2832 	required_aa1 = (int)(fnew*required_aa1 + fold*aa1_old);
2833 	required_aas = (int)(fnew*required_aas + fold*aas_old);
2834 	required_aan = (int)(fnew*required_aan + fold*aan_old);
2835 }
EncodeWords(Sequence * seq,int NAA,bool est)2836 int WorkingBuffer::EncodeWords( Sequence *seq, int NAA, bool est )
2837 {
2838 	char *seqi = seq->data;
2839 	int len = seq->size;
2840 	// check_word_encodes
2841 	int aan_no = len - NAA + 1;
2842 	int i, j, i0, i1;
2843 	int skip = 0;
2844 	unsigned char k, k1;
2845 	for (j=0; j<aan_no; j++) {
2846 		char *word = seqi + j;
2847 		int encode = 0;
2848 		for (k=0, k1=NAA-1; k<NAA; k++, k1--) encode += word[k] * NAAN_array[k1];
2849 		word_encodes[j] = word_encodes_backup[j] = encode;
2850 	}
2851 
2852 	if( est ){
2853 		for (j=0; j<len; j++){
2854 			if ( seqi[j] >= 4 ) {                      // here N is 4
2855 				i0 = (j-NAA+1 > 0) ? j-NAA+1 : 0;
2856 				i1 = j < aan_no ? j : aan_no - 1;
2857 				for (i=i0; i<=i1; i++) word_encodes[i]=-1;
2858 			}
2859 		}
2860 		for (j=0; j<aan_no; j++) skip += (word_encodes[j] == -1);
2861 	}
2862 
2863 	std::sort( word_encodes.begin(), word_encodes.begin() + aan_no );
2864 	for(j=0; j<aan_no; j++) word_encodes_no[j]=1;
2865 	for(j=aan_no-1; j; j--) {
2866 		if (word_encodes[j] == word_encodes[j-1]) {
2867 			word_encodes_no[j-1] += word_encodes_no[j];
2868 			word_encodes_no[j]=0;
2869 		}
2870 	}
2871 	return skip;
2872 	// END check_word_encodes
2873 }
ComputeAAP(const char * seqi,int size)2874 void WorkingBuffer::ComputeAAP( const char *seqi, int size )
2875 {
2876 	int len1 = size - 1;
2877 	int sk, j1, mm, c22;
2878 	for (sk=0; sk<NAA2; sk++) taap[sk] = 0;
2879 	for (j1=0; j1<len1; j1++) {
2880 		c22= seqi[j1]*NAA1 + seqi[j1+1];
2881 		taap[c22]++;
2882 	}
2883 	for (sk=0,mm=0; sk<NAA2; sk++) {
2884 		aap_begin[sk] = mm; mm+=taap[sk]; taap[sk] = 0;
2885 	}
2886 	for (j1=0; j1<len1; j1++) {
2887 		c22= seqi[j1]*NAA1 + seqi[j1+1];
2888 		aap_list[aap_begin[c22]+taap[c22]++] =j1;
2889 	}
2890 }
ComputeAAP2(const char * seqi,int size)2891 void WorkingBuffer::ComputeAAP2( const char *seqi, int size )
2892 {
2893 	int len1 = size - 3;
2894 	int sk, j1, mm, c22;
2895 	for (sk=0; sk<NAA4; sk++) taap[sk] = 0;
2896 	for (j1=0; j1<len1; j1++) {
2897 		if ((seqi[j1]>=4) || (seqi[j1+1]>=4) || (seqi[j1+2]>=4) || (seqi[j1+3]>=4)) continue; //skip N
2898 		c22 = seqi[j1]*NAA3 + seqi[j1+1]*NAA2 + seqi[j1+2]*NAA1 + seqi[j1+3];
2899 		taap[c22]++;
2900 	}
2901 	for (sk=0,mm=0; sk<NAA4; sk++) {
2902 		aap_begin[sk] = mm;  mm += taap[sk];  taap[sk] = 0;
2903 	}
2904 	for (j1=0; j1<len1; j1++) {
2905 		if ((seqi[j1]>=4) || (seqi[j1+1]>=4) || (seqi[j1+2]>=4) || (seqi[j1+3]>=4)) continue; //skip N
2906 		c22 = seqi[j1]*NAA3 + seqi[j1+1]*NAA2 + seqi[j1+2]*NAA1 + seqi[j1+3];
2907 		aap_list[aap_begin[c22]+taap[c22]++] =j1;
2908 	}
2909 }
2910 
ClusterOne(Sequence * seq,int id,WordTable & table,WorkingParam & param,WorkingBuffer & buffer,const Options & options)2911 void SequenceDB::ClusterOne( Sequence *seq, int id, WordTable & table,
2912 		WorkingParam & param, WorkingBuffer & buffer, const Options & options )
2913 {
2914 	if (seq->state & IS_REDUNDANT) return;
2915 	int frag_size = options.frag_size;
2916 	int NAA = options.NAA;
2917 	int len = seq->size;
2918 	int len_bound = upper_bound_length_rep(len, options);
2919 	param.len_upper_bound = len_bound;
2920 	int flag = CheckOne( seq, table, param, buffer, options );
2921 
2922 	if( flag == 0 ){
2923 		if ((seq->identity>0) && (options.cluster_best)) {
2924 			// because of the -g option, this seq is similar to seqs in old SEGs
2925 			seq->state |= IS_REDUNDANT ;
2926 			seq->Clear();
2927 		} else {                  // else add to NR90 db
2928 			int aan_no = len - NAA + 1;
2929 			int size = rep_seqs.size();
2930 			rep_seqs.Append( id );
2931 			seq->cluster_id = size;
2932 			seq->identity = 0;
2933 			seq->state |= IS_REP;
2934 			if (frag_size){ /* not used for EST */
2935 				int frg1 = (len - NAA ) / frag_size + 1;
2936 				table.AddWordCountsFrag( aan_no, buffer.word_encodes_backup,
2937 						buffer.word_encodes_no, frg1, frag_size );
2938 			}else{
2939 				table.AddWordCounts(aan_no, buffer.word_encodes, buffer.word_encodes_no, table.sequences.size(), options.isEST);
2940 			}
2941 			table.sequences.Append( seq );
2942 			if( frag_size ){
2943 				while( table.sequences.size() < table.frag_count )
2944 					table.sequences.Append( seq );
2945 			}
2946 		}
2947 	}
2948 	if ( (id+1) % 1000 == 0 ) {
2949 		int size = rep_seqs.size();
2950 		printf( "." );
2951 		fflush( stdout );
2952 		if ( (id+1) % 10000 == 0 ) printf( "\r..........%9i  finished  %9i  clusters\n", id+1, size );
2953 	}
2954 }
2955 #include<assert.h>
MinimalMemory(int frag_no,int bsize,int T,const Options & options,size_t extra)2956 size_t SequenceDB::MinimalMemory( int frag_no, int bsize, int T, const Options & options, size_t extra )
2957 {
2958 	int N = sequences.size();
2959 	int F = frag_no < MAX_TABLE_SEQ ? frag_no : MAX_TABLE_SEQ;
2960 	size_t mem_need = 0;
2961 	size_t mem, mega = 1000000;
2962 	int table = T > 1 ? 2 : 1;
2963 
2964 	printf( "\nApproximated minimal memory consumption:\n" );
2965 	mem = N*sizeof(Sequence) + total_desc + N + extra;
2966 	if( options.store_disk == false ) mem += total_letter + N;
2967 	printf( "%-16s: %zuM\n", "Sequence", mem/mega );
2968 	mem_need += mem;
2969 
2970 	mem = bsize;
2971 	printf( "%-16s: %i X %zuM = %zuM\n", "Buffer", T, mem/mega, T*mem/mega );
2972 	mem_need += T*mem;
2973 
2974 	mem = F*(sizeof(Sequence*) + sizeof(IndexCount)) + NAAN*sizeof(NVector<IndexCount>);
2975 	printf( "%-16s: %i X %zuM = %zuM\n", "Table", table, mem/mega, table*mem/mega );
2976 	mem_need += table*mem;
2977 
2978 	mem = sequences.capacity()*sizeof(Sequence*) + N*sizeof(int);
2979 	mem += Comp_AAN_idx.size()*sizeof(int);
2980 	printf( "%-16s: %zuM\n", "Miscellaneous", mem/mega );
2981 	mem_need += mem;
2982 
2983 	printf( "%-16s: %zuM\n\n", "Total", mem_need/mega );
2984 
2985 	if(options.max_memory and options.max_memory < mem_need + 50*table ){
2986 		char msg[200];
2987 		sprintf( msg, "not enough memory, please set -M option greater than %zu\n",
2988 				50*table + mem_need/mega );
2989 		bomb_error(msg);
2990 	}
2991 	return mem_need;
2992 }
MemoryLimit(size_t mem_need,const Options & options)2993 size_t MemoryLimit( size_t mem_need, const Options & options )
2994 {
2995 	size_t mem_limit = (options.max_memory - mem_need) / sizeof(IndexCount);
2996 
2997 	//printf( "Table limit with the given memory limit:\n" );
2998 	if( options.max_memory == 0 ){
2999 		mem_limit = options.max_entries;
3000 		if( mem_limit > MAX_TABLE_SIZE ) mem_limit = MAX_TABLE_SIZE;
3001 	}
3002 	//printf( "Max number of representatives: %zu\n", mem_limit );
3003 	//printf( "Max number of word counting entries: %zu\n\n", mem_limit );
3004 	return mem_limit;
3005 }
ComputeTableLimits(int min_len,int max_len,int typical_len,size_t mem_need)3006 void Options::ComputeTableLimits( int min_len, int max_len, int typical_len, size_t mem_need )
3007 {
3008 //liwz Fri Jan 15 15:44:47 PST 2016
3009 //T=1 scale=1
3010 //T=2 scale=0.6035
3011 //T=4 scale=0.375
3012 //T=8 scale=0.2392
3013 //T=16 scale=0.1562
3014 //T=32 scale=0.104
3015 //T=64 scale=0.0703
3016 
3017 	double scale = 0.5/threads + 0.5/sqrt(threads);
3018 	max_sequences = (size_t)(scale * MAX_TABLE_SEQ);
3019 	max_entries = (size_t)(scale * (500*max_len + 500000*typical_len + 50000000));
3020 	if( max_memory ){
3021 		double frac = max_sequences / (double) max_entries;
3022 		max_entries = (options.max_memory - mem_need) / sizeof(IndexCount);
3023 		max_sequences = (size_t)(max_entries * frac);
3024 		if( max_sequences < MAX_TABLE_SEQ / 100 ) max_sequences = MAX_TABLE_SEQ / 100;
3025 		if( max_sequences > MAX_TABLE_SEQ ) max_sequences = MAX_TABLE_SEQ;
3026 	}
3027 	printf( "Table limit with the given memory limit:\n" );
3028 	printf( "Max number of representatives: %zu\n", max_sequences );
3029 	printf( "Max number of word counting entries: %zu\n\n", max_entries );
3030 }
DoClustering(int T,const Options & options)3031 void SequenceDB::DoClustering( int T, const Options & options )
3032 {
3033 	int i, j, k;
3034 	int NAA = options.NAA;
3035 	double aa1_cutoff = options.cluster_thd;
3036 	double aas_cutoff = 1 - (1-options.cluster_thd)*4;
3037 	double aan_cutoff = 1 - (1-options.cluster_thd)*options.NAA;
3038 	int seq_no = sequences.size();
3039 	int frag_no = seq_no;
3040 	int frag_size = options.frag_size;
3041 	int len, len_bound;
3042 	int flag;
3043 	valarray<size_t>  letters(T);
3044 
3045 	//printf( "%li\n", options.mem_limit );
3046 
3047 	if (frag_size){
3048 		frag_no = 0;
3049 		for (i=0; i<seq_no; i++) frag_no += (sequences[i]->size - NAA) / frag_size + 1;
3050 	}
3051 
3052 	if( not options.isEST )
3053 		cal_aax_cutoff(aa1_cutoff, aas_cutoff, aan_cutoff, options.cluster_thd,
3054 				options.tolerance, naa_stat_start_percent, naa_stat, NAA);
3055 
3056 	Vector<WorkingParam> params(T);
3057 	Vector<WorkingBuffer> buffers(T);
3058 	for(i=0; i<T; i++){
3059 		params[i].Set( aa1_cutoff, aas_cutoff, aan_cutoff );
3060 		buffers[i].Set( frag_no, max_len, options );
3061 	}
3062 
3063 	// word_table as self comparing table and table buffer:
3064 	WordTable word_table( options.NAA, NAAN );
3065 
3066 	WordTable last_table( options.NAA, NAAN );
3067 
3068 	int N = sequences.size();
3069 	int K = N - 100 * T;
3070 	size_t mem_need = MinimalMemory( frag_no, buffers[0].total_bytes, T, options );
3071 	size_t mem_limit = MemoryLimit( mem_need, options );
3072 	size_t mem, mega = 1000000;
3073 	size_t tabsize = 0;
3074 	int remaining = 0;
3075 
3076 	Options opts( options );
3077 	opts.ComputeTableLimits( min_len, max_len, len_n50, mem_need );
3078 
3079 	omp_set_num_threads(T);
3080 	for(i=0; i<N; ){
3081 		int start = i;
3082 		int m = i;
3083 		size_t sum = remaining;
3084 		float redundancy = (rep_seqs.size() + 1.0) / (i + 1.0);
3085 		size_t max_items = opts.max_entries;
3086 		size_t max_seqs = opts.max_sequences;
3087 		size_t items = 0;
3088 		if( i == 0 && max_seqs > 1000 ){ // first SCB with small size
3089 			max_items /= 8;
3090 			max_seqs /= 8;
3091 		}
3092 		while( m < N && (sum*redundancy) < max_seqs && items < max_items ){
3093 			Sequence *seq = sequences[m];
3094 			if( ! (seq->state & IS_REDUNDANT) ){
3095 				if ( options.store_disk ) seq->SwapIn();
3096 				//items += seq->size;
3097 				items += (size_t)(seq->size * redundancy);
3098 				sum += 1;
3099 			}
3100 			m ++;
3101 		}
3102 		if( (m > i + 1E4) && (m > i + (N - i) / (2+T)) ) m = i + (N - i) / (2+T);
3103 		if( m == i || m >= N ){
3104 			m = N;
3105 			if( m > i + 1E3 ) m = i + (N - i) / (2+T);
3106 		}
3107 		//printf( "m = %i  %i,  %i\n", i, m, m-i );
3108 		printf( "\r# comparing sequences from  %9i  to  %9i\n", i, m );
3109 		if( last_table.size ){
3110 			int print = (m-i)/20 + 1;
3111 			#pragma omp parallel for schedule( dynamic, 1 )
3112 			for(int j=i; j<m; j++){
3113 				Sequence *seq = sequences[j];
3114 				if (seq->state & IS_REDUNDANT) continue;
3115 				int tid = omp_get_thread_num();
3116 				CheckOne( seq, last_table, params[tid], buffers[tid], options );
3117 				if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3118 				if( j%print==0 ){
3119 					printf( "." ); fflush( stdout );
3120 				}
3121 			}
3122 			int may_stop = 0;
3123 			int self_stop = 0;
3124 			float p0 = 0;
3125 			int min = last_table.sequences[ last_table.sequences.size()-1 ]->size;
3126 			int m0 = m;
3127 			bool stop = false;
3128 			#pragma omp parallel for schedule( dynamic, 1 )
3129 			for(int j=m-1; j<N; j++){
3130 				#pragma omp flush (stop)
3131 				if( ! stop ){
3132 					if( j+1 == N ) may_stop = 1;
3133 					if( j == (m0-1) ){ // use m0 to avoid other iterations satisfying the condition:
3134 						int tid = omp_get_thread_num();
3135 						for(int ks=i; ks<m; ks++){
3136 							Sequence *seq = sequences[ks];
3137 							i = ks + 1;
3138 							if (seq->state & IS_REDUNDANT) continue;
3139 							ClusterOne( seq, ks, word_table, params[tid], buffers[tid], options );
3140 							if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3141 							if( may_stop and word_table.sequences.size() >= 100 ) break;
3142 							if( word_table.size >= max_items ) break;
3143 							int tmax = max_seqs - (frag_size ? seq->size / frag_size + 1 : 0);
3144 							if( word_table.sequences.size() >= tmax ) break;
3145 						}
3146 						self_stop = 1;
3147 					}else{
3148 						Sequence *seq = sequences[j];
3149 						if (seq->state & IS_REDUNDANT) continue;
3150 						if ( options.store_disk ){
3151 							#pragma omp critical
3152 							seq->SwapIn();
3153 						}
3154 						int tid = omp_get_thread_num();
3155 						CheckOne( seq, last_table, params[tid], buffers[tid], options );
3156 						if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3157 						if( min > params[tid].len_upper_bound ){
3158 							may_stop = 1;
3159 							stop = true;
3160 							#pragma omp flush (stop)
3161 						}
3162 						if( self_stop && tid ==1 ){
3163 							float p = (100.0*j)/N;
3164 							if( p > p0+1E-1 ){ // print only if the percentage changed
3165 								printf( "\r%4.1f%%", p );
3166 								fflush( stdout );
3167 								p0 = p;
3168 							}
3169 						}
3170 
3171 					}
3172 				}
3173 			}
3174 		}
3175 		if( i == start || m == N ){
3176 			//printf( "comparing the first or last or very small group ...\n" ); fflush( stdout );
3177 			for(k=i; k<m; ){
3178 				int kk, mm = k, sum = 0;
3179 				while( mm < m && sum < 1E5 ){
3180 					if( ! (sequences[mm]->state & IS_REDUNDANT) ) sum += sequences[mm]->size;
3181 					mm += 1;
3182 				}
3183 				if( mm < k + 1000 ) mm = k + 1000;
3184 				if( mm > m ) mm = m;
3185 				#pragma omp parallel for schedule( dynamic, 1 )
3186 				for(kk=k; kk<mm; kk++){
3187 					Sequence *seq = sequences[kk];
3188 					if (seq->state & IS_REDUNDANT) continue;
3189 					int tid = omp_get_thread_num();
3190 					CheckOne( seq, word_table, params[tid], buffers[tid], options );
3191 					if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3192 				}
3193 				bool bk = false;
3194 				for(int ks=k; ks<mm; ks++){
3195 					Sequence *seq = sequences[ks];
3196 					i = k = ks + 1;
3197 					if (seq->state & IS_REDUNDANT) continue;
3198 					ClusterOne( seq, ks, word_table, params[0], buffers[0], options );
3199 					bk = true;
3200 					if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3201 					if( word_table.size >= max_items ) break;
3202 					int tmax = max_seqs - (frag_size ? seq->size / frag_size + 1 : 0);
3203 					if( word_table.sequences.size() >= tmax ) break;
3204 					bk = false;
3205 				}
3206 				if( bk ) break;
3207 			}
3208 		}else if( i < m ){
3209 			remaining = remaining/2 + (m - i);
3210 			printf( "\r---------- %6i remaining sequences to the next cycle\n", m-i );
3211 		}
3212 		printf( "---------- new table with %8i representatives\n", word_table.sequences.size() );
3213 		if( (last_table.size + word_table.size) > tabsize )
3214 			tabsize = last_table.size + word_table.size;
3215 		last_table.Clear();
3216 		last_table.sequences.swap( word_table.sequences );
3217 		last_table.indexCounts.swap( word_table.indexCounts );
3218 		last_table.size = word_table.size;
3219 		word_table.size = 0;
3220 	}
3221 	printf( "\n%9i  finished  %9i  clusters\n", sequences.size(), rep_seqs.size() );
3222 	mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
3223 	printf( "\nApproximated maximum memory consumption: %zuM\n", mem );
3224 	last_table.Clear();
3225 	word_table.Clear();
3226 }
3227 
CheckOne(Sequence * seq,WordTable & table,WorkingParam & param,WorkingBuffer & buf,const Options & options)3228 int SequenceDB::CheckOne( Sequence *seq, WordTable & table, WorkingParam & param, WorkingBuffer & buf, const Options & options )
3229 {
3230 	int len = seq->size;
3231 	param.len_upper_bound = upper_bound_length_rep(len, options);
3232 	if( options.isEST ) return CheckOneEST( seq, table, param, buf, options );
3233 	return CheckOneAA( seq, table, param, buf, options );
3234 }
CheckOneAA(Sequence * seq,WordTable & table,WorkingParam & param,WorkingBuffer & buf,const Options & options)3235 int SequenceDB::CheckOneAA( Sequence *seq, WordTable & table, WorkingParam & param, WorkingBuffer & buf, const Options & options )
3236 {
3237 	NVector<IndexCount> & lookCounts = buf.lookCounts;
3238 	NVector<uint32_t> & indexMapping = buf.indexMapping;
3239 	Vector<INTs> & word_encodes_no = buf.word_encodes_no;
3240 	Vector<INTs> & aap_list = buf.aap_list;
3241 	Vector<INTs> & aap_begin = buf.aap_begin;
3242 	Vector<int>  & word_encodes = buf.word_encodes;
3243 	Vector<int>  & taap = buf.taap;
3244 	double aa1_cutoff = param.aa1_cutoff;
3245 	double aa2_cutoff = param.aas_cutoff;
3246 	double aan_cutoff = param.aan_cutoff;
3247 
3248 	char *seqi = seq->data;
3249 	int j, k, j1, len = seq->size;
3250 	int flag = 0;
3251 	int frag_size = options.frag_size;
3252 	int & aln_cover_flag = param.aln_cover_flag;
3253 	int & required_aa1 = param.required_aa1;
3254 	int & required_aa2 = param.required_aas;
3255 	int & required_aan = param.required_aan;
3256 	int & min_aln_lenS = param.min_aln_lenS;
3257 	int & min_aln_lenL = param.min_aln_lenL;
3258 
3259 	int NAA = options.NAA;
3260 	int S = table.sequences.size();
3261 	int len_eff = len;
3262 
3263 	if( S ){
3264 		int min = table.sequences[S-1]->size;
3265 		if( min < len ){
3266 			if( len * options.diff_cutoff2 > min ) min = (int)(len * options.diff_cutoff2);
3267 			if( (len - options.diff_cutoff_aa2) > min ) min = len - options.diff_cutoff_aa2;
3268 			len_eff = min;
3269 		}
3270 	}
3271 
3272     //liwz 2016 01, seq is too short for the shortest (longer) seq in word_table to satisfy -aL option
3273     //longer seqeunce * -aL -band_width
3274     if ( S ) {
3275 		int min = table.sequences[S-1]->size;
3276 		int min_red = min * options.long_coverage - options.band_width;
3277 		if (len < min_red) return 0; // return flag=0
3278 	}
3279 
3280 	param.ControlShortCoverage( len_eff, options );
3281 	param.ComputeRequiredBases( options.NAA, 2, options );
3282 
3283 	buf.EncodeWords( seq, options.NAA, false );
3284 
3285 	// if minimal alignment length > len, return
3286 	// I can not return earlier, because I need to calc the word_encodes etc
3287 	if (options.min_control>len) return 0; // return flag=0
3288 
3289 	// lookup_aan
3290 	int aan_no = len - options.NAA + 1;
3291 	int M = frag_size ? table.frag_count : S;
3292 	table.CountWords(aan_no, word_encodes, word_encodes_no, lookCounts, indexMapping, false, required_aan);
3293 
3294 	// contained_in_old_lib()
3295 	int len_upper_bound = param.len_upper_bound;
3296 	int len_lower_bound = param.len_lower_bound;
3297 	int band_left, band_right, best_score, band_width1, best_sum, len2, alnln, len_eff1;
3298 	int tiden_no, band_center;
3299 	float tiden_pc, distance=0;
3300 	int talign_info[5];
3301 	int best1, sum;
3302 	INTs *lookptr;
3303 	char *seqj;
3304 	int frg2 = frag_size ? (len - NAA + options.band_width ) / frag_size + 1 + 1 : 0;
3305 	int lens;
3306 	int has_aa2 = 0;
3307 
3308 	IndexCount *ic = lookCounts.items;
3309 	ic = lookCounts.items;
3310 	for(; ic->count; ic++){
3311 		if( ! frag_size ){
3312 			indexMapping[ ic->index ] = 0;
3313 			if ( ic->count < required_aan ) continue;
3314 		}
3315 
3316 		Sequence *rep = table.sequences[ ic->index ];
3317 		len2 = rep->size;
3318 		if (len2 > len_upper_bound ) continue;
3319 		if (options.has2D && len2 < len_lower_bound ) continue;
3320 		if( frag_size ){
3321 			uint32_t *ims = & indexMapping[ ic->index ];
3322 			int count = ic->count;
3323 			k = (len2 - NAA) / frag_size + 1;
3324 			sum = 0;
3325 			for (j1=0; j1<frg2; j1++){
3326 				uint32_t im = ims[j1];
3327 				if( im ) sum += lookCounts[im-1].count;
3328 			}
3329 			count = sum;
3330 			for (j1=frg2; j1<k; j1++) {
3331 				uint32_t im1 = ims[j1];
3332 				uint32_t im2 = ims[j1-frg2];
3333 				if( im1 ) sum += lookCounts[im1-1].count;
3334 				if( im2 ) sum -= lookCounts[im2-1].count;
3335 				if (sum > count) count = sum;
3336 			}
3337 			if ( count < required_aan ) continue;
3338 		}
3339 
3340 		param.ControlLongCoverage( len2, options );
3341 
3342 		if ( has_aa2 == 0 )  { // calculate AAP array
3343 			buf.ComputeAAP( seqi, seq->size );
3344 			has_aa2 = 1;
3345 		}
3346 		seqj = rep->data; //NR_seq[NR90_idx[j]];
3347 
3348 		band_width1 = (options.band_width < len+len2-2 ) ? options.band_width : len+len2-2;
3349 		diag_test_aapn(NAA1, seqj, len, len2, buf, best_sum,
3350 				band_width1, band_left, band_center, band_right, required_aa1);
3351 		if ( best_sum < required_aa2 ) continue;
3352 
3353 		int rc = FAILED_FUNC;
3354 		if (options.print || aln_cover_flag) //return overlap region
3355 			rc = local_band_align(seqi, seqj, len, len2, mat,
3356 					best_score, tiden_no, alnln, distance, talign_info,
3357 					band_left, band_center, band_right, buf);
3358 		else
3359 			rc = local_band_align(seqi, seqj, len, len2, mat,
3360 					best_score, tiden_no, alnln, distance, talign_info,
3361 					band_left, band_center, band_right, buf);
3362 		if ( rc == FAILED_FUNC ) continue;
3363 		if ( tiden_no < required_aa1 ) continue;
3364 		lens = len;
3365 		if( options.has2D && len > len2 ) lens = len2;
3366 		len_eff1 = (options.global_identity == 0) ? alnln : (lens - talign_info[4]);
3367 		tiden_pc = tiden_no / (float) len_eff1;
3368 		if( options.useDistance ){
3369 			if (distance > options.distance_thd ) continue;
3370 			if (distance >= seq->distance) continue; // existing distance
3371 		}else{
3372 			if (tiden_pc < options.cluster_thd) continue;
3373 			if (tiden_pc <= seq->identity) continue; // existing iden_no
3374 		}
3375 		if (aln_cover_flag) {
3376 			if ( talign_info[3]-talign_info[2]+1 < min_aln_lenL) continue;
3377 			if ( talign_info[1]-talign_info[0]+1 < min_aln_lenS) continue;
3378 		}
3379 		if( options.has2D ) seq->state |= IS_REDUNDANT ;
3380 		flag = 1; seq->identity = tiden_pc; seq->cluster_id = rep->cluster_id;
3381 		seq->distance = distance;
3382 		seq->coverage[0] = talign_info[0] +1;
3383 		seq->coverage[1] = talign_info[1] +1;
3384 		seq->coverage[2] = talign_info[2] +1;
3385 		seq->coverage[3] = talign_info[3] +1;
3386 		if (not options.cluster_best) break;
3387 		update_aax_cutoff(aa1_cutoff, aa2_cutoff, aan_cutoff,
3388 				options.tolerance, naa_stat_start_percent, naa_stat, NAA, tiden_pc);
3389 		param.ComputeRequiredBases( options.NAA, 2, options );
3390 	}
3391 	if( frag_size ) ic = lookCounts.items;
3392 	while( ic->count ){
3393 		indexMapping[ ic->index ] = 0;
3394 		ic += 1;
3395 	}
3396 	lookCounts.size = 0;
3397 	if (flag == 1) { // if similar to old one delete it
3398 		if (! options.cluster_best) {
3399 			seq->Clear();
3400 			seq->state |= IS_REDUNDANT ;
3401 		}
3402 	}
3403 	return flag;
3404 
3405 }
CheckOneEST(Sequence * seq,WordTable & table,WorkingParam & param,WorkingBuffer & buf,const Options & options)3406 int SequenceDB::CheckOneEST( Sequence *seq, WordTable & table, WorkingParam & param, WorkingBuffer & buf, const Options & options )
3407 {
3408 	NVector<IndexCount> & lookCounts = buf.lookCounts;
3409 	NVector<uint32_t> & indexMapping = buf.indexMapping;
3410 	Vector<INTs> & word_encodes_no = buf.word_encodes_no;
3411 	Vector<INTs> & aap_list = buf.aap_list;
3412 	Vector<INTs> & aap_begin = buf.aap_begin;
3413 	Vector<int>  & word_encodes = buf.word_encodes;
3414 	Vector<int>  & taap = buf.taap;
3415 	Vector<int> & aan_list_comp = buf.aan_list_comp;
3416 	char *seqi_comp = & buf.seqi_comp[0];
3417 
3418 	int & aln_cover_flag = param.aln_cover_flag;
3419 	int & required_aa1 = param.required_aa1;
3420 	int & required_aas = param.required_aas;
3421 	int & required_aan = param.required_aan;
3422 	int & min_aln_lenS = param.min_aln_lenS;
3423 	int & min_aln_lenL = param.min_aln_lenL;
3424 
3425 	char *seqi = seq->data;
3426 	int j, len = seq->size;
3427 	int flag = 0;
3428 	int S = table.sequences.size();
3429 	int len_eff = len;
3430 	if( S ){
3431 		int min = table.sequences[S-1]->size;
3432 		if( min < len ){
3433 			if( len * options.diff_cutoff2 > min ) min = (int)(len * options.diff_cutoff2);
3434 			if( (len - options.diff_cutoff_aa2) > min ) min = len - options.diff_cutoff_aa2;
3435 			len_eff = min;
3436 		}
3437 	}
3438 
3439 
3440         //liwz 2016 01, seq is too short for the shortest (longer) seq in word_table to satisfy -aL option
3441         //longer seqeunce * -aL -band_width
3442         if ( S ) {
3443 		int min = table.sequences[S-1]->size;
3444 		int min_red = min * options.long_coverage - options.band_width;
3445 		if (len < min_red) return 0; // return flag=0
3446 	}
3447 
3448 
3449 	param.ControlShortCoverage( len_eff, options );
3450 	param.ComputeRequiredBases( options.NAA, 4, options );
3451 	int skip = buf.EncodeWords( seq, options.NAA, true );
3452 	required_aan -= skip;
3453 	required_aas -= skip;
3454 	required_aa1 -= skip;
3455 	if( required_aan <= 0 ) required_aan = 1;
3456 	if( required_aas <= 0 ) required_aas = 1;
3457 	if( required_aa1 <= 0 ) required_aa1 = 1;
3458 
3459 	// if minimal alignment length > len, return
3460 	// I can not return earlier, because I need to calc the word_encodes etc
3461 	if (options.min_control>len) return 0; // return flag=0
3462 
3463 	int aan_no = len - options.NAA + 1;
3464 
3465 	// contained_in_old_lib()
3466 	int len_upper_bound = param.len_upper_bound;
3467 	int len_lower_bound = param.len_lower_bound;
3468 	int band_left, band_right, best_score, band_width1, best_sum, len2, alnln, len_eff1;
3469 	int tiden_no, band_center;
3470 	float tiden_pc, distance=0;
3471 	int talign_info[5];
3472 	int j0, comp, lens;
3473 	char *seqj;
3474 
3475 	for(comp=0; comp<2; comp++){
3476 		if( comp ){
3477 			for (j0=0; j0<aan_no; j0++) {
3478 				j = word_encodes[j0];
3479 				if ( j<0 ) aan_list_comp[j0] = j;
3480 				else       aan_list_comp[j0] = Comp_AAN_idx[j];
3481 			}
3482 			make_comp_iseq(len, seqi_comp, seqi);
3483 			seqi = seqi_comp;
3484 		}
3485 		int has_aas = 0;
3486 
3487 		if( comp ){
3488 			table.CountWords(aan_no, aan_list_comp, word_encodes_no, lookCounts, indexMapping, true, required_aan );
3489 		}else{
3490 			table.CountWords(aan_no, word_encodes, word_encodes_no, lookCounts, indexMapping, true, required_aan );
3491 		}
3492 
3493 		IndexCount *ic = lookCounts.items;
3494 		ic = lookCounts.items;
3495 		for(; ic->count; ic++){
3496 			indexMapping[ ic->index ] = 0;
3497 			if ( ic->count < required_aan ) continue;
3498 			Sequence *rep = table.sequences[ic->index];
3499 
3500 			len2 = rep->size;
3501 			if (len2 > len_upper_bound ) continue;
3502 			if (options.has2D && len2 < len_lower_bound ) continue;
3503 
3504 			seqj = rep->data;
3505 
3506 			param.ControlLongCoverage( len2, options );
3507 
3508 			if ( has_aas == 0 )  { // calculate AAP array
3509 				buf.ComputeAAP2( seqi, seq->size );
3510 				has_aas = 1;
3511 			}
3512 
3513 			band_width1 = (options.band_width < len+len2-2 ) ? options.band_width : len+len2-2;
3514 			diag_test_aapn_est(NAA1, seqj, len, len2, buf, best_sum,
3515 					band_width1, band_left, band_center, band_right, required_aa1);
3516 			if ( best_sum < required_aas ) continue;
3517 			//if( comp and flag and (not options.cluster_best) and j > rep->cluster_id ) goto Break;
3518 
3519 			int rc = FAILED_FUNC;
3520 			if (options.print || aln_cover_flag){ //return overlap region
3521 				rc = local_band_align(seqi, seqj, len, len2, mat,
3522 						best_score, tiden_no, alnln, distance, talign_info,
3523 						band_left, band_center, band_right, buf);
3524 				if( comp ){
3525 					talign_info[0] = len - talign_info[0] - 1;
3526 					talign_info[1] = len - talign_info[1] - 1;
3527 				}
3528 			}else{
3529 				//printf( "%5i %5i %5i %5i\n", band_width1, band_right-band_left, band_left, band_right );
3530 				rc = local_band_align(seqi, seqj, len, len2, mat,
3531 						best_score, tiden_no, alnln, distance, talign_info,
3532 						band_left, band_center, band_right, buf);
3533 			}
3534 			if ( rc == FAILED_FUNC ) continue;
3535 			//printf( "%i  %i  %i\n", best_score, tiden_no, required_aa1 );
3536 			if ( tiden_no < required_aa1 ) continue;
3537 			if ( options.is454 ){
3538 				if (talign_info[2] != talign_info[0]) continue; // same start
3539 				if (talign_info[0] > 1) continue; // one mismatch allowed at beginning
3540 				if ((len-talign_info[1]) > 2) continue; // one mismatch allowed at end
3541 			}
3542 
3543 			lens = len;
3544 			if( options.has2D && len > len2 ) lens = len2;
3545 			len_eff1 = (options.global_identity == 0) ? alnln : (lens - talign_info[4]);
3546 			tiden_pc = tiden_no / (float)len_eff1;
3547 			//printf( "%i %f\n", tiden_no, tiden_pc );
3548 			if( options.useDistance ){
3549 				if (distance > options.distance_thd ) continue;
3550 				if (options.cluster_best and distance >= seq->distance) continue; // existing distance
3551 			}else{
3552 				if (tiden_pc < options.cluster_thd) continue;
3553 				if (options.cluster_best and tiden_pc < seq->identity) continue; // existing iden_no
3554 			}
3555 			if (aln_cover_flag) {
3556 				if ( talign_info[3]-talign_info[2]+1 < min_aln_lenL) continue;
3557 				if( comp ){
3558 					if ( talign_info[0]-talign_info[1]+1 < min_aln_lenS) continue;
3559 				}else{
3560 					if ( talign_info[1]-talign_info[0]+1 < min_aln_lenS) continue;
3561 				}
3562 			}
3563 			if( options.cluster_best and fabs(tiden_pc - seq->identity) < 1E-9 and rep->cluster_id >= seq->cluster_id ) continue;
3564 			if( (not options.cluster_best) and flag !=0 and rep->cluster_id >= seq->cluster_id ) continue;
3565 			flag = comp ? -1 : 1;
3566 			seq->identity = tiden_pc;
3567 			seq->distance = distance;
3568 			seq->cluster_id = rep->cluster_id;
3569 			seq->coverage[0] = talign_info[0] +1;
3570 			seq->coverage[1] = talign_info[1] +1;
3571 			seq->coverage[2] = talign_info[2] +1;
3572 			seq->coverage[3] = talign_info[3] +1;
3573 			if (not options.cluster_best) break;
3574 		}
3575 		while( ic->count ){
3576 			indexMapping[ ic->index ] = 0;
3577 			ic += 1;
3578 		}
3579 		lookCounts.size = 0;
3580 		if (not options.option_r ) break;
3581 	}
3582 	if ((flag == 1) || (flag == -1)) { // if similar to old one delete it
3583 		if (! options.cluster_best) {
3584 			seq->Clear();
3585 			seq->state |= IS_REDUNDANT ;
3586 		}
3587 		if( flag == -1 )
3588 			seq->state |= IS_MINUS_STRAND;
3589 		else
3590 			seq->state &= ~IS_MINUS_STRAND;
3591 	}
3592 	return flag;
3593 }
ComputeDistance(const Options & options)3594 void SequenceDB::ComputeDistance( const Options & options )
3595 {
3596 	int i, j, N = sequences.size();
3597 	int best_score, best_sum;
3598 	int band_width1, band_left, band_center, band_right, required_aa1;
3599 	int tiden_no, alnln;
3600 	int talign_info[5];
3601 	float distance;
3602 	WorkingBuffer buf( N, max_len, options );
3603 
3604 	Vector<NVector<float> > dists( N, NVector<float>(N) );
3605 
3606 	Sequence comseq( *sequences[0] );
3607 
3608 	for(i=0; i<N; i++){
3609 		Sequence *seq = sequences[i];
3610 		char *seqi = seq->data;
3611 		int len = seq->size;
3612 		buf.EncodeWords( seq, options.NAA, false );
3613 		buf.ComputeAAP2( seqi, seq->size );
3614 		dists[i][i] = 0.0;
3615 		if((i+1)%1000 ==0) printf( "%9i\n", (i+1) );
3616 		for(j=0; j<i; j++){
3617 			Sequence *rep = sequences[j];
3618 			char *seqj = rep->data;
3619 			int len2 = rep->size;
3620 			band_width1 = (options.band_width < len+len2-2 ) ? options.band_width : len+len2-2;
3621 			diag_test_aapn_est(NAA1, seqj, len, len2, buf, best_sum,
3622 					band_width1, band_left, band_center, band_right, 0);
3623 			local_band_align(seqi, seqj, len, len2, mat,
3624 					best_score, tiden_no, alnln, distance, talign_info,
3625 					band_left, band_center, band_right, buf);
3626 			dists[seq->index][rep->index] = dists[rep->index][seq->index] = distance;
3627 		}
3628 		if (not options.option_r ) break;
3629 		comseq.index = seq->index;
3630 		comseq.size = len;
3631 		for(j=0; j<len; j++) comseq.data[i] = seq->data[len-i-1];
3632 		seqi = comseq.data;
3633 		buf.EncodeWords( &comseq, options.NAA, false );
3634 		buf.ComputeAAP2( seqi, seq->size );
3635 		for(j=0; j<i; j++){
3636 			Sequence *rep = sequences[j];
3637 			char *seqj = rep->data;
3638 			int len2 = rep->size;
3639 			band_width1 = (options.band_width < len+len2-2 ) ? options.band_width : len+len2-2;
3640 			diag_test_aapn_est(NAA1, seqj, len, len2, buf, best_sum,
3641 					band_width1, band_left, band_center, band_right, 0);
3642 			local_band_align(seqi, seqj, len, len2, mat,
3643 					best_score, tiden_no, alnln, distance, talign_info,
3644 					band_left, band_center, band_right, buf);
3645 			if( distance < dists[seq->index][rep->index] )
3646 				dists[seq->index][rep->index] = dists[rep->index][seq->index] = distance;
3647 		}
3648 	}
3649 	std::string output = options.output + ".dist";
3650 	FILE *fout = fopen( output.c_str(), "w+" );
3651 	fprintf( fout, "1" );
3652 	for(i=1; i<N; i++) fprintf( fout, "\t%i", i+1 );
3653 	fprintf( fout, "\n" );
3654 	for(i=0; i<N; i++){
3655 		fprintf( fout, "%g", dists[i][0] );
3656 		for(j=1; j<N; j++) fprintf( fout, "\t%g", dists[i][j] );
3657 		fprintf( fout, "\n" );
3658 	}
3659 	fclose( fout );
3660 }
DoClustering(const Options & options)3661 void SequenceDB::DoClustering( const Options & options )
3662 {
3663 	int i;
3664 	int NAA = options.NAA;
3665 	double aa1_cutoff = options.cluster_thd;
3666 	double aas_cutoff = 1 - (1-options.cluster_thd)*4;
3667 	double aan_cutoff = 1 - (1-options.cluster_thd)*options.NAA;
3668 	int seq_no = sequences.size();
3669 	int frag_no = seq_no;
3670 	int frag_size = options.frag_size;
3671 	int len, len_bound;
3672 	int flag;
3673 
3674 #if 0
3675 	ComputeDistance( options );
3676 	return;
3677 #endif
3678 
3679 	if( options.threads > 1 ){
3680 		DoClustering( options.threads, options );
3681 		temp_files.Clear();
3682 		return;
3683 	}
3684 
3685 	if (frag_size){
3686 		frag_no = 0;
3687 		for (i=0; i<seq_no; i++) frag_no += (sequences[i]->size - NAA) / frag_size + 1;
3688 	}
3689 
3690 	if( not options.isEST )
3691 		cal_aax_cutoff(aa1_cutoff, aas_cutoff, aan_cutoff, options.cluster_thd,
3692 				options.tolerance, naa_stat_start_percent, naa_stat, NAA);
3693 
3694 	WorkingParam param( aa1_cutoff, aas_cutoff, aan_cutoff );
3695 	WorkingBuffer buffer( frag_no, max_len, options );
3696 
3697 	WordTable word_table( options.NAA, NAAN );
3698 
3699 	size_t mem_need = MinimalMemory( frag_no, buffer.total_bytes, 1, options );
3700 	size_t mem_limit = MemoryLimit( mem_need, options );
3701 	size_t mem, mega = 1000000;
3702 	int N = sequences.size();
3703 
3704 	size_t total_letters = total_letter;
3705 	size_t tabsize = 0;
3706 
3707 	Options opts( options );
3708 	opts.ComputeTableLimits( min_len, max_len, len_n50, mem_need );
3709 
3710 	for(i=0; i<N; ){
3711 		float redundancy = (rep_seqs.size() + 1.0) / (i + 1.0);
3712 		int m = i;
3713 		size_t sum = 0;
3714 		size_t max_items = opts.max_entries;
3715 		size_t max_seqs = opts.max_sequences;
3716 		size_t items = 0;
3717 
3718 // find a block from i to m, so that this block can fit into a word table
3719 //     ...
3720 //  i  ++++++++++++++++++++++++++
3721 //     ++++++++++++++++++++
3722 //     ++++++++++++++++
3723 //  m  +++++++++++++
3724 //     ...
3725 		while( m < N && (sum*redundancy) < max_seqs && items < max_items ){
3726 			Sequence *seq = sequences[m];
3727 			if( ! (seq->state & IS_REDUNDANT) ){
3728 				if ( options.store_disk ) seq->SwapIn();
3729 				items += (size_t)(seq->size * redundancy);
3730 				sum += 1;
3731 			}
3732 			m ++;
3733 		}
3734 		if( m > N ) m = N;
3735 		printf( "\rcomparing sequences from  %9i  to  %9i\n", i, m );
3736 		fflush( stdout );
3737 		for(int ks=i; ks<m; ks++){ // clustering this block
3738 			Sequence *seq = sequences[ks];
3739 			i = ks + 1;
3740 			if (seq->state & IS_REDUNDANT) continue;
3741 			ClusterOne( seq, ks, word_table, param, buffer, options );
3742 			total_letters -= seq->size;
3743 			if( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3744 			if( word_table.size >= max_items ) break;
3745 			int tmax = max_seqs - (frag_size ? seq->size / frag_size + 1 : 0);
3746 			if( word_table.sequences.size() >= tmax ) break;
3747 		} // finishing word table from this block
3748 		m = i;
3749 		if( word_table.size == 0 ) continue;
3750 		float p0 = 0;
3751 		for(int j=m; j<N; j++){ // use this word table to screen rest sequences m->N
3752 			Sequence *seq = sequences[j];
3753 			if (seq->state & IS_REDUNDANT) continue;
3754 			if ( options.store_disk ) seq->SwapIn();
3755 			CheckOne( seq, word_table, param, buffer, options );
3756 			total_letters -= seq->size;
3757 			if ( options.store_disk && (seq->state & IS_REDUNDANT) ) seq->SwapOut();
3758 			int len_bound = param.len_upper_bound;
3759 			if( word_table.sequences[ word_table.sequences.size()-1 ]->size > len_bound ){
3760 				break;
3761 			}
3762 			float p = (100.0*j)/N;
3763 			if( p > p0+1E-1 ){ // print only if the percentage changed
3764 				printf( "\r%4.1f%%", p );
3765 				fflush( stdout );
3766 				p0 = p;
3767 			}
3768 		}
3769 		if( word_table.size > tabsize ) tabsize = word_table.size;
3770 		//if( i && i < m ) printf( "\r---------- %6i remaining sequences to the next cycle\n", m-i );
3771 		word_table.Clear();
3772 	}
3773 	printf( "\n%9i  finished  %9i  clusters\n", sequences.size(), rep_seqs.size() );
3774 	mem = (mem_need + tabsize*sizeof(IndexCount))/mega;
3775 	printf( "\nApproximated maximum memory consumption: %liM\n", mem );
3776 	temp_files.Clear();
3777 	word_table.Clear();
3778 
3779 #if 0
3780 	int zeros = 0;
3781 	for(i=0; i<word_table.indexCounts.size(); i++) zeros += word_table.indexCounts[i].Size() ==0;
3782 	printf( "%9i  empty entries out of  %9i\n", zeros, word_table.indexCounts.size() );
3783 #endif
3784 }
3785 
ClusterTo(SequenceDB & other,const Options & options)3786 void SequenceDB::ClusterTo( SequenceDB & other, const Options & options )
3787 {
3788 	int i, is, js, flag;
3789 	int len, len_tmp, len_lower_bound, len_upper_bound;
3790 	int NR2_red_no = 0;
3791 	int aan_no = 0;
3792 	char *seqi;
3793 	int NAA = options.NAA;
3794 	double aa1_cutoff = options.cluster_thd;
3795 	double aas_cutoff = 1 - (1-options.cluster_thd)*4;
3796 	double aan_cutoff = 1 - (1-options.cluster_thd)*options.NAA;
3797 	Vector<int>  word_encodes( MAX_SEQ );
3798 	Vector<INTs> word_encodes_no( MAX_SEQ );
3799 
3800 	if( not options.isEST ){
3801 		cal_aax_cutoff(aa1_cutoff, aas_cutoff, aan_cutoff, options.cluster_thd,
3802 				options.tolerance, naa_stat_start_percent, naa_stat, NAA);
3803 	}
3804 
3805 
3806 	int N = other.sequences.size();
3807 	int M = sequences.size();
3808 	int T = options.threads;
3809 
3810 	valarray<size_t>  counts(T);
3811 	Vector<WorkingParam> params(T);
3812 	Vector<WorkingBuffer> buffers(T);
3813 	WorkingParam & param = params[0];
3814 	WorkingBuffer & buffer = buffers[0];
3815 	for(i=0; i<T; i++){
3816 		params[i].Set( aa1_cutoff, aas_cutoff, aan_cutoff );
3817 		buffers[i].Set( N, max_len, options );
3818 	}
3819 	if( T >1 ) omp_set_num_threads(T);
3820 
3821 	size_t mem_need = MinimalMemory( N, buffer.total_bytes, T, options, other.total_letter+other.total_desc );
3822 	size_t mem_limit = MemoryLimit( mem_need, options );
3823 
3824 	Options opts( options );
3825 	opts.ComputeTableLimits( min_len, max_len, len_n50, mem_need );
3826 
3827 	WordTable word_table( options.NAA, NAAN );
3828 
3829 	size_t max_items = opts.max_entries;
3830 	size_t max_seqs = opts.max_sequences;
3831 	for(i=0; i<N; ){
3832 		size_t items = 0;
3833 		size_t sum = 0;
3834 		int m = i;
3835 		while( m < N && sum < max_seqs && items < max_items ){
3836 			Sequence *seq = other.sequences[m];
3837 			if( ! (seq->state & IS_REDUNDANT) ){
3838 				if ( options.store_disk ) seq->SwapIn();
3839 				items += seq->size;
3840 				sum += 1;
3841 			}
3842 			m ++;
3843 		}
3844 		if( m > N ) m = N;
3845 		//printf( "m = %i  %i,  %i\n", i, m, m-i );
3846 		for(int ks=i; ks<m; ks++){
3847 			Sequence *seq = other.sequences[ks];
3848 			len = seq->size;
3849 			seqi = seq->data;
3850 			calc_ann_list(len, seqi, NAA, aan_no, word_encodes, word_encodes_no, options.isEST);
3851 			word_table.AddWordCounts(aan_no, word_encodes, word_encodes_no, ks-i, options.isEST);
3852 			word_table.sequences.Append( seq );
3853 			seq->cluster_id = ks;
3854 			seq->state |= IS_REP;
3855 			if ( (ks+1) % 1000 == 0 ) {
3856 				printf( "." );
3857 				fflush( stdout );
3858 				if ( (ks+1) % 10000 == 0 ) printf( "%9i  finished\n", ks+1 );
3859 			}
3860 		}
3861 		float p0 = 0;
3862 		if( T > 1 ){
3863 			int JM = M;
3864 			counts = 0;
3865 			#pragma omp parallel for schedule( dynamic, 1 )
3866 			for(int j=0; j<JM; j++){
3867 				Sequence *seq = sequences[j];
3868 				if( seq->state & IS_REDUNDANT ) continue;
3869 				int len = seq->size;
3870 				char *seqi = seq->data;
3871 				int len_upper_bound = upper_bound_length_rep(len,options);
3872 				int len_lower_bound = len - options.diff_cutoff_aa2;
3873 
3874 				int len_tmp = (int) ( ((double)len) * options.diff_cutoff2);
3875 				if (len_tmp < len_lower_bound) len_lower_bound = len_tmp;
3876 
3877 				int tid = omp_get_thread_num();
3878 				params[tid].len_upper_bound = len_upper_bound;
3879 				params[tid].len_lower_bound = len_lower_bound;
3880 
3881 				if( word_table.sequences[ word_table.sequences.size()-1 ]->size > len_upper_bound ){
3882 					JM = 0;
3883 					continue;
3884 				}
3885 
3886 				int flag = other.CheckOne( seq, word_table, params[tid], buffers[tid], options );
3887 				if ((flag == 1) || (flag == -1)) { // if similar to old one delete it
3888 					if (! options.cluster_best) {
3889 						seq->Clear();
3890 						seq->state |= IS_REDUNDANT ;
3891 						counts[tid] ++;
3892 					}
3893 					if( flag == -1 ) seq->state |= IS_MINUS_STRAND; // for EST only
3894 				}
3895 				float p = (100.0*j)/M;
3896 				if( p > p0+1E-1 ){ // print only if the percentage changed
3897 					printf( "\r%4.1f%%", p );
3898 					fflush( stdout );
3899 					p0 = p;
3900 				}
3901 			}
3902 			for(int j=0; j<T; j++) NR2_red_no += counts[j];
3903 		}else{
3904 			for(int j=0; j<M; j++){
3905 				Sequence *seq = sequences[j];
3906 				if( seq->state & IS_REDUNDANT ) continue;
3907 				len = seq->size;
3908 				seqi = seq->data;
3909 				len_upper_bound = upper_bound_length_rep(len,options);
3910 				len_lower_bound = len - options.diff_cutoff_aa2;
3911 
3912 				len_tmp = (int) ( ((double)len) * options.diff_cutoff2);
3913 				if (len_tmp < len_lower_bound) len_lower_bound = len_tmp;
3914 				param.len_upper_bound = len_upper_bound;
3915 				param.len_lower_bound = len_lower_bound;
3916 
3917 				if( word_table.sequences[ word_table.sequences.size()-1 ]->size > len_upper_bound ){
3918 					break;
3919 				}
3920 
3921 				flag = other.CheckOne( seq, word_table, param, buffer, options );
3922 				if ((flag == 1) || (flag == -1)) { // if similar to old one delete it
3923 					if (! options.cluster_best) {
3924 						seq->Clear();
3925 						seq->state |= IS_REDUNDANT ;
3926 						NR2_red_no ++;
3927 					}
3928 					if( flag == -1 ) seq->state |= IS_MINUS_STRAND; // for EST only
3929 				}
3930 				float p = (100.0*j)/M;
3931 				if( p > p0+1E-1 ){ // print only if the percentage changed
3932 					printf( "\r%4.1f%%", p );
3933 					fflush( stdout );
3934 					p0 = p;
3935 				}
3936 			}
3937 		}
3938 		printf( "\r..........%9i  compared  %9i  clusters\n", i, NR2_red_no );
3939 		word_table.Clear();
3940 		word_table.size = 0;
3941 		i = m;
3942 	}
3943 
3944 	if (options.cluster_best) {//delete redundant sequences in options.cluster_best mode
3945 		for (i=0; i<(int)sequences.size(); i++){
3946 			Sequence *seq = sequences[i];
3947 			if (seq->identity > 0 ){
3948 				seq->state |= IS_REDUNDANT;
3949 				NR2_red_no ++;
3950 			}
3951 		}
3952 	}
3953 	for (i=0; i<(int)sequences.size(); i++){
3954 		Sequence *seq = sequences[i];
3955 		if( seq->identity <0 ) seq->identity *= -1;
3956 		if( not(seq->state & IS_REDUNDANT) ) rep_seqs.Append( i );
3957 	}
3958 
3959 	cout << endl;
3960 	cout << sequences.size() << " compared\t" << NR2_red_no << " clustered" << endl;
3961 	temp_files.Clear();
3962 }
3963 
calc_ann_list(int len,char * seqi,int NAA,int & aan_no,Vector<int> & aan_list,Vector<INTs> & aan_list_no,bool est)3964 int calc_ann_list(int len, char *seqi, int NAA, int& aan_no, Vector<int> & aan_list, Vector<INTs> & aan_list_no, bool est)
3965 {
3966 	int i, j, k, i0, i1, k1;
3967 
3968 	// check_aan_list
3969 	aan_no = len - NAA + 1;
3970 	for (j=0; j<aan_no; j++) {
3971 		aan_list[j] = 0;
3972 		for (k=0, k1=NAA-1; k<NAA; k++, k1--) aan_list[j] += seqi[j+k] * NAAN_array[k1];
3973 	}
3974 	if( est ){
3975 		// for the short word containing 'N', mask it to '-1'
3976 		for (j=0; j<len; j++){
3977 			if ( seqi[j] >= 4 ) {                      // here N is 4
3978 				i0 = (j-NAA+1 > 0) ? j-NAA+1 : 0;
3979 				i1 = j < aan_no ? j : aan_no - 1;
3980 				for (i=i0; i<=i1; i++) aan_list[i]=-1;
3981 			}
3982 		}
3983 	}
3984 
3985 	std::sort(aan_list.begin(), aan_list.begin() + aan_no);
3986 	for(j=0; j<aan_no; j++) aan_list_no[j]=1;
3987 	for(j=aan_no-1; j; j--) {
3988 		if (aan_list[j] == aan_list[j-1]) {
3989 			aan_list_no[j-1] += aan_list_no[j];
3990 			aan_list_no[j]=0;
3991 		}
3992 	}
3993 	return OK_FUNC;
3994 } // END calc_ann_list
3995 
make_comp_short_word_index(int NAA,int * NAAN_array,Vector<int> & Comp_AAN_idx)3996 void make_comp_short_word_index(int NAA, int *NAAN_array, Vector<int> &Comp_AAN_idx) {
3997 	int i, j, k, icomp, k1;
3998 	int c[4] = {3,2,1,0};
3999 	unsigned char short_word[32]; //short_word[12] is enough
4000 
4001 	int NAA1 = NAAN_array[1];
4002 	int NAAN = NAAN_array[NAA];
4003 
4004 	for (i=0; i<NAAN; i++) {
4005 		// decompose i back to short_word
4006 		for (k=i, j=0; j<NAA; j++) {
4007 			short_word[j] = (unsigned char) (k % NAA1);
4008 			k  = k / NAA1;
4009 		}
4010 
4011 		// calc_comp_aan_list
4012 		icomp=0;
4013 		for (k=0, k1=NAA-1; k<NAA; k++, k1--) icomp += c[short_word[k1]] * NAAN_array[k];
4014 
4015 		Comp_AAN_idx[i] = icomp;
4016 	}
4017 } // make_comp_short_word_index
4018 
4019 //quick_sort_idx calling (a, idx, 0, no-1)
4020 //sort a with another array idx
4021 //so that idx rearranged
quick_sort_idx(int * a,int * idx,int lo0,int hi0)4022 int quick_sort_idx (int *a, int *idx, int lo0, int hi0 ) {
4023   int lo = lo0;
4024   int hi = hi0;
4025   int mid;
4026   int tmp;
4027 
4028   if ( hi0 > lo0) {
4029     mid = a[ ( lo0 + hi0 ) / 2 ];
4030 
4031     while( lo <= hi ) {
4032       while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++;
4033       while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--;
4034       if( lo <= hi ) {
4035         tmp=a[lo];   a[lo]=a[hi];     a[hi]=tmp;
4036         tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
4037         lo++; hi--;
4038       }
4039     } // while
4040 
4041     if( lo0 < hi ) quick_sort_idx(a, idx, lo0, hi );
4042     if( lo < hi0 ) quick_sort_idx(a, idx, lo, hi0 );
4043   } // if ( hi0 > lo0)
4044   return 0;
4045 } // quick_sort_idx
4046 
4047 
4048 //decreasing can not use reverse of quick_sort_idx due to tie
4049 //quick_sort_idxr calling (a, idx, 0, no-1)
4050 //sort a with another array idx
4051 //so that idx rearranged
quick_sort_idxr(int * a,int * idx,int lo0,int hi0)4052 int quick_sort_idxr (int *a, int *idx, int lo0, int hi0 ) {
4053   int lo = lo0;
4054   int hi = hi0;
4055   int mid;
4056   int tmp;
4057 
4058   if ( hi0 > lo0) {
4059     mid = a[ ( lo0 + hi0 ) / 2 ];
4060 
4061     while( lo <= hi ) {
4062       while( ( lo < hi0 ) && ( a[lo] > mid ) ) lo++;
4063       while( ( hi > lo0 ) && ( a[hi] < mid ) ) hi--;
4064       if( lo <= hi ) {
4065         tmp=a[lo];   a[lo]=a[hi];     a[hi]=tmp;
4066         tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
4067         lo++; hi--;
4068       }
4069     } // while
4070 
4071     if( lo0 < hi ) quick_sort_idxr(a, idx, lo0, hi );
4072     if( lo < hi0 ) quick_sort_idxr(a, idx, lo, hi0 );
4073   } // if ( hi0 > lo0)
4074   return 0;
4075 } // quick_sort_idxr
4076 
4077 /////////////////////////// END ALL ////////////////////////
4078 
4079 int naa_stat_start_percent = 40;
4080 int naa_stat[5][61][4] = {
4081 
4082 	// cover 0.99
4083 	{
4084 		// N=5   N=4   N=3   N=2
4085 		{  0,    0,    0,    7,  },  // 40%
4086 		{  0,    0,    0,    8,  },  // 41%
4087 		{  0,    0,    0,    9,  },  // 42%
4088 		{  0,    0,    0,    9,  },  // 43%
4089 		{  0,    0,    1,   10,  },  // 44%
4090 		{  0,    0,    1,   11,  },  // 45%
4091 		{  0,    0,    1,   12,  },  // 46%
4092 		{  0,    0,    2,   13,  },  // 47%
4093 		{  0,    0,    2,   14,  },  // 48%
4094 		{  0,    0,    4,   16,  },  // 49%
4095 		{  0,    0,    4,   16,  },  // 50%
4096 		{  0,    0,    5,   17,  },  // 51%
4097 		{  0,    0,    5,   18,  },  // 52%
4098 		{  0,    0,    7,   20,  },  // 53%
4099 		{  0,    1,    7,   21,  },  // 54%
4100 		{  0,    1,    7,   21,  },  // 55%
4101 		{  0,    2,    8,   23,  },  // 56%
4102 		{  0,    2,    8,   25,  },  // 57%
4103 		{  0,    2,   10,   25,  },  // 58%
4104 		{  0,    3,   10,   26,  },  // 59%
4105 		{  0,    4,   13,   28,  },  // 60%
4106 		{  0,    5,   13,   30,  },  // 61%
4107 		{  0,    5,   14,   30,  },  // 62%
4108 		{  1,    6,   15,   33,  },  // 63%
4109 		{  2,    7,   17,   34,  },  // 64%
4110 		{  2,    7,   17,   35,  },  // 65%
4111 		{  2,    9,   20,   37,  },  // 66%
4112 		{  4,   10,   20,   37,  },  // 67%
4113 		{  4,   11,   22,   40,  },  // 68%
4114 		{  5,   12,   24,   41,  },  // 69%
4115 		{  5,   12,   25,   42,  },  // 70%
4116 		{  6,   16,   27,   43,  },  // 71%
4117 		{  8,   16,   27,   45,  },  // 72%
4118 		{  9,   17,   29,   47,  },  // 73%
4119 		{ 10,   18,   31,   47,  },  // 74%
4120 		{ 10,   20,   32,   50,  },  // 75%
4121 		{ 12,   20,   32,   51,  },  // 76%
4122 		{ 14,   22,   36,   54,  },  // 77%
4123 		{ 15,   24,   37,   55,  },  // 78%
4124 		{ 17,   26,   41,   58,  },  // 79%
4125 		{ 18,   29,   41,   59,  },  // 80%
4126 		{ 20,   30,   45,   60,  },  // 81%
4127 		{ 24,   35,   48,   62,  },  // 82%
4128 		{ 26,   36,   48,   64,  },  // 83%
4129 		{ 27,   38,   51,   65,  },  // 84%
4130 		{ 31,   43,   54,   68,  },  // 85%
4131 		{ 35,   43,   55,   70,  },  // 86%
4132 		{ 36,   48,   60,   71,  },  // 87%
4133 		{ 36,   50,   61,   73,  },  // 88%
4134 		{ 40,   50,   61,   75,  },  // 89%
4135 		{ 45,   54,   65,   75,  },  // 90%
4136 		{ 52,   60,   70,   79,  },  // 91%
4137 		{ 53,   62,   71,   81,  },  // 92%
4138 		{ 57,   66,   75,   84,  },  // 93%
4139 		{ 57,   66,   76,   85,  },  // 94%
4140 		{ 64,   71,   78,   85,  },  // 95%
4141 		{ 70,   75,   82,   89,  },  // 96%
4142 		{ 77,   81,   86,   92,  },  // 97%
4143 		{ 82,   86,   90,   94,  },  // 98%
4144 		{ 83,   87,   91,   95,  },  // 99%
4145 		{ 91,   93,   95,   97,  },  // 100%
4146 	},
4147 	// cover 0.95
4148 	{
4149 		// N=5   N=4   N=3   N=2
4150 		{  0,    0,    1,    9,  },  // 40%
4151 		{  0,    0,    2,   10,  },  // 41%
4152 		{  0,    0,    2,   11,  },  // 42%
4153 		{  0,    0,    3,   12,  },  // 43%
4154 		{  0,    0,    3,   12,  },  // 44%
4155 		{  0,    0,    4,   14,  },  // 45%
4156 		{  0,    0,    4,   14,  },  // 46%
4157 		{  0,    1,    5,   16,  },  // 47%
4158 		{  0,    1,    6,   17,  },  // 48%
4159 		{  0,    2,    7,   19,  },  // 49%
4160 		{  0,    2,    8,   19,  },  // 50%
4161 		{  0,    2,    8,   20,  },  // 51%
4162 		{  0,    2,    9,   21,  },  // 52%
4163 		{  0,    4,   10,   23,  },  // 53%
4164 		{  1,    4,   11,   24,  },  // 54%
4165 		{  1,    4,   11,   24,  },  // 55%
4166 		{  1,    5,   13,   26,  },  // 56%
4167 		{  2,    5,   13,   27,  },  // 57%
4168 		{  2,    6,   15,   29,  },  // 58%
4169 		{  2,    7,   15,   30,  },  // 59%
4170 		{  3,    8,   16,   31,  },  // 60%
4171 		{  4,    8,   18,   32,  },  // 61%
4172 		{  4,    9,   18,   33,  },  // 62%
4173 		{  5,   11,   20,   36,  },  // 63%
4174 		{  6,   12,   22,   37,  },  // 64%
4175 		{  6,   12,   22,   38,  },  // 65%
4176 		{  8,   14,   24,   40,  },  // 66%
4177 		{  8,   15,   25,   41,  },  // 67%
4178 		{ 10,   16,   27,   42,  },  // 68%
4179 		{ 10,   18,   28,   45,  },  // 69%
4180 		{ 11,   18,   29,   45,  },  // 70%
4181 		{ 14,   21,   31,   47,  },  // 71%
4182 		{ 14,   22,   32,   48,  },  // 72%
4183 		{ 14,   22,   33,   50,  },  // 73%
4184 		{ 17,   24,   36,   52,  },  // 74%
4185 		{ 17,   25,   36,   52,  },  // 75%
4186 		{ 18,   27,   39,   54,  },  // 76%
4187 		{ 20,   29,   41,   56,  },  // 77%
4188 		{ 21,   31,   42,   58,  },  // 78%
4189 		{ 21,   31,   46,   60,  },  // 79%
4190 		{ 27,   35,   46,   60,  },  // 80%
4191 		{ 28,   37,   50,   63,  },  // 81%
4192 		{ 31,   38,   50,   64,  },  // 82%
4193 		{ 34,   43,   53,   66,  },  // 83%
4194 		{ 36,   45,   54,   67,  },  // 84%
4195 		{ 41,   50,   60,   70,  },  // 85%
4196 		{ 43,   51,   60,   71,  },  // 86%
4197 		{ 45,   54,   63,   74,  },  // 87%
4198 		{ 48,   55,   64,   75,  },  // 88%
4199 		{ 54,   60,   68,   78,  },  // 89%
4200 		{ 55,   62,   71,   80,  },  // 90%
4201 		{ 56,   63,   71,   80,  },  // 91%
4202 		{ 64,   70,   76,   84,  },  // 92%
4203 		{ 69,   74,   80,   86,  },  // 93%
4204 		{ 73,   78,   83,   88,  },  // 94%
4205 		{ 74,   78,   84,   89,  },  // 95%
4206 		{ 80,   84,   87,   91,  },  // 96%
4207 		{ 83,   86,   90,   93,  },  // 97%
4208 		{ 86,   89,   92,   95,  },  // 98%
4209 		{ 91,   93,   95,   97,  },  // 99%
4210 		{ 92,   93,   95,   97,  },  // 100%
4211 	},
4212 	// cover 0.9
4213 	{
4214 		// N=5   N=4   N=3   N=2
4215 		{  0,    0,    2,   11,  },  // 40%
4216 		{  0,    0,    3,   12,  },  // 41%
4217 		{  0,    0,    3,   12,  },  // 42%
4218 		{  0,    1,    4,   13,  },  // 43%
4219 		{  0,    1,    5,   14,  },  // 44%
4220 		{  0,    1,    5,   15,  },  // 45%
4221 		{  0,    1,    6,   16,  },  // 46%
4222 		{  0,    2,    7,   18,  },  // 47%
4223 		{  0,    2,    7,   18,  },  // 48%
4224 		{  0,    3,    9,   20,  },  // 49%
4225 		{  1,    4,    9,   20,  },  // 50%
4226 		{  1,    4,   10,   21,  },  // 51%
4227 		{  1,    4,   11,   23,  },  // 52%
4228 		{  2,    5,   12,   24,  },  // 53%
4229 		{  2,    5,   12,   25,  },  // 54%
4230 		{  2,    6,   13,   26,  },  // 55%
4231 		{  3,    7,   14,   28,  },  // 56%
4232 		{  3,    7,   15,   28,  },  // 57%
4233 		{  4,    8,   16,   30,  },  // 58%
4234 		{  5,    9,   17,   31,  },  // 59%
4235 		{  5,   10,   18,   32,  },  // 60%
4236 		{  6,   11,   20,   35,  },  // 61%
4237 		{  6,   11,   20,   35,  },  // 62%
4238 		{  7,   13,   22,   38,  },  // 63%
4239 		{  8,   14,   23,   39,  },  // 64%
4240 		{  8,   15,   24,   39,  },  // 65%
4241 		{ 10,   16,   26,   42,  },  // 66%
4242 		{ 10,   17,   27,   42,  },  // 67%
4243 		{ 12,   19,   29,   44,  },  // 68%
4244 		{ 13,   20,   30,   46,  },  // 69%
4245 		{ 13,   21,   31,   47,  },  // 70%
4246 		{ 16,   23,   33,   48,  },  // 71%
4247 		{ 18,   25,   34,   50,  },  // 72%
4248 		{ 18,   26,   36,   51,  },  // 73%
4249 		{ 19,   28,   38,   53,  },  // 74%
4250 		{ 20,   29,   38,   53,  },  // 75%
4251 		{ 23,   30,   41,   56,  },  // 76%
4252 		{ 24,   33,   43,   57,  },  // 77%
4253 		{ 26,   34,   45,   59,  },  // 78%
4254 		{ 28,   37,   48,   61,  },  // 79%
4255 		{ 30,   37,   48,   62,  },  // 80%
4256 		{ 33,   42,   52,   64,  },  // 81%
4257 		{ 35,   43,   53,   65,  },  // 82%
4258 		{ 38,   47,   56,   68,  },  // 83%
4259 		{ 40,   47,   56,   68,  },  // 84%
4260 		{ 44,   53,   61,   71,  },  // 85%
4261 		{ 45,   53,   62,   73,  },  // 86%
4262 		{ 50,   58,   66,   75,  },  // 87%
4263 		{ 51,   58,   66,   76,  },  // 88%
4264 		{ 57,   63,   71,   79,  },  // 89%
4265 		{ 60,   66,   72,   81,  },  // 90%
4266 		{ 62,   68,   75,   83,  },  // 91%
4267 		{ 70,   74,   80,   85,  },  // 92%
4268 		{ 74,   78,   82,   88,  },  // 93%
4269 		{ 85,   87,   90,   92,  },  // 94%
4270 		{ 86,   88,   90,   92,  },  // 95%
4271 		{ 87,   89,   91,   93,  },  // 96%
4272 		{ 87,   89,   92,   94,  },  // 97%
4273 		{ 89,   91,   93,   96,  },  // 98%
4274 		{ 93,   94,   96,   97,  },  // 99%
4275 		{ 94,   95,   97,   98,  },  // 100%
4276 	},
4277 	// cover 0.8
4278 	{
4279 		// N=5   N=4   N=3   N=2
4280 		{  0,    1,    4,   13,  },  // 40%
4281 		{  0,    1,    5,   13,  },  // 41%
4282 		{  0,    1,    5,   14,  },  // 42%
4283 		{  0,    2,    6,   15,  },  // 43%
4284 		{  0,    2,    6,   16,  },  // 44%
4285 		{  0,    2,    7,   17,  },  // 45%
4286 		{  1,    3,    8,   18,  },  // 46%
4287 		{  1,    4,    9,   20,  },  // 47%
4288 		{  1,    4,    9,   20,  },  // 48%
4289 		{  2,    5,   11,   22,  },  // 49%
4290 		{  2,    5,   11,   22,  },  // 50%
4291 		{  2,    6,   12,   24,  },  // 51%
4292 		{  3,    6,   13,   25,  },  // 52%
4293 		{  3,    7,   14,   26,  },  // 53%
4294 		{  4,    8,   14,   27,  },  // 54%
4295 		{  4,    8,   15,   28,  },  // 55%
4296 		{  5,    9,   17,   30,  },  // 56%
4297 		{  5,    9,   17,   30,  },  // 57%
4298 		{  6,   11,   19,   32,  },  // 58%
4299 		{  7,   12,   20,   34,  },  // 59%
4300 		{  8,   12,   20,   34,  },  // 60%
4301 		{  9,   14,   22,   37,  },  // 61%
4302 		{  9,   14,   23,   37,  },  // 62%
4303 		{ 10,   16,   25,   39,  },  // 63%
4304 		{ 11,   17,   26,   41,  },  // 64%
4305 		{ 12,   18,   27,   41,  },  // 65%
4306 		{ 13,   20,   28,   43,  },  // 66%
4307 		{ 14,   21,   30,   45,  },  // 67%
4308 		{ 15,   22,   31,   46,  },  // 68%
4309 		{ 17,   24,   33,   48,  },  // 69%
4310 		{ 17,   24,   34,   48,  },  // 70%
4311 		{ 19,   26,   36,   50,  },  // 71%
4312 		{ 20,   27,   37,   51,  },  // 72%
4313 		{ 21,   29,   39,   53,  },  // 73%
4314 		{ 23,   31,   41,   55,  },  // 74%
4315 		{ 23,   31,   41,   55,  },  // 75%
4316 		{ 26,   34,   44,   58,  },  // 76%
4317 		{ 28,   36,   46,   59,  },  // 77%
4318 		{ 29,   37,   47,   60,  },  // 78%
4319 		{ 34,   41,   50,   62,  },  // 79%
4320 		{ 34,   42,   51,   63,  },  // 80%
4321 		{ 38,   45,   55,   66,  },  // 81%
4322 		{ 39,   46,   55,   67,  },  // 82%
4323 		{ 44,   51,   60,   70,  },  // 83%
4324 		{ 44,   51,   60,   70,  },  // 84%
4325 		{ 49,   56,   64,   73,  },  // 85%
4326 		{ 50,   57,   64,   74,  },  // 86%
4327 		{ 57,   63,   69,   77,  },  // 87%
4328 		{ 58,   64,   70,   78,  },  // 88%
4329 		{ 68,   71,   76,   82,  },  // 89%
4330 		{ 68,   72,   77,   83,  },  // 90%
4331 		{ 75,   79,   81,   85,  },  // 91%
4332 		{ 86,   87,   89,   90,  },  // 92%
4333 		{ 88,   89,   90,   92,  },  // 93%
4334 		{ 90,   91,   92,   93,  },  // 94%
4335 		{ 91,   92,   93,   94,  },  // 95%
4336 		{ 92,   94,   94,   95,  },  // 96%
4337 		{ 93,   94,   95,   96,  },  // 97%
4338 		{ 94,   95,   95,   96,  },  // 98%
4339 		{ 94,   95,   96,   98,  },  // 99%
4340 		{ 95,   96,   97,   98,  },  // 100%
4341 	},
4342 	// cover 0.6
4343 	{
4344 		// N=5   N=4   N=3   N=2
4345 		{  1,    2,    6,   15,  },  // 40%
4346 		{  1,    3,    7,   16,  },  // 41%
4347 		{  1,    3,    8,   17,  },  // 42%
4348 		{  2,    4,    9,   18,  },  // 43%
4349 		{  2,    4,    9,   19,  },  // 44%
4350 		{  2,    5,   10,   20,  },  // 45%
4351 		{  3,    5,   10,   21,  },  // 46%
4352 		{  3,    6,   12,   22,  },  // 47%
4353 		{  3,    6,   12,   23,  },  // 48%
4354 		{  4,    8,   14,   25,  },  // 49%
4355 		{  4,    8,   14,   25,  },  // 50%
4356 		{  5,    8,   15,   26,  },  // 51%
4357 		{  5,    9,   16,   27,  },  // 52%
4358 		{  6,   10,   17,   29,  },  // 53%
4359 		{  6,   11,   18,   30,  },  // 54%
4360 		{  7,   11,   18,   31,  },  // 55%
4361 		{  8,   12,   20,   32,  },  // 56%
4362 		{  8,   13,   20,   33,  },  // 57%
4363 		{ 10,   14,   22,   35,  },  // 58%
4364 		{ 10,   15,   23,   37,  },  // 59%
4365 		{ 11,   16,   24,   37,  },  // 60%
4366 		{ 12,   18,   26,   39,  },  // 61%
4367 		{ 13,   18,   26,   40,  },  // 62%
4368 		{ 14,   20,   28,   42,  },  // 63%
4369 		{ 16,   22,   30,   43,  },  // 64%
4370 		{ 16,   22,   31,   44,  },  // 65%
4371 		{ 17,   23,   32,   45,  },  // 66%
4372 		{ 18,   25,   33,   47,  },  // 67%
4373 		{ 19,   26,   35,   48,  },  // 68%
4374 		{ 21,   27,   36,   50,  },  // 69%
4375 		{ 22,   29,   37,   51,  },  // 70%
4376 		{ 24,   30,   39,   52,  },  // 71%
4377 		{ 25,   32,   41,   53,  },  // 72%
4378 		{ 26,   33,   42,   55,  },  // 73%
4379 		{ 29,   35,   44,   57,  },  // 74%
4380 		{ 29,   36,   45,   57,  },  // 75%
4381 		{ 32,   39,   48,   60,  },  // 76%
4382 		{ 34,   41,   50,   61,  },  // 77%
4383 		{ 36,   43,   51,   62,  },  // 78%
4384 		{ 40,   46,   54,   65,  },  // 79%
4385 		{ 40,   46,   54,   65,  },  // 80%
4386 		{ 46,   52,   59,   68,  },  // 81%
4387 		{ 46,   52,   60,   69,  },  // 82%
4388 		{ 53,   59,   65,   73,  },  // 83%
4389 		{ 54,   60,   66,   73,  },  // 84%
4390 		{ 63,   67,   73,   78,  },  // 85%
4391 		{ 68,   71,   75,   79,  },  // 86%
4392 		{ 78,   80,   82,   85,  },  // 87%
4393 		{ 79,   81,   83,   85,  },  // 88%
4394 		{ 83,   85,   86,   87,  },  // 89%
4395 		{ 85,   86,   87,   89,  },  // 90%
4396 		{ 86,   88,   89,   90,  },  // 91%
4397 		{ 88,   89,   90,   91,  },  // 92%
4398 		{ 90,   90,   91,   92,  },  // 93%
4399 		{ 91,   92,   92,   93,  },  // 94%
4400 		{ 92,   93,   94,   94,  },  // 95%
4401 		{ 94,   94,   95,   95,  },  // 96%
4402 		{ 95,   95,   96,   96,  },  // 97%
4403 		{ 95,   96,   97,   97,  },  // 98%
4404 		{ 96,   96,   97,   98,  },  // 99%
4405 		{ 97,   98,   98,   99,  },  // 100%
4406 	},
4407 };
4408 
4409